In [34]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.utils import to_categorical, plot_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import cufflinks as cf; cf.go_offline()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
#%pip install pydot

In [35]:
wine_reviews = pd.read_csv("data/wine_reviews.csv", index_col=0)
wine_reviews.shape

(119924, 8)

In [3]:
wine_reviews.columns

Index(['country', 'description', 'points', 'price', 'province', 'taster_name',
       'variety', 'year'],
      dtype='object')

In [36]:
wine_reviews.variety.fillna("NA", inplace=True)
wine_reviews.taster_name.fillna("NA", inplace=True)
wine_reviews.year.fillna(wine_reviews.year.min(),inplace=True)
wine_reviews.price.fillna(wine_reviews.price.median(), inplace=True)

In [37]:
wine_reviews.sample(5)

Unnamed: 0,country,description,points,price,province,taster_name,variety,year
115411,US,Fruity and forward with ripe blackberry and ch...,88,25.0,New York,Anna Lee C. Iijima,Red Blend,2010.0
109678,US,Menthol and pepper ride over dark cherry and s...,85,20.0,Virginia,Alexander Peartree,Bordeaux-style Red Blend,2013.0
53765,Germany,Sharp acidity offsets juicy white peach and ap...,90,19.0,Pfalz,Anna Lee C. Iijima,Riesling,2015.0
31885,US,"Concannon's Reserve Petite swirls in plum, dar...",87,40.0,California,Virginie Boone,Petite Sirah,2008.0
107120,Italy,A blend of 60% Sangiovese and 40% Cabernet Sau...,89,65.0,Tuscany,Kerin O’Keefe,Red Blend,2009.0


In [4]:
wine_reviews[wine_reviews.year.isna()]

Unnamed: 0,country,description,points,price,province,taster_name,variety,year


In [38]:
text_col = ['description']
price_col_name = 'price'
numerical_col = [price_col_name]
categorical_col = ['country','province','taster_name','variety','year']

X = wine_reviews[['description','price','country','province','taster_name','variety', 'year']].copy()
y = wine_reviews['points'].copy()

In [39]:
# Convert categorical features to one-hot encoding
for col in categorical_col:
    X[col] = pd.Categorical(X[col])
    X[col] = X[col].cat.codes

In [68]:
X.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
102242,"New leather, wild berry, violet, white pepper ...",58.0,22,286,9,382,71
127785,Pungent aromas of field greens and citrus lead...,19.0,15,195,14,511,60
51628,From a very good if not great Champagne vintag...,139.0,15,75,16,121,67
16233,Aromas include dry apricot and papaya with a h...,18.0,37,132,12,7,71
76913,Produced in partnership between Schulz Cellars...,18.0,40,51,19,223,72


In [9]:
X.year.min()

1821.0

In [73]:
X[X.year.isnull()]

Unnamed: 0,description,price,country,province,taster_name,variety,year


In [40]:
X['price'] = np.log(X['price'])
scaler = StandardScaler()
X[['price']] = scaler.fit_transform(X[['price']].values)

In [22]:
X.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
11208,Very terroir-driven from chalk soil in this gr...,2.384253,15,47,16,437,2011.0
26805,"Tangy and showing some good aging, this is a r...",-0.05216,31,294,16,446,1821.0
121534,"A new wine for Aegerter, this is ripe and full...",-0.838306,15,183,16,125,2013.0
121029,"Toffee, exotic spice, vanilla, raspberry and c...",0.73493,40,411,17,696,2012.0
43754,"This wine is dense, tough and dark—a black plu...",-0.05216,31,294,16,446,2010.0


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
country_len = wine_reviews.country.nunique()
province_len = wine_reviews.province.nunique()
tester_len = wine_reviews.taster_name.nunique()
variety_len = wine_reviews.variety.nunique()
year_len  = wine_reviews.year.nunique()

### Model without textual columns

In [9]:
print(f'country:{country_len}  province {province_len}  tester {tester_len}  variety {variety_len}  year {year_len}')

country:43  province 425  tester 20  variety 701  year 78


In [10]:
# Define the graph model
# Categorical inputs
# country
cat_input_country = Input(shape=(1,))
embedding_country = Embedding(input_dim=country_len, output_dim=7)(cat_input_country)
flat_embed_country = Flatten()(embedding_country)

#province
cat_input_province = Input(shape=(1,))
embedding_province = Embedding(input_dim=province_len, output_dim=22)(cat_input_province)
flat_embed_province = Flatten()(embedding_province)

#taster_name
cat_input_tester = Input(shape=(1,))
embedding_tester = Embedding(input_dim=tester_len, output_dim=5)(cat_input_tester)
flat_embed_tester = Flatten()(embedding_tester)

#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=variety_len, output_dim=30)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

#year
cat_input_year = Input(shape=(1,))
embedding_year = Embedding(input_dim=year_len, output_dim=9)(cat_input_year)
flat_embed_year = Flatten()(embedding_year)


# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_country, flat_embed_province, flat_embed_tester, flat_embed_variety, num_input])
#concatenated = Concatenate()([flat_embed_country, flat_embed_province, flat_embed_tester, flat_embed_variety, flat_embed_year, num_input])


# Dense layers for classification
x = Dense(16, activation='relu')(concatenated)
x = Dense(8, activation='relu')(x)
outputs = Dense(1, activation="sigmoid")(x)

# Create the model
model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, num_input], outputs=outputs)
#model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, cat_input_year, num_input], outputs=outputs)


In [116]:
#model.summary()

In [11]:
plot_model(model, show_dtype=True, show_shapes=True, show_layer_names=True, to_file='model_1.png')

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
# Compile the model

categorical_col = ['country','province','taster_name','variety']

train_X_data = [X_train['country'], X_train['province'], X_train['taster_name'], X_train['variety'], X_train[[price_col_name]]]
#train_X_data = [X_train['country'], X_train['province'], X_train['taster_name'], X_train['variety'], X_train[[price_col_name]]]

model.compile(optimizer=Adam(learning_rate=0.01),
              loss='mean_squared_error',
              metrics=[Accuracy()])

# Train the model
model.fit(train_X_data, y_train,
          batch_size=64,
          epochs=10)

# Evaluate the model
loss, accuracy = model.evaluate(train_X_data, y_train)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


In [31]:
y_train_actual = model.predict(train_X_data)



In [44]:
test_X_data = [X_test['country'], X_test['province'], X_test['taster_name'], X_test['variety'], X_test[[price_col_name, 'year']]]
y_test_actual = model.predict(test_X_data)



In [45]:
mse_train = mean_squared_error(y_train, y_train_actual)
r2_train = r2_score(y_train, y_train_actual)
mse_test = mean_squared_error(y_test, y_test_actual)
r2_test = r2_score(y_test, y_test_actual)
print("Mean squared error train: %.2f" % mse_train)
# The coefficient of determination: 1 is perfect prediction
print("R2: %.2f train" % r2_train)
print("Mean squared error test: %.2f" % mse_test)
# The coefficient of determination: 1 is perfect prediction
print("R2: %.2f test" % r2_test)

Mean squared error train: 9.64
R2: -0.01 train
Mean squared error test: 8073.05
R2: -842.96 test


In [113]:
predicted_df1 = pd.DataFrame()
predicted_df1["actual"] = y_test
predicted_df1["predicted"] = y_test_actual
predicted_df1["price"] = X_test.price
predicted_df1['error'] = abs(predicted_df1["actual"] - predicted_df1['predicted'])
predicted_df1.sort_values(by="error", ascending=False).head(10)


Unnamed: 0,actual,predicted,price,error
114972,100,-1.369088,4.975731,101.369088
123545,100,-1.367253,1.684146,101.367253
111758,99,-1.383423,-0.143399,100.383423
116094,99,-1.383141,-0.143399,100.383141
118059,99,-1.375552,0.744823,100.375552
47894,99,-1.373672,1.582743,100.373672
114973,99,-1.373182,3.583668,100.373182
45573,99,-1.367248,2.91402,100.367248
79104,99,-1.366719,3.474429,100.366719
56043,99,-1.36415,4.311846,100.36415


In [115]:
predicted_df = pd.DataFrame()
predicted_df["actual"] = y_train
predicted_df["predicted"] = y_train_actual
predicted_df["price"] = X_train.price
predicted_df['error'] = abs(predicted_df["actual"] - predicted_df['predicted'])
predicted_df.sort_values(by="error", ascending=False).head(20)


Unnamed: 0,actual,predicted,price,error
24646,87,50.29829,-1.583078,36.70171
45798,100,88.212967,3.123825,11.787033
111754,100,88.212967,3.59535,11.787033
42197,100,88.212967,4.397961,11.787033
7335,100,88.212967,3.200484,11.787033
345,100,88.212967,4.003095,11.787033
111756,100,88.212967,4.042987,11.787033
111753,100,88.212967,6.289646,11.787033
118058,100,88.212967,4.397961,11.787033
39286,100,88.212967,4.432494,11.787033


In [43]:
results_df = pd.DataFrame(columns=['name','classifier','alpha','reduction','train_r2','train_mse', 'test_r2', 'test_mse'])

def append_to_results(name, classifier, alpha, reduction, y_test, y_predict, y_train, y_train_predicted):
    test_r2 = r2_score(y_test, y_predicted), 
    test_mse = mean_squared_error(y_test, y_predicted)
    train_r2 = r2_score(y_train, y_train_predicted)
    train_mse = mean_squared_error(y_train, y_train_predicted)
    
    return results_df.append({'name':name, 'classifier':classifier, 'alpha': alpha, 'reduction':reduction, 'test_r2': test_r2, 'test_mse':test_mse, 'train_r2': train_r2, "train_mse": train_mse}, ignore_index=True)

def append_row_to_results(name, classifier, alpha, reduction, test_r2, test_mse, train_r2, train_mse):
    return results_df.append({'name':name, 'classifier':classifier, 'alpha': alpha, 'reduction':reduction, 'test_r2': test_r2, 'test_mse':test_mse, 'train_r2': train_r2, "train_mse": train_mse}, ignore_index=True)


In [None]:
append_to_results("Deep Learning, categorial and numerical features", 

## Model With Text

In [42]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6
from tensorflow.keras import layers

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
X_train.sample(5)

Unnamed: 0,description,price,country,province,taster_name,variety,year
28171,This falls sbetween a rosé and an orange wine ...,0.034663,40,268,15,433,74
25907,"Tight and structured, this wine has minerality...",-0.659546,15,47,16,125,74
86353,Prominent yet not overpowering oak smoke aroma...,-0.143399,40,51,11,125,72
87206,This wine is made to be enjoyed young and frui...,-1.170851,31,108,16,447,73
91790,Just too raisiny and stewed for real satisfact...,-0.417344,40,51,14,325,64


In [43]:
max_vocab_length = 11000 # max words to have in vocabulary
max_length = 40 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [44]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, #how long is each input
                             name="embedding_1") 

embedding

<keras.layers.core.embedding.Embedding at 0x17be92be0a0>

In [57]:
# Define the graph model
# Categorical inputs
# country
cat_input_country = Input(shape=(1,))
embedding_country = Embedding(input_dim=country_len, output_dim=7)(cat_input_country)
flat_embed_country = Flatten()(embedding_country)

#province
cat_input_province = Input(shape=(1,))
embedding_province = Embedding(input_dim=province_len, output_dim=22)(cat_input_province)
flat_embed_province = Flatten()(embedding_province)

#taster_name
cat_input_tester = Input(shape=(1,))
embedding_tester = Embedding(input_dim=tester_len, output_dim=5)(cat_input_tester)
flat_embed_tester = Flatten()(embedding_tester)

#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=variety_len, output_dim=30)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

#year
cat_input_year = Input(shape=(1,))
embedding_year = Embedding(input_dim=year_len, output_dim=9)(cat_input_year)
flat_embed_year = Flatten()(embedding_year)

#text
text_input = Input(shape=(1,), dtype="string")

text_vector = text_vectorizer(text_input)
text_embedding = embedding(text_vector)
text_embedding = layers.LSTM(64)(text_embedding)


# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_country, flat_embed_province, flat_embed_tester, flat_embed_variety, text_embedding, num_input])
#concatenated = Concatenate()([flat_embed_country, flat_embed_province, flat_embed_tester, flat_embed_variety, flat_embed_year, num_input])


# Dense layers for classification
x = Dense(16, activation='relu')(concatenated)
x = Dense(8, activation='relu')(x)
outputs = Dense(1, activation="sigmoid")(x)

# Create the model
model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, text_input, num_input], outputs=outputs)
#model = Model(inputs=[cat_input_country, cat_input_province, cat_input_tester, cat_input_variety, cat_input_year, num_input], outputs=outputs)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_28 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_23 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_25 (InputLayer)          [(None, 1)]          0           []                               
                                                                                            

In [54]:
model.compile(loss=tf.keras.losses.mse,
                          optimizer=tf.keras.optimizers.Adam(),
                          metrics=['mse'])

In [None]:
train_X_data = [X_train['country'], X_train['province'], X_train['taster_name'], X_train['variety'], np.array(X_train['description']), X_train[[price_col_name]]]
#train_X_data = [X_train['country'], X_train['province'], X_train['taster_name'], X_train['variety'], X_train[[price_col_name]]]

#model.compile(optimizer=Adam(learning_rate=0.01),loss='mean_squared_error',metrics=[Accuracy()])

# Train the model
model.fit(train_X_data, y_train,
          batch_size=64,
          epochs=10)

# Evaluate the model
loss, accuracy = model.evaluate(train_X_data, y_train)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


## Try very simple model

In [130]:
#variety
cat_input_variety = Input(shape=(1,))
embedding_variety = Embedding(input_dim=X.variety.nunique(), output_dim=8)(cat_input_variety)
flat_embed_variety = Flatten()(embedding_variety)

# Numerical inputs
num_input = Input(shape=(1,))

# Concatenate categorical embeddings with numerical inputs
concatenated = Concatenate()([flat_embed_variety, num_input])

# Dense layers for classification
x = Dense(32, activation='tanh')(concatenated)
x = Dense(16, activation='tanh')(x)
outputs = Dense(1, activation="linear")(x)

# Create the model
model = Model(inputs=[cat_input_variety, num_input], outputs=outputs)
model.summary()


Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_73 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 embedding_59 (Embedding)       (None, 1, 8)         5608        ['input_73[0][0]']               
                                                                                                  
 flatten_59 (Flatten)           (None, 8)            0           ['embedding_59[0][0]']           
                                                                                                  
 input_74 (InputLayer)          [(None, 1)]          0           []                               
                                                                                           

In [18]:
data = [X_train['variety'], X_train[numeric_col_name]]
data

[104748    703
 101219    561
 82261      80
 36717     705
 128031     62
          ... 
 128106    480
 103694    430
 860        71
 15795      62
 121958    474
 Name: variety, Length: 103976, dtype: int16,
 104748   -0.312389
 101219   -0.261854
 82261     4.791597
 36717    -0.590329
 128031   -0.312389
             ...   
 128106   -0.489260
 103694   -0.135518
 860       0.243491
 15795     2.896553
 121958    0.041353
 Name: price, Length: 103976, dtype: float64]

In [None]:
X_train_small = X_train.head(100000) 

model.compile(optimizer=Adam(),
              loss='mean_squared_error',
              metrics=[Accuracy()])
data = [X_train_small[ X_train_small['variety'],  X_train_small[numerical_col]]]
# Train the model
model.fit( data, y_train.head(100000),
          batch_size=16,
          epochs=10)

# Evaluate the model
loss, accuracy = model.evaluate(data, y_train.head(100000))
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


In [None]:
y_predict = model.predict(X_test)

In [17]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel, RobertaModel, RobertaTokenizer

In [None]:
%%time 
# Text to vectorize
text = "Here is an example paragraph that we will convert into an embedding."

# Add special tokens for BERT (start and end of sentence)
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence
tokenized_text = tokenizer.tokenize(marked_text)

# Map tokens to their index in the tokenizer vocabulary
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert list to torch tensor
  = torch.tensor([indexed_tokens])

# Put everything on the GPU if available and run through the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
model = model.to(device)

with torch.no_grad():
    outputs = model(tokens_tensor)
    # The first element of outputs is the last layer of the model, which can be used as embeddings.
    embeddings = outputs[0]

# Calculate the mean to get sentence vector
mean_embeddings = torch.mean(embeddings, dim=1).cpu().numpy()

print(mean_embeddings.flatten()[:50])

In [None]:
#%pip install "torch>=2.0" --extra-index-url https://download.pytorch.org/whl/cu117 --upgrade --quiet

In [None]:
#!pip install "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" tensorboard scikit-learn --upgrade --quiet

In [21]:
#%pip install transformer --upgrade

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement transformer (from versions: none)
ERROR: No matching distribution found for transformer


In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel, RobertaModel, RobertaTokenizer

In [2]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [3]:
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
%%time 
# Text to vectorize
text = "Here is an example paragraph that we will convert into an embedding."

# Add special tokens for BERT (start and end of sentence)
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

# Map tokens to their index in the tokenizer vocabulary
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)

# Convert list to torch tensor
tokens_tensor  = torch.tensor([indexed_tokens])
print(t)


['[CLS]', 'here', 'is', 'an', 'example', 'paragraph', 'that', 'we', 'will', 'convert', 'into', 'an', 'em', '##bed', '##ding', '.', '[SEP]']
[101, 2182, 2003, 2019, 2742, 20423, 2008, 2057, 2097, 10463, 2046, 2019, 7861, 8270, 4667, 1012, 102]
tensor([[  101,  2182,  2003,  2019,  2742, 20423,  2008,  2057,  2097, 10463,
          2046,  2019,  7861,  8270,  4667,  1012,   102]])
Wall time: 930 µs


In [21]:
# Put everything on the GPU if available and run through the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
model = model.to(device)

with torch.no_grad():
    outputs = model(tokens_tensor)
    # The first element of outputs is the last layer of the model, which can be used as embeddings.
    embeddings = outputs[0]

# Calculate the mean to get sentence vector
mean_embeddings = torch.mean(embeddings, dim=1).cpu().numpy()

print(mean_embeddings.flatten()[:20])

[-0.05517262 -0.05596118  0.2387126  -0.26662704  0.07532007  0.06179006
  0.67365825 -0.04533741 -0.32682377  0.13599311  0.40212578 -0.21838169
 -0.05768028  0.3261929  -0.22031818  0.39030343  0.08599138 -0.03073616
  0.4642036  -0.12418564]


In [9]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

In [10]:
# Load pre-trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [18]:
# Text to vectorize
text = "Here is an example paragraph that we will convert into an embedding. it might help me."

# Tokenize our sentence
input_ids = tokenizer.encode(text, return_tensors='pt')

# Run through the model
outputs = model(input_ids)
# The first ripprelement of outputs is the last layer of the model, which can be used as embeddings.
embeddings = outputs[0]

# Calculate the mean to get sentence vector
mean_embeddings = torch.mean(embeddings, dim=1).cpu().detach().numpy()

In [None]:
def get_embeddings(text):
    input_ids = tokenizer.encode(text, return_tensors='pt')

    # Run through the model
    outputs = model(input_ids)
    # The first ripprelement of outputs is the last layer of the model, which can be used as embeddings.
    embeddings = outputs[0]

    # Calculate the mean to get sentence vector
    mean_embeddings = torch.mean(embeddings, dim=1).cpu().detach().numpy()
    return mean_embeddings

In [26]:
print(input_ids.flatten())
print(embeddings.flatten())
print(embeddings.shape)
print(mean_embeddings.shape)

tensor([ 4342,   318,   281,  1672,  7322,   326,   356,   481, 10385,   656,
          281, 11525, 12083,    13,   340,  1244,  1037,   502,    13])
tensor([-0.2551, -0.1230, -0.2618,  ...,  0.1842, -0.0650,  0.2318],
       device='cuda:0')
torch.Size([1, 17, 768])
(1, 768)
