In [31]:
!pip install transformers



## Import Libraries

In [32]:
import torch
import numpy as np
import pandas as pd


## Import Datasets and Analysis

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
DATASET = 'test'                           # train, valid, test
FILE = f'/content/drive/MyDrive/Own Projects/{DATASET}.csv'
BATCHSIZE = 800                            # 3000, 400, 800    (bc colab free version does crash with more samples)
OUTPUTFILE = f'/content/drive/MyDrive/Own Projects/embeddings-{DATASET}.h5'

df = pd.read_csv(FILE, index_col=0)         # use original index as index
print('number of reviews:', df.shape[0])
df.head()

number of reviews: 87109


Unnamed: 0,business_id,date,text,useful,useful_normalised,business_max_usefulness,review_stars,latitude,longitude,business_stars,review_count
216364,5xdd5KXz1TXmRI47M-VO2A,2016-01-14 22:39:14,Great mexican in Indy. Who would have thought....,2,0.333333,6,5.0,39.769536,-86.156351,4.0,34
216365,o3RGMdSY37_PX1k1vZsXpA,2018-04-11 16:08:19,Awesome food. Location is a bit funky as it ...,0,0.0,29,4.0,27.894854,-82.7137,4.5,541
216370,kZ3L75t_7EqE9kRS6bpWSA,2016-12-13 04:54:55,"For me, I first moved here, I tried to find a ...",0,0.0,14,5.0,39.518405,-119.883758,3.5,72
216379,kQRB8RNwd7cEMcURTS-I7A,2016-07-11 17:18:32,Great brunch spot! Came here with a group of ...,2,0.040816,49,4.0,39.915955,-75.068428,4.0,570
216381,WHRrhggfj5x3URXVrFqVcA,2007-04-10 18:11:31,"For truly authentic Italian food, Cortina shin...",0,0.0,1,4.0,39.529722,-119.812778,3.0,5


## Creating Embeddings of Review Text

Helper Functions

In [35]:
import h5py
from transformers import AutoTokenizer, DistilBertModel

def create_embeddings(tokenizer, model, df, BATCHSIZE):

  chunk = df.iloc[:BATCHSIZE]
  tokens = tokenizer(chunk['text'].tolist(), truncation=True, max_length=95, padding=True, return_tensors='pt')
  outputs = model( **{k: v.to(device) for k, v in tokens.items()})  # first moving token values to gpu

  CLT_embeddings = outputs[0][:, 0].cpu().detach().numpy()          # output[0] is output of last hidden layer, and we take the CLT embedding vector, which summarizes the whole review as a vector

  torch.cuda.empty_cache()

  return CLT_embeddings


def create_hdf5file(CLT_embeddings, BATCHSIZE, OUTPUTFILE):

  chunk = df.iloc[:BATCHSIZE]

  with h5py.File(OUTPUTFILE, 'w') as hf:
    hf.create_dataset('embeddings', data=embeddings_chunk, dtype='f')
    hf.create_dataset('useful_normalised', data=chunk['useful_normalised'].values, dtype='f')


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

Define tokenizer and model

In [36]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

Create embeddings and save as Hdf5 file/format

In [37]:
with torch.no_grad():
  embeddings_chunk = create_embeddings(tokenizer, model, df, BATCHSIZE)
  create_hdf5file(embeddings_chunk, BATCHSIZE, OUTPUTFILE)

In [38]:
f = h5py.File(OUTPUTFILE, 'r')

In [39]:
f['embeddings']

<HDF5 dataset "embeddings": shape (800, 768), type "<f4">

In [40]:
f['embeddings'][0]

array([-6.59827217e-02, -1.63922459e-01, -6.06588554e-04, -7.01348335e-02,
        9.03212503e-02, -1.73280522e-01,  4.73734401e-02,  5.73356330e-01,
        1.07334837e-01, -1.43751070e-01,  1.86554044e-01,  1.18975930e-01,
        1.29541755e-01,  2.75732189e-01, -1.27059445e-01,  1.69593185e-01,
        1.96276620e-01,  2.97825158e-01,  1.38725817e-01, -7.57751539e-02,
        1.38764843e-01, -5.46477556e-01, -1.68276895e-02,  1.66501731e-01,
        1.55774495e-02,  3.61850820e-02, -2.71454304e-01, -8.96358490e-02,
       -1.76718771e-01,  1.34523839e-01,  1.82069987e-01,  2.92969406e-01,
       -1.02987826e-01, -4.00088042e-01,  1.56424075e-01, -2.37292856e-01,
        3.23913485e-01, -2.37663433e-01, -5.58190830e-02,  2.59481966e-01,
       -1.18270390e-01,  2.36597434e-01,  3.29267949e-01,  9.93981063e-02,
        1.71545789e-01, -2.80891638e-02, -2.72225642e+00,  5.33645153e-02,
       -2.65951216e-01, -2.81423420e-01,  6.38395131e-01, -1.65328771e-01,
       -2.00617298e-01,  

In [44]:
f['embeddings'].shape[0]

800

In [46]:
np.array(f['useful_normalised'])

array([0.33333334, 0.        , 0.        , 0.04081633, 0.        ,
       0.        , 0.        , 0.09090909, 1.        , 0.        ,
       0.02222222, 0.25      , 0.06666667, 0.11764706, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.125     ,
       1.        , 1.        , 0.25      , 0.09090909, 0.        ,
       1.        , 0.1       , 0.06976745, 0.125     , 0.11764706,
       0.07142857, 0.04166667, 0.        , 0.14285715, 0.        ,
       0.        , 0.        , 0.5       , 0.        , 1.        ,
       0.        , 0.        , 0.07142857, 0.16666667, 0.        ,
       0.16666667, 0.2       , 0.        , 0.4       , 0.        ,
       0.11111111, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.01265823, 0.        , 0.        , 0.        ,
       0.04545455, 0.        , 0.        , 0.15      , 0.05      ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.06666667, 0.        , 0.4       , 1.        , 0.25   