<a href="https://colab.research.google.com/github/DataJenius/NLPEncodingExperiment/blob/main/python/NLPEncodingExperiment_get_word2vec_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
###########################################
# install all dependencies
!pip install datasets
!pip install gensim==3.8.3
!pip install transformers

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 22.0 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 50.9 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 43.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 55.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.8 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.2-cp37-c

In [None]:
#######################################################################################################
# define our model using StepByStep framework by Daniel Voigt Godoy
# https://pytorchstepbystep.com/
try:
    # importing google.colab raises ModuleNotFoundError outside Colab; in that
    # case nothing is downloaded and config.py must already be importable
    import google.colab
    import requests
    url = 'https://raw.githubusercontent.com/dvgodoy/PyTorchStepByStep/master/config.py'
    # timeout so a stalled connection cannot hang the cell forever
    r = requests.get(url, allow_redirects=True, timeout=60)
    # fail loudly on 4xx/5xx instead of saving an error page as config.py
    r.raise_for_status()
    # the context manager guarantees the file is flushed and closed before
    # it is imported a few lines below
    with open('config.py', 'wb') as config_file:
        config_file.write(r.content)
except ModuleNotFoundError:
    pass
from config import *
config_chapter11()

Downloading files from GitHub repo to Colab...
Finished!


In [None]:
###########################################
# load all dependencies
import os
import pandas as pd
import numpy as np
from datasets import load_dataset, Split

import gensim
from gensim import corpora, downloader
from gensim.parsing.preprocessing import *
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim import downloader

from transformers import BertTokenizer

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset

from stepbystep.v4 import StepByStep

from google.colab import files

In [None]:
###############################################################################################
# pretrained embeddings are fetched through gensim's downloader; the catalogue is at
# https://github.com/RaRe-Technologies/gensim-data

# this is huge: vocab of 3,000,000 tokens, 1,662.8MB -- takes about 7 minutes
pretrained_name = 'word2vec-google-news-300'
word2vec = downloader.load(pretrained_name)



In [None]:
###############################################################################################
# play with embeddings as an example
# raw vector arithmetic: king + woman - man. The result is bound to a name so it can
# be inspected; as a bare expression that is not the last line of the cell it was
# computed and then silently thrown away.
king_woman_minus_man = word2vec["king"] + word2vec["woman"] - word2vec["man"]
# the same analogy answered by similarity search; being the last expression of the
# cell, this list of (token, similarity) pairs is what gets displayed
word2vec.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [None]:
###############################################################################################
# Generate our embedding vocabulary and save as local TXT file 
# using code from https://leanpub.com/pytorch
def make_vocab_from_wv(wv, folder=None, special_tokens=None):
    """Write the vocabulary of `wv` to a `vocab.txt` file, one token per line.

    Args:
        wv: object exposing its tokens, in index order, as the list
            `wv.index2word`. The list itself is never modified.
        folder: directory that receives `vocab.txt`. It is created (parents
            included) when missing. `None` or an empty string means the
            current working directory.
        special_tokens: optional iterable of tokens. Those that are not
            already part of the vocabulary are written, in the given order,
            ahead of the tokens of `wv`; the others are left where they are.

    Returns:
        None. The only effect is the file written to disk.
    """
    if folder:
        # makedirs copes with nested paths and with a folder that already exists
        os.makedirs(folder, exist_ok=True)
        vocab_path = os.path.join(folder, 'vocab.txt')
    else:
        # the default folder=None used to crash inside os.path.join
        vocab_path = 'vocab.txt'

    words = wv.index2word
    if special_tokens is not None:
        to_add = [token for token in special_tokens if token not in words]
        # builds a new list, so wv.index2word keeps its original content
        words = to_add + words

    # explicit UTF-8: the vocabulary may hold non-ASCII tokens, and the platform
    # default encoding is not guaranteed to be able to represent them
    with open(vocab_path, 'w', encoding='utf-8') as f:
        for word in words:
            f.write(f'{word}\n')

# write word2vec_vocab/vocab.txt from the pretrained vectors, asking for the
# [PAD] and [UNK] tokens to be placed ahead of the word2vec tokens
make_vocab_from_wv(
    word2vec,
    folder='word2vec_vocab/',
    special_tokens=['[PAD]', '[UNK]'],
)

In [None]:
###############################################################################################
# add special embeddings for missing tokens [PAD], [UNK]
# just a vector of 0's equal to the number of embedding dimensions (300 for word2vec)
n_special_tokens = 2  # [PAD] and [UNK]; they occupy the first rows of the matrix
# The zeros are created in the dtype of the pretrained matrix. np.zeros defaults to
# float64, and concatenating float64 rows with a narrower matrix promotes the WHOLE
# result to float64, only for .float() below to cast it back to float32.
# (NOTE(review): the pretrained vectors are presumably float32 -- confirm; the values
# produced are the same either way, only the peak memory differs.)
special_embeddings = np.zeros((n_special_tokens, word2vec.vector_size),
                              dtype=word2vec.vectors.dtype)
extended_embeddings = np.concatenate([special_embeddings, word2vec.vectors], axis=0)

# hold special_embeddings in a tensor with embeddings
extended_embeddings = torch.as_tensor(extended_embeddings).float()
torch_embeddings = nn.Embedding.from_pretrained(extended_embeddings)

In [None]:
###############################################################################################
# build a BERT tokenizer on top of the vocab file generated for the selected embeddings
# (every other tokenizer option keeps its library default)
word2vec_tokenizer = BertTokenizer(vocab_file='word2vec_vocab/vocab.txt')

In [None]:
#######################################################
# show how the tokenization works
new_sentences = [
    "I love Luke Skywalker, but I hate Gandalf the Grey.",
    "I love Gandalf the Grey, but I hate Luke Skywalker.",
]

# encode using the tokenizer
encoded = word2vec_tokenizer(
    new_sentences,
    truncation=True,
    padding=True,
    max_length=250,
    add_special_tokens=False,
    return_tensors='pt',
)
new_ids = encoded['input_ids']

# evaluate the tokenization: for every sentence print the text, its ids and
# the tokens those ids map back to
for sentence, ids in zip(new_sentences, new_ids):
    id_list = ids.squeeze().tolist()
    print(sentence)
    print(id_list)
    print(word2vec_tokenizer.convert_ids_to_tokens(id_list))
    print("\n")

I love Luke Skywalker, but I hate Gandalf the Grey.
[4503, 748, 442387, 1553862, 1, 35, 4503, 4295, 881832, 246699, 266393, 13, 653992, 331340, 1]
['i', 'love', 'luke', 'skywalker', '[UNK]', 'but', 'i', 'hate', 'ganda', '##l', '##f', 'the', 'gre', '##y', '[UNK]']


I love Gandalf the Grey, but I hate Luke Skywalker.
[4503, 748, 881832, 246699, 266393, 13, 653992, 331340, 1, 35, 4503, 4295, 442387, 1553862, 1]
['i', 'love', 'ganda', '##l', '##f', 'the', 'gre', '##y', '[UNK]', 'but', 'i', 'hate', 'luke', 'skywalker', '[UNK]']




In [None]:
#################################################################################
# our labelled, raw comment data is on github
all_files = ['https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group1.csv',
             'https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group2.csv',
             'https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group3.csv',
             'https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group4.csv',
             'https://raw.githubusercontent.com/DataJenius/NLPEncodingExperiment/main/data/comments/selected/selected_reddit_comments_group5.csv']

# concat into a single dataframe and shuffle the contents
df_from_each_file = (pd.read_csv(f) for f in all_files)
df_all_data = pd.concat(df_from_each_file, ignore_index=True)
# fixed seed: without random_state the shuffle produces a different row order on
# every run, so the printed preview could never be reproduced
SHUFFLE_SEED = 42
df_all_data = df_all_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(df_all_data.shape)
print(df_all_data.head())

(10000, 8)
   msg_id  token_count  my_group my_role  label       source  \
0   16859           13         5    test      0  /r/StarWars   
1   11549           13         1   train      0  /r/StarWars   
2   44109           16         5    test      1      /r/lotr   
3   10211           40         2   train      0  /r/StarWars   
4   34200           16         5    test      1      /r/lotr   

                                            raw_text  \
0  Yup, almost all guns in Star Wars are based on...   
1  No, that would be the Clone Wars animated movi...   
2  I mean that with the little we know thus far t...   
3  Seriously Robert Rodriguez is by far the worst...   
4  Frodo and Sam parting, and the Army of the Dea...   

                                          clean_text  
0  yup almost all guns in star wars are based on ...  
1  no that would be the clone wars animated movie...  
2  i mean that with the little we know thus far t...  
3  seriously robert rodriguez is by far the wor

In [None]:
####################################################################################################################
# get our word2vec sentence embeddings for all comments
# padding=True pads every row of ids to a common length, so shorter comments
# end in [PAD] ids
all_ids = word2vec_tokenizer(df_all_data['raw_text'].tolist(),
                             truncation=True,
                             padding=True,
                             max_length=500,
                             add_special_tokens=False,
                             return_tensors='pt')['input_ids']

# NOTE: the per-token lookup `torch_embeddings(all_ids)` that used to sit here was
# removed. It materialised one embedding per token of every padded row and the
# result was never used to build the sentence vectors.

# get sentence embedding via boe_mean (Bag O' Embeddings)
# padding_idx keeps [PAD] out of the mean. Without it every row is averaged over the
# padded length, so a comment's vector shrinks by (its token count / padded length)
# and depends on how long the longest comment of the batch happens to be.
boe_mean = nn.EmbeddingBag.from_pretrained(extended_embeddings,
                                           mode='mean',
                                           padding_idx=word2vec_tokenizer.pad_token_id)
sentence_vectors = boe_mean(all_ids)
# one vector per comment, as wide as the embedding matrix
assert sentence_vectors.shape == (len(df_all_data), extended_embeddings.shape[1])

# format these 300 dimensional embeddings into a df with the msg_id
df_sentence_embeddings = pd.DataFrame(sentence_vectors.numpy())
# to_numpy() attaches the ids by position; row i of the vectors comes from row i
# of df_all_data, whatever index df_all_data carries
df_sentence_embeddings["msg_id"] = df_all_data['msg_id'].to_numpy()
print(df_sentence_embeddings.head())
print(df_sentence_embeddings.shape)

# save embeddings to local CSV
file_name = 'all_word2vec_embeddings.csv'
df_sentence_embeddings.to_csv(file_name, index=False)
# hands the CSV to the browser (google.colab helper)
files.download(file_name)

          0         1         2         3         4         5         6  \
0  0.003339  0.003475  0.004821  0.008059 -0.003259 -0.000954  0.001748   
1  0.005939  0.004278 -0.000164  0.006519 -0.000781 -0.000419  0.000534   
2  0.000456  0.004279  0.000278  0.009820 -0.013631  0.001497 -0.000738   
3  0.002580  0.008441  0.001129  0.022415 -0.010199 -0.004328  0.000669   
4  0.000863  0.006457  0.001117  0.006546  0.001303 -0.007947 -0.004253   

          7         8         9  ...       291       292       293       294  \
0 -0.006635  0.003307  0.008280  ...  0.000618 -0.003854  0.004181 -0.002837   
1 -0.006690  0.007150  0.005826  ... -0.000449 -0.005705  0.002481 -0.000282   
2 -0.008488  0.008086  0.011036  ...  0.001485 -0.007454  0.005443 -0.000002   
3 -0.020939  0.018410  0.016906  ...  0.007107 -0.016543  0.016347 -0.007113   
4 -0.013011  0.006457  0.014033  ... -0.003262 -0.011293  0.006958 -0.003605   

        295       296       297       298       299  msg_id  
0 -0.0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>