#### Word2Vec

GOAL: Create (or import) unstructured text and use Word2Vec to convert it into vectors.

In [None]:
# libraries

import pandas as pd

import json
import nltk
from nltk.corpus import stopwords

import string

from textblob import Word

from gensim.models import Word2Vec

In [None]:
# import data

# Path to the intents file, relative to the notebook's working directory.
json_file = 'intents.json'

# Open the file through the variable above (the path used to be repeated as a
# second literal, leaving json_file unused) and decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# display the parsed JSON object to inspect its structure
data

The data is a series of patterns and responses.

In [None]:
len(data) # check number of intents (assumes one top-level entry per intent - TODO confirm)

In [None]:
# Turn the parsed JSON into a DataFrame, then collapse each row's collection of
# pattern strings into one comma-separated string so it can be cleaned as text.

df = pd.DataFrame(data)
df['patterns'] = [', '.join(pattern_group) for pattern_group in df['patterns']]

In [None]:
df # quick check: display the frame after the patterns have been joined into strings

In [None]:
# wrangling - clean up the text
#
# Each step below rewrites df['patterns'] in place; every entry is one string.

# identify stopwords; keep the list under its original name and build a set
# from it so the membership test in the stop-word filter is O(1)
stop = stopwords.words('english')
stop_set = set(stop)

# convert all words to lower case (split/join also collapses runs of whitespace)
df['patterns'] = df['patterns'].apply(lambda text: ' '.join(tok.lower() for tok in text.split()))

# drop tokens that are themselves found inside string.punctuation
# (i.e. free-standing punctuation marks)
df['patterns'] = df['patterns'].apply(lambda text: ' '.join(tok for tok in text.split() if tok not in string.punctuation))

# strip every character that is neither a word character nor whitespace
# (commas, question marks, apostrophes, ...). regex=True is required: newer
# pandas treats the pattern as a literal string by default, which would leave
# the punctuation untouched. The raw string avoids invalid-escape warnings.
df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)

# remove tokens made up entirely of digits
df['patterns'] = df['patterns'].apply(lambda text: ' '.join(tok for tok in text.split() if not tok.isdigit()))

# now remove stop words
df['patterns'] = df['patterns'].apply(lambda text: ' '.join(tok for tok in text.split() if tok not in stop_set))

# lemmatization used to remove different forms of the same word
df['patterns'] = df['patterns'].apply(lambda text: " ".join([Word(tok).lemmatize() for tok in text.split()]))

In [None]:
# check: display the whole frame after the text cleanup
df

In [None]:
# check: display only the cleaned 'patterns' column
df['patterns']

In [None]:
# build a model using Word2Vec

# Word2Vec expects an iterable of tokenised sentences, so split every entry of
# df['patterns'] into its words; bigger_list holds one word list per row.
bigger_list = [pattern.split(" ") for pattern in df['patterns']]

# min_count=1 keeps every word, even those that occur only once.
# The embedding-size keyword is `vector_size` in gensim >= 4 and `size` in
# gensim < 4; each version raises TypeError for the other spelling, so try the
# current name first and fall back to the legacy one.
try:
    model = Word2Vec(bigger_list, min_count=1, vector_size=300, workers=4)
except TypeError:
    model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)

In [None]:
bigger_list # check word groupings to see if they make sense (one list of words per row of df['patterns'])

In [None]:
# display the vocabulary from the model
#
# gensim >= 4 exposes the vocabulary as `wv.key_to_index` (the old `wv.vocab`
# attribute was removed); gensim < 4 only has `wv.vocab`. Both are mappings
# keyed by word, so list(...) yields the words under either version.
word_vectors = model.wv
if hasattr(word_vectors, "key_to_index"):
    vocab = list(word_vectors.key_to_index)
else:
    vocab = list(word_vectors.vocab)
vocab

In [None]:
# view vector for a specific word in the vocab list
# (assumes "goodbye" survived the cleanup and is in the vocabulary - TODO confirm)
model.wv["goodbye"]

#### Exercise

Create a visualisation of the word embeddings.

In [None]:
# STEP 1
# store all the word vectors (vocab) in a data frame

# Look the vectors up through model.wv, as the single-word lookup elsewhere in
# this notebook does. Indexing the model object directly (model[...]) is
# deprecated in gensim 3 and not supported in gensim 4.
# Row i of X is the vector for vocab[i].
X = model.wv[vocab]
vocab_df = pd.DataFrame(X, index=vocab)
vocab_df.head()

In [None]:
# check vocab length (number of rows in vocab_df, one per word)
len(vocab_df)

In [None]:
# STEP 2
# collapse data using PCA

import numpy as np

# Correlation matrix between the embedding dimensions (columns of vocab_df)
X_corr = vocab_df.corr()

# Eigen-decomposition. A correlation matrix is symmetric, so use eigh: it
# returns real eigenvalues/eigenvectors, whereas the general-purpose eig can
# return complex values with tiny imaginary parts on matrices like this one.
values, vectors = np.linalg.eigh(X_corr.to_numpy())

# Sort eigenvalues, and the matching eigenvector columns, in descending order.
# Eigenvalues must be reordered from `values` itself; they were previously
# overwritten with rows of the eigenvector matrix.
args = (-values).argsort()
values = values[args]
vectors = vectors[:, args]

# Take the first 2 components, which explain the most variance
new_vectors = vectors[:, :2]

# Project the word vectors onto the 2 new axes -> one (PC1, PC2) pair per word
# NOTE(review): X is projected without centring/standardising even though the
# axes come from the correlation matrix - confirm this is intended.
neww_X = np.dot(X, new_vectors)

In [None]:
# STEP 3 visualise the words

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(13, 7))
ax.scatter(neww_X[:, 0], neww_X[:, 1], linewidths=10, color='blue')
ax.set_xlabel("PC1", size=15)
ax.set_ylabel("PC2", size=15)
ax.set_title("Word Embedding Space", size=20)

# Label each point with its word. Reuse the existing `vocab` list rather than
# re-deriving it from the model: the rows of neww_X come from X, which was built
# by indexing with that same list, so vocab[i] is guaranteed to match row i.
# (Re-deriving it via model.wv.vocab would also fail on gensim >= 4.)
for i, word in enumerate(vocab):
  ax.annotate(word, xy=(neww_X[i, 0], neww_X[i, 1]))

plt.show()

Source references with amendments:

https://www.guru99.com/word-embedding-word2vec.html

https://towardsdatascience.com/visualization-of-word-embedding-vectors-using-gensim-and-pca-8f592a5d3354