In [2]:
import gensim

In [12]:
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/franzv/googleNews_vectors/GoogleNews-vectors-negative300.bin', binary=True)  

In [13]:
vector = model['easy'] #find numeric vector for a word
vector.shape

(300,)

In [14]:
# find most similar word to a word
model.most_similar('nice')

[('good', 0.6836091876029968),
 ('lovely', 0.6676310896873474),
 ('neat', 0.6616737246513367),
 ('fantastic', 0.6569241881370544),
 ('wonderful', 0.6561347246170044),
 ('terrific', 0.6552367806434631),
 ('great', 0.6454657912254333),
 ('awesome', 0.6404187083244324),
 ('nicer', 0.6302445530891418),
 ('decent', 0.5993332266807556)]

In [15]:
model.similarity('nice','good') # find similarity between two words

0.68360907

In [16]:
# antonyms are also highly similar - can sub with each other
model.similarity('bad','good')

0.7190052

In [17]:
# relationships between multiple words
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]

In [19]:
model.most_similar(positive=['girl', 'dad'], negative=['mom'])

[('boy', 0.808031439781189),
 ('teenager', 0.6755870580673218),
 ('teenage_girl', 0.6386616826057434),
 ('man', 0.6255338191986084),
 ('lad', 0.616614043712616),
 ('schoolgirl', 0.6113480925559998),
 ('schoolboy', 0.6011566519737244),
 ('son', 0.5938458442687988),
 ('father', 0.5887871384620667),
 ('uncle', 0.5734449028968811)]

In [20]:
model.most_similar(positive=['paris','spain'], negative=['france'])

[('madrid', 0.5295541286468506),
 ('dubai', 0.509259819984436),
 ('heidi', 0.48901548981666565),
 ('portugal', 0.48763689398765564),
 ('paula', 0.48557141423225403),
 ('alex', 0.480734646320343),
 ('lohan', 0.4801103472709656),
 ('diego', 0.48010095953941345),
 ('florence', 0.47695302963256836),
 ('costa', 0.4761490225791931)]

In [21]:
model.most_similar(positive=['chair','mother'], negative=['table'])

[('daughter', 0.6066097021102905),
 ('niece', 0.5490824580192566),
 ('granddaughter', 0.540050745010376),
 ('aunt', 0.5397382974624634),
 ('husband', 0.5387389659881592),
 ('sister', 0.5360148549079895),
 ('son', 0.5356959104537964),
 ('wife', 0.5313628911972046),
 ('father', 0.5261732339859009),
 ('grandmother', 0.5253341197967529)]

In [None]:
# includes some stop words but not all
# includes misspelled words
# commonly used words
# includes ### to match digits, and slightly messy with some non-sense words

CREATING BoW and TfIdf from scratch
* goal: read text > preprocess > create data > one hot encoded matrix > train neural network > extract weights
* word embeddings - turns words to vectors, and allows to see similar words in a semantic sense
* pretrained NN have vectors for words (one input layer, one hidden, one output) - determines similarity
* end with pairs of focus and context words with windows
* creating training matrices for word embeddings
    * hyperparameters:
        * window size of context (w) - how many words to look forward and back
        * create pairs for each 'focus' word and 'context' word
* [link](https://towardsdatascience.com/creating-word-embeddings-coding-the-word2vec-algorithm-in-python-using-deep-learning-b337d0ba17a8)

In [22]:
## creating training matrix for word embedding from scratch 
import re

def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''',
    stop_words=['the', 'a', 'and', 'is', 'be', 'will']) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string

In [25]:
# GENERAL PIPELINE FOR CREATING WORD PAIRS GIVEN A LIST OF STRINGS 
# # Defining the window for context
# window = 2

# # Creating a placeholder for the scanning of the word list
# word_lists = []
# all_text = []

# for text in texts:

#     # Cleaning the text
#     text = text_preprocessing(text)

#     # Appending to the all text list
#     all_text += text 

#     # Creating a context dictionary
#     # for i, word in enumerate(text):
#         for w in range(window):
#             # Getting the context that is ahead by *window* words
#             if i + 1 + w < len(text): 
#                 word_lists.append([word] + [text[(i + 1 + w)]])
#             # Getting the context that is behind by *window* words    
#             if i - w - 1 >= 0:
#                 word_lists.append([word] + [text[(i - w - 1)]])

* Creating index for each unique word, after initial creation of data points
* used later for one-hot encoding vectors

In [26]:
def create_unique_word_dict(text:list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict 

In [31]:
temp_text = ["word", "another", "king", "prince"]

In [32]:
create_unique_word_dict(temp_text)

{'another': 0, 'king': 1, 'prince': 2, 'word': 3}

In [33]:
# transform data points using one hot encoding 
# vector size will be equal to a number of unique words in the document 
# code below gets the index, and adds a 1 for when it occurs
# e.g. 'blue' = [1, 0, 0]
# 'car' = [0, 1, 0]
# 'sky' = [0, 0, 1]

In [None]:
# # creating two matrices - one for X (focus words) - one for Y (context words)
# from scipy import sparse
# import numpy as np

# # Defining the number of features (unique words)
# n_words = len(unique_word_dict)

# # Getting all the unique words 
# words = list(unique_word_dict.keys())

# # Creating the X and Y matrices using one hot encoding
# X = []
# Y = []

# for i, word_list in tqdm(enumerate(word_lists)):
#     # Getting the indices
#     main_word_index = unique_word_dict.get(word_list[0])
#     context_word_index = unique_word_dict.get(word_list[1])

#     # Creating the placeholders   
#     X_row = np.zeros(n_words)
#     Y_row = np.zeros(n_words)

#     # One hot encoding the main word
#     X_row[main_word_index] = 1

#     # One hot encoding the Y matrix words 
#     Y_row[context_word_index] = 1

#     # Appending to the main matrices
#     X.append(X_row)
#     Y.append(Y_row)

# # Converting the matrices into an array
# X = np.asarray(X)
# Y = np.asarray(Y)

Arrays are fed into a NN to get the weights from input to hidden layers = embeddings
* at this point the output of the NN is not relevant
* each vector can be plotted to see relationship between words
* revisit for more understanding

#### Implementing Word2Vec using Gensim Library
* using word2vec for word embedding (vectorization) for creating word vectors with Gensim
* word embedding approaches: BoW, TF-IDF, Word2Vec
* BoW 
    * doesn't need a lot of data, 
    * but creates sparse data set
    * no context
* TfIDF 
    * Term frequence = (Number of Occurences of a word)/(Total words in the document)
    * IDF(word) = Log((Total number of documents)/(Number of documents containing the word))
    * IDF value for the word "rain" is 0.1760, since the total number of documents is 3 and rain appears in 2 of them, therefore log(3/2) is 0.1760.
* Word2Vec - relationships between words are learned through a NN
    * SkipGram Model
        * context words predicted from base word
        * love to dance - love and dance predicted from 'to'
    * Continuous Bag of Words 
        * focus words predicted from context
* Pro: semantic meaning retained, context is kept, no sparse vectors, Cons: more complex 

In [35]:
#lxml and bs4 for webscraping wikipedia 
import bs4 as bs
import urllib.request
import re
import nltk

# downloaded wikipedia article, read article content, and parsed with beautiful soup
#  take only ones from paragraphs
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""
# join together all the paragraphs
for p in paragraphs:
    article_text += p.text

In [37]:
# preprocessing - lower, remove non-letters, change whitespace to space
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

#
# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article) # to sentences 

all_words = [nltk.word_tokenize(sent) for sent in all_sentences] # to words

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [38]:
# all_words not contains the list of all the words in the article 
# pass through word2Vec using gensim 

from gensim.models import Word2Vec
# min count - words must appear at least twice in the corpus
word2vec = Word2Vec(all_words, min_count=2)

In [41]:
# to see the dictionary of unique words >=2 
vocabulary = list(word2vec.wv.index_to_key)
print(vocabulary[0:10])

['ai', 'intelligence', 'artificial', 'learning', 'human', 'used', 'research', 'machine', 'use', 'problems']


In [42]:
# words converted to vectors 
# to find vectors of a word
# word2Vec not as affected by size of vocab as in BoW
v1 = word2vec.wv['artificial']

In [44]:
v1[0:10]

array([-0.00194258,  0.00820788, -0.00509809, -0.00056012,  0.00799723,
       -0.00197003, -0.00084592,  0.0153349 , -0.01140489,  0.00494688],
      dtype=float32)

In [45]:
sim_words = word2vec.wv.most_similar('intelligence')

In [46]:
sim_words

[('ai', 0.5406545996665955),
 ('knowledge', 0.459836483001709),
 ('science', 0.4470387399196625),
 ('also', 0.445144921541214),
 ('artificial', 0.4414222538471222),
 ('logic', 0.43143799901008606),
 ('intelligent', 0.42686840891838074),
 ('used', 0.4217250645160675),
 ('rights', 0.4127027988433838),
 ('use', 0.4124809801578522)]