# Example of training our CBOW model

## Load data

In [1]:
import pandas as pd

# The first column of the CSV is a saved row index (it appears as an
# "Unnamed: 0" data column when read naively), so use it as the index.
corpus = pd.read_csv('corpus.csv', index_col=0)
corpus

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,Text
0,David Blunkett in quotes\n \n David Blunkett -...
1,Benitez issues warning to Gerrard\n \n Liverpo...
2,Brookside creator's Channel 4 bid\n \n The cre...
3,Brown visits slum on Africa trip\n \n Chancell...
4,Gritty return for Prince of Persia\n \n Still ...
...,...
495,Parker's saxophone heads auction\n \n A saxoph...
496,Reliance unit loses Anil Ambani\n \n Anil Amba...
497,Wal-Mart fights back at accusers\n \n Two big ...
498,MCI shareholder sues to stop bid\n \n A shareh...


## Preprocess Data

In [2]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer        # module for stemming
from nltk.stem import WordNetLemmatizer    # module for lemmatization


from string import punctuation

#Download every NLTK resource used by preprocess_text so the notebook also runs
#on a fresh environment (nltk.download is a no-op for packages already installed):
#  - 'punkt' / 'punkt_tab' : tokenizer models used by word_tokenize
#                            (which of the two is needed depends on the NLTK release)
#  - 'stopwords'           : English stop-word list
#  - 'wordnet'             : data for WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

stemer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

_STOP_WORDS = None  # English stop-word set, filled in on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus file only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """
    Turn a raw document into a list of normalised tokens.

    Steps, in order: tokenize, lowercase, drop punctuation and English
    stop words, stem, lemmatize, strip every non-letter character, and
    finally drop tokens shorter than two characters.

    Input :
        text : string : a string of text
    Output :
        tokens : list : a list of tokens (strings)
    """
    #Loaded once and reused: this function is applied to every document
    stop_words = _english_stop_words()

    #Tokenize the text and lowercase the tokens
    tokens = [token.lower() for token in word_tokenize(text)]

    #Remove punctuation and stopwords.
    #`punctuation` is a string, so `in` is a substring test: it drops single
    #punctuation characters (and any token that is a contiguous run of that string).
    tokens = [token for token in tokens if token not in punctuation and token not in stop_words]

    #Stem the tokens
    tokens = [stemer.stem(token) for token in tokens]

    #Lemmatize the (already stemmed) tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    #Remove special characters
    pattern = r'[^a-zA-Z\s]'  # Keep only ASCII letters and whitespace; digits are removed too
    tokens = [re.sub(pattern, '', token) for token in tokens]

    #Remove empty and single-character tokens left over after the cleanup above
    tokens = [token for token in tokens if len(token)>1]

    return tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#Tokenize and normalise every document; each row of the new column holds a list of tokens
corpus['Preprocessed_text'] = corpus['Text'].map(preprocess_text)

In [4]:
# Vocabulary = distinct tokens over all documents.
# Built with a set comprehension: summing a Series of lists concatenates them
# pairwise (quadratic in corpus size) and fails on an empty corpus.
vocab = {token for tokens in corpus['Preprocessed_text'] for token in tokens}

In [5]:
# Vocabulary size: number of distinct preprocessed tokens
len(vocab)

10788

## Training CBOW Model
Now that we have our vocabulary and corpus, we can train the CBOW model.

In [6]:
#Import CBOW class
from CBOW import CBOW

#Create a CBOW model from the tokenized documents and the vocabulary
cbow = CBOW(
    corpus['Preprocessed_text'],
    vocab,
    window_size=10,
    embedding_dim=100,
)

In [7]:
#Train the model (batch size 32, 2 epochs)
# NOTE(review): the recorded output echoes a `model.fit_generator(...)` line, which
# looks like a deprecation warning raised inside CBOW.train — confirm, and consider
# switching the CBOW class to `model.fit`.
cbow.train(batch_size=32,epochs=2)

  model.fit_generator(generator, steps_per_epoch=steps_per_epoch, epochs=epochs)


Epoch 1/2
Epoch 2/2


In [8]:
import tensorflow as tf

#Take the model object exposed by the CBOW instance
model = cbow.model

#Export it with TensorFlow's SavedModel API
# NOTE(review): the directory name says "Epochs10" but the training call in this
# notebook uses epochs=2 — confirm which one is intended before relying on the name.
tf.saved_model.save(model, "CBOW_V10k_E100_W10_Epochs10")

INFO:tensorflow:Assets written to: CBOW_V10k_E100_W10_Epochs10\assets


## Working with the model

In [9]:
# Accessing the embedding layer
# NOTE(review): assumes the first layer of the model is the embedding layer — confirm against the CBOW class.
embedding_layer = model.layers[0]

# Get the embedding of a word
def get_embedding(word):
    """
    Look up the embedding vector of `word`.

    Input :
        word : string : a token, looked up in cbow.vocab_indexed
    Output :
        the row of the embedding layer's first weight array at the word's index,
        or None (after printing a message) when the word is not in the vocabulary
    """
    try:
        word_index = cbow.vocab_indexed[word]
    except KeyError:
        print(f'KeyError : "{word}" not in the vocabulary.')
        return None
    else:
        weight_matrix = embedding_layer.get_weights()[0]
        return weight_matrix[word_index]

# Look up a token that is not in the vocabulary: prints the message and returns None
get_embedding('interfvent')

KeyError : "interfvent" not in the vocabulary.


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between two vectors
def cos_sim(vector1, vector2):
    """
    Cosine similarity of two vectors.

    Each input is reshaped to a single row before being handed to
    sklearn's cosine_similarity, whose result is returned as-is
    (the recorded output shows it as a 1x1 array).
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    return cosine_similarity(row_a, row_b)

# Get the most similar words to a given word
def most_similar(word, vocab, embedding_layer, topn=5):
    """
    Rank the words of `vocab` by cosine similarity to `word`.

    Input :
        word : string : the query word
        vocab : iterable of strings : candidate words to compare against
        embedding_layer : layer whose first weight array holds the embeddings,
                          with rows addressed through cbow.vocab_indexed
        topn : int : maximum number of results to return
    Output :
        list of (word, similarity) tuples, most similar first, where similarity
        is the value returned by cos_sim. An empty list is returned (after
        printing a message) when `word` is not in the vocabulary.
    """
    # Read the embedding matrix once, from the layer that was passed in,
    # instead of re-fetching all the weights for every candidate word.
    embeddings = embedding_layer.get_weights()[0]
    word_to_index = cbow.vocab_indexed

    try:
        word_embedding = embeddings[word_to_index[word]]
    except KeyError:
        print(f'KeyError : "{word}" not in the vocabulary.')
        return []

    similarities = []
    for w in vocab:
        if w == word:
            continue
        try:
            w_embedding = embeddings[word_to_index[w]]
        except KeyError:
            # Candidate has no embedding row, so it cannot be ranked
            continue
        similarities.append((w, cos_sim(word_embedding, w_embedding)))
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:topn]

# Five nearest neighbours of 'sad'. The vocabulary holds stemmed tokens, so the
# query must be given in the form produced by preprocess_text.
most_similar('sad', vocab, embedding_layer, topn=5)

[('decid', array([[0.3937688]], dtype=float32)),
 ('commonplac', array([[0.38004914]], dtype=float32)),
 ('lifethreaten', array([[0.3661789]], dtype=float32)),
 ('kaprano', array([[0.35764784]], dtype=float32)),
 ('tremor', array([[0.3488307]], dtype=float32))]

> The returned words may not be very similar, because the model was trained on a small corpus for only 2 epochs.