# <font color=purple>Trying to understand how Word2Vec works</font>

In [9]:
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import logging 

# nltk.download('stopwords') #<-- we run this command to download the stopwords in the project
# nltk.download('punkt') #<-- essential for tokenization
# nltk.download('gutenberg') #<-- corpus for training the model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [1]:
import re
import itertools
import math
import string
import nltk
# Access the Gutenberg Corpus
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np

from nltk.probability import FreqDist

from nltk.corpus import stopwords

# Function from lecturer
## <font color=purple>The function that cleans the data</font>

In [5]:
def clean(inp: str) -> str:

    inp = inp.translate(str.maketrans(string.punctuation, " "*len(string.punctuation)))
    inp = re.sub(r'\s+', ' ', inp.lower())

    return inp

In [6]:
# from a list of sentences
documents = ["If you use a car frequently, the first step to cutting",
             "down your emissions may well be to simply", 
             "fully consider the", 
             "alternatives available to you."
             ]
clean(str(documents))

' if you use a car frequently the first step to cutting down your emissions may well be to simply fully consider the alternatives available to you '

# <font color=purple>To train my own Word2Vec model I need:</font>
1. **Obtain a large corpus of text**: You will need a large amount of text data to train your Word2vec model. You can obtain text data from various sources such as Wikipedia, news articles, or social media posts.
2. **Preprocess the text**: Before training the Word2vec model, you will need to preprocess the text data by removing stop words, punctuations, and other non-essential elements. You can also tokenize the text data into words or phrases to prepare it for training.
3. **Choose a Word2vec algorithm**: There are two main algorithms for training Word2vec models: Continuous Bag of Words (CBOW) and Skip-gram. CBOW predicts a word based on its context, while Skip-gram predicts the context based on a word. Choose the algorithm that best suits your needs.
4. **Train the Word2vec model**: You can train the Word2vec model using popular libraries such as Gensim or TensorFlow. These libraries provide easy-to-use functions to train and test the Word2vec model. During training, the model learns to associate each word in the vocabulary with a vector of real numbers, which represent the word embedding.
5. **Evaluate the Word2vec model**: Once the Word2vec model is trained, you can evaluate its performance using intrinsic or extrinsic evaluation methods. Intrinsic evaluation focuses on evaluating specific aspects of the model in isolation, such as its ability to generate embeddings or to classify sentences based on sentiment. Extrinsic evaluation measures the performance of the model on a specific task, such as language modeling or sentiment analysis.
6. **Use the Word2vec model**: After training and evaluating the Word2vec model, you can use it for various natural language processing tasks, such as information retrieval, text classification, or machine translation.

## <font color=blue>Obtain a large corpus of text</font>
> Попробуем сначала поработать с корпусом Gutenberg (классическая литература), если что, переключимся на Brown (новости на английском)

In [2]:
# Print the categories in the Brown Corpus
print(gutenberg.fileids(), '\n')

# Print the first few sentences of a book
sentences = gutenberg.sents('carroll-alice.txt')
for sentence in sentences[:5]:
    print(sentence)

# Transform data from nested lists to one list
result1 = list(itertools.chain(*sentences))
print(result1[:20])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt'] 

['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']']
['CHAPTER', 'I', '.']
['Down', 'the', 'Rabbit', '-', 'Hole']
['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', "'", 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'bo

## <font color=blue>Preprocess the text</font>

Source: https://towardsdatascience.com/how-to-train-a-word2vec-model-from-scratch-with-gensim-c457d587e031

In [3]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Function that cleans the input text by going to:
    - remove links
    - remove special characters
    - remove numbers
    - remove stopwords
    - convert to lowercase
    - remove excessive white spaces
    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords
    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special characters
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. create tokens
        tokens = nltk.word_tokenize(text)
        # 2. check if it's a stopword
        tokens = [w.lower().strip() for w in tokens if not w.lower() in stopwords.words("english")]
        # return a list of cleaned tokens
        return tokens

In [5]:
df = pd.DataFrame({'sentences': pd.Series(sentences).astype(str)})
df['cleaned'] = df.sentences.apply(lambda x: preprocess_text(x, remove_stopwords=True))
df.head()

Unnamed: 0,sentences,cleaned
0,"['[', 'Alice', ""'"", 's', 'Adventures', 'in', '...","[alice, adventures, wonderland, lewis, carroll]"
1,"['CHAPTER', 'I', '.']",[chapter]
2,"['Down', 'the', 'Rabbit', '-', 'Hole']","[rabbit, hole]"
3,"['Alice', 'was', 'beginning', 'to', 'get', 've...","[alice, beginning, get, tired, sitting, sister..."
4,"['So', 'she', 'was', 'considering', 'in', 'her...","[considering, mind, well, could, hot, day, mad..."


In [6]:
texts = df.cleaned.tolist()
texts[:3]

[['alice', 'adventures', 'wonderland', 'lewis', 'carroll'],
 ['chapter'],
 ['rabbit', 'hole']]

## <font color=blue>Train the Word2Vec model</font>

In [7]:
class Word2Vec:
    def __init__(self, window_size=2, vector_size=100, learning_rate=0.025, epochs=10):
        self.window_size = window_size
        self.vector_size = vector_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.word2id = {}
        self.id2word = {}
        self.W = None
        self.loss_history = []

    def _build_vocab(self, sentences):
        words = []
        for sentence in sentences:
            words.extend(sentence)
        words = sorted(set(words))
        self.word2id = {w: i for i, w in enumerate(words)}
        self.id2word = {i: w for w, i in self.word2id.items()}
        
    def _skipgrams(self, sentence):
        pairs = []
        for i, w in enumerate(sentence):
            for j in range(i - self.window_size, i + self.window_size + 1):
                if j != i and j >= 0 and j < len(sentence):
                    pairs.append((w, sentence[j]))
        print(pairs)
        return pairs

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def _initialize_weights(self):
        self.W = np.random.uniform(-0.5, 0.5, size=(len(self.word2id), self.vector_size))
        self.W_context = np.random.uniform(-0.5, 0.5, size=(len(self.word2id), self.vector_size))

    def _train(self, pairs):
        for epoch in range(self.epochs):
            loss = 0
            for pair in pairs:
                center_word = np.zeros((self.vector_size,))
                context_word = np.zeros((self.vector_size,))
                u = self.word2id[pair[0]]
                v = self.word2id[pair[1]]
                center_word = self.W[u]
                for context in range(len(self.word2id)):
                    if context == v:
                        context_word = self.W_context[context]
                        z = np.dot(center_word, context_word)
                        sig = self._sigmoid(z)
                        e = (1 - int(context == v)) - sig
                        loss += e**2
                        grad_sig = e * self.learning_rate
                        grad_context_word = center_word * grad_sig
                        grad_center_word = context_word * grad_sig
                        self.W[u] -= grad_center_word
                        self.W_context[context] -= grad_context_word
            self.loss_history.append(loss / len(pairs))

    def fit(self, sentences):
        self._build_vocab(sentences)
        pairs = []
        for sentence in sentences:
            pairs.extend(self._skipgrams(sentence))
        self._initialize_weights()
        self._train(pairs)
        
    def get_word_vector(self, word):
        idx = self.word2id[word]
        return {word: self.W[idx]}

    def most_similar(self, word, n=5):
        idx = self.word2id[word]
        word_vector = self.W[idx]
        sim = np.dot(self.W, word_vector)
        closest = np.argsort(sim)[::-1][:n]
        return [(self.id2word[idx], self.W[idx]) for idx in closest]
        # return {word: vec for word, vec in a}


In [None]:
# создание объекта модели
model = Word2Vec(window_size=2, vector_size=10, learning_rate=0.025, epochs=3)

# обучение модели на нашем тексте
model.fit(texts)

In [9]:
model.get_word_vector('hole')

{'hole': array([ 105.06730346,  -38.44200512,  -30.30935   ,   29.11156463,
        -159.95082517, -365.60176447, -192.61766741,  -69.12397709,
         503.97663284,  635.90433587])}

In [10]:
def train(data: str):
    """
    return: w2v_dict: dict
            - key: string (word)
            - value: np.array (embedding)
    """
    return {}