# <font color=purple>Trying to understand how Word2Vec works</font>

In [1]:
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import logging 

In [2]:
import re
import itertools
import math
import string
import nltk
# Access the Gutenberg Corpus
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np
import random

from nltk.probability import FreqDist

from nltk.corpus import stopwords

In [3]:
nltk.download('stopwords') #<-- we run this command to download the stopwords in the project
nltk.download('punkt') #<-- essential for tokenization
nltk.download('gutenberg') #<-- corpus for training the model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

# Function from lecturer
## <font color=purple>The function that cleans the data</font>

In [None]:
def clean(inp: str) -> str:

    inp = inp.translate(str.maketrans(string.punctuation, " "*len(string.punctuation)))
    inp = re.sub(r'\s+', ' ', inp.lower())

    return inp

In [None]:
# from a list of sentences
documents = ["If you use a car frequently, the first step to cutting",
             "down your emissions may well be to simply", 
             "fully consider the", 
             "alternatives available to you."
             ]
clean(str(documents))

' if you use a car frequently the first step to cutting down your emissions may well be to simply fully consider the alternatives available to you '

# <font color=purple>To train my own Word2Vec model I need:</font>
1. **Obtain a large corpus of text**: You will need a large amount of text data to train your Word2vec model. You can obtain text data from various sources such as Wikipedia, news articles, or social media posts.
2. **Preprocess the text**: Before training the Word2vec model, you will need to preprocess the text data by removing stop words, punctuations, and other non-essential elements. You can also tokenize the text data into words or phrases to prepare it for training.
3. **Choose a Word2vec algorithm**: There are two main algorithms for training Word2vec models: Continuous Bag of Words (CBOW) and Skip-gram. CBOW predicts a word based on its context, while Skip-gram predicts the context based on a word. Choose the algorithm that best suits your needs.
4. **Train the Word2vec model**: You can train the Word2vec model using popular libraries such as Gensim or TensorFlow. These libraries provide easy-to-use functions to train and test the Word2vec model. During training, the model learns to associate each word in the vocabulary with a vector of real numbers, which represent the word embedding.
5. **Evaluate the Word2vec model**: Once the Word2vec model is trained, you can evaluate its performance using intrinsic or extrinsic evaluation methods. Intrinsic evaluation focuses on evaluating specific aspects of the model in isolation, such as its ability to generate embeddings or to classify sentences based on sentiment. Extrinsic evaluation measures the performance of the model on a specific task, such as language modeling or sentiment analysis.
6. **Use the Word2vec model**: After training and evaluating the Word2vec model, you can use it for various natural language processing tasks, such as information retrieval, text classification, or machine translation.

## <font color=blue>Obtain a large corpus of text</font>
> Попробуем сначала поработать с корпусом Gutenberg (классическая литература), если что, переключимся на Brown (новости на английском)

In [4]:
# Print the books in the Gutenberg Corpus
print(gutenberg.fileids(), '\n')

# Print the first few sentences of a book
sentences = gutenberg.sents('carroll-alice.txt')
for sentence in sentences[:5]:
    print(sentence)

# Transform data from nested lists to one list
result1 = list(itertools.chain(*sentences))
print(result1[:20])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt'] 

['[', 'Alice', "'", 's', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']']
['CHAPTER', 'I', '.']
['Down', 'the', 'Rabbit', '-', 'Hole']
['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', "'", 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'bo

## <font color=blue>Preprocess the text</font>

Source: https://towardsdatascience.com/how-to-train-a-word2vec-model-from-scratch-with-gensim-c457d587e031

In [5]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Function that cleans the input text by going to:
    - remove links
    - remove special characters
    - remove numbers
    - remove stopwords
    - convert to lowercase
    - remove excessive white spaces
    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords
    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special characters
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. create tokens
        tokens = nltk.word_tokenize(text)
        # 2. check if it's a stopword
        tokens = [w.lower().strip() for w in tokens if not w.lower() in stopwords.words("english")]
        # return a list of cleaned tokens
        return tokens

In [6]:
df = pd.DataFrame({'sentences': pd.Series(sentences).astype(str)})
df['cleaned'] = df.sentences.apply(lambda x: preprocess_text(x, remove_stopwords=True))
texts = df.cleaned.tolist()
texts[:3]

[['alice', 'adventures', 'wonderland', 'lewis', 'carroll'],
 ['chapter'],
 ['rabbit', 'hole']]

## <font color=blue>Train the Word2Vec model</font>

### <font color=scarlett>Word2Vec Model Lite</font> Solution

In [None]:
import numpy as np
import string

def tokenize(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split text into tokens (words)
    tokens = text.split()
    return [tokens]

def build_vocab(sentences):
    words = []
    for sentence in sentences:
        words.extend(sentence)
    words = sorted(set(words))
    word2id = {w: i for i, w in enumerate(words)}
    id2word = {i: w for w, i in word2id.items()}
    return word2id, id2word
    
def skipgrams(sentence, window_size):
    pairs = []
    for i, w in enumerate(sentence):
        for j in range(i - window_size, i + window_size + 1):
            if j != i and j >= 0 and j < len(sentence):
                pairs.append((w, sentence[j]))
    return pairs

def initialize_weights(vocabulary_size, vector_size):
    W1 = np.random.uniform(-0.5, 0.5, size=(vocabulary_size, vector_size))
    W2 = np.random.uniform(-0.5, 0.5, size=(vocabulary_size, vector_size))
    return W1, W2

def pred_function(x):
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / np.sum(exp_scores, axis=0)
    
def update_weights(W1, W2, target_word_idx, context_words, dL_dh, dL_du, learning_rate):
    W2 -= learning_rate * np.outer(W1[target_word_idx], dL_du)
    W1[target_word_idx] -= learning_rate * dL_dh

def train_old(pairs, word2id, W1, W2, learning_rate, epochs, vector_size):
    for epoch in range(epochs):
        loss_history = []
        loss = 0
        for pair in pairs:
            center_word = np.zeros((vector_size,))
            context_word = np.zeros((vector_size,))
            u = word2id[pair[0]]
            v = word2id[pair[1]]
            center_word = W1[u]
            for context in range(len(word2id)):
                if context == v:
                    context_word = W2[context]
                    z = np.dot(center_word, context_word)
                    sig = pred_function(z)
                    e = (1 - int(context == v)) - sig
                    loss += e**2
                    grad_sig = e * learning_rate
                    grad_context_word = center_word * grad_sig
                    grad_center_word = context_word * grad_sig
                    W1[u] -= grad_center_word
                    W2[context] -= grad_context_word
        # loss_history.append(loss / len(pairs))
    return W1, W2


def train(data: str):
    # All parameters for training the Word2Vec model
    window_size=3
    vector_size=10
    learning_rate=0.001
    epochs=100

    # Create tokens
    text = tokenize(data)

    # Build vocabulary
    word2id, id2word = build_vocab(text)

    # Generate skip-grams
    pairs = []
    for sentence in data:
        pairs.extend(skipgrams(sentence, window_size))

    # Initialize weights
    W, W_context = initialize_weights(len(word2id), vector_size)

    # Train model
    W, W_context = train_old(pairs, word2id, W, W_context, learning_rate, epochs, vector_size)

    # Create final dictionary
    dict_final = {key: W[word2id[key]] for key in word2id.keys()}

    return dict_final

In [None]:
train(clean(str(documents)))

---
### <font color=scarlett>Word2Vec Model</font> Solution

#### <font color=green>Best option (gensim)</font>

In [None]:
model.wv['hole']

array([-0.07938354, -0.14386038, -0.0769747 ,  0.13086031, -0.19609441],
      dtype=float32)

#### <font color=green>Custom model</font>

In [26]:
# import numpy as np
# import string

def tokenize(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split text into tokens (words)
    tokens = text.split()
    return [tokens]

def build_vocab(sentences):
    words = []
    for sentence in sentences:
        words.extend(sentence)
    words = sorted(set(words))
    word2id = {w: i for i, w in enumerate(words)}
    id2word = {i: w for w, i in word2id.items()}
    return word2id, id2word
    
def skipgrams(sentence, window_size):
    pairs = []
    for i, w in enumerate(sentence):
        for j in range(i - window_size, i + window_size + 1):
            if j != i and j >= 0 and j < len(sentence):
                pairs.append((w, sentence[j]))
    return pairs

def initialize_weights(vocabulary_size, vector_size):
    W1 = np.random.uniform(-0.5, 0.5, size=(vocabulary_size, vector_size))
    W2 = np.random.uniform(-0.5, 0.5, size=(vocabulary_size, vector_size))
    return W1, W2

def pred_function(x):
    # exp_scores = np.exp(x - np.max(x))
    # return exp_scores / np.sum(exp_scores, axis=0)
    return 1 / (1 + np.exp(-x))
    
def update_weights(W1, W2, target_word_idx, context_words, dL_dh, dL_du, learning_rate):
    W2 -= learning_rate * np.outer(W1[target_word_idx], dL_du)
    W1[target_word_idx] -= learning_rate * dL_dh

# def train_old(pairs, word2id, W1, W2, learning_rate, epochs, vector_size, neg_samples=5):
#     vocab_size = len(word2id)
#     for epoch in range(epochs):
#         loss_history = []
#         loss = 0
#         for pair in pairs:
#             center_word_id = word2id[pair[0]]
#             context_word_id = word2id[pair[1]]
#             center_word = W1[center_word_id]

#             # Positive sample
#             context_word = W2[context_word_id]
#             z = np.dot(center_word, context_word)
#             sig = pred_function(z)
#             e = 1 - sig
#             loss += -np.log(sig)
#             grad_sig = e * learning_rate
#             grad_context_word = center_word * grad_sig
#             grad_center_word = context_word * grad_sig
#             W1[center_word_id] -= grad_center_word
#             W2[context_word_id] -= grad_context_word

#             # Negative samples
#             for _ in range(neg_samples):
#                 negative_word_id = random.randint(0, vocab_size - 1)
#                 negative_word = W2[negative_word_id]
#                 z = np.dot(center_word, negative_word)
#                 sig = pred_function(z)
#                 e = -sig
#                 loss += -np.log(1 - sig)
#                 grad_sig = e * learning_rate
#                 grad_negative_word = center_word * grad_sig
#                 grad_center_word = negative_word * grad_sig
#                 W1[center_word_id] -= grad_center_word
#                 W2[negative_word_id] -= grad_negative_word

#         loss_history.append(loss / len(pairs))
#     return W1, W2

def train_old(pairs, word2id, W1, W2, learning_rate, epochs, vector_size):
    for epoch in range(epochs):
        loss_history = []
        loss = 0
        for pair in pairs:
            center_word = np.zeros((vector_size,))
            context_word = np.zeros((vector_size,))
            u = word2id[pair[0]]
            v = word2id[pair[1]]
            center_word = W1[u]
            for context in range(len(word2id)):
                if context == v:
                    context_word = W2[context]
                    z = np.dot(center_word, context_word)
                    sig = pred_function(z)
                    e = (1 - int(context == v)) - sig
                    loss += e**2
                    grad_sig = e * learning_rate
                    grad_context_word = center_word * grad_sig
                    grad_center_word = context_word * grad_sig
                    W1[u] -= grad_center_word
                    W2[context] -= grad_context_word
        # loss_history.append(loss / len(pairs))
    return W1, W2

def train(data: str):
    # All parameters for training the Word2Vec model
    window_size=3
    vector_size=5
    learning_rate=0.025
    epochs=30

    # Create tokens
    # text = tokenize(data)

    # Build vocabulary
    # word2id, id2word = build_vocab(text)
    word2id, id2word = build_vocab(data)

    # Generate skip-grams
    pairs = []
    for sentence in data:
        pairs.extend(skipgrams(sentence, window_size))

    # Initialize weights
    W, W_context = initialize_weights(len(word2id), vector_size)

    # Train model
    W, W_context = train_old(pairs, word2id, W, W_context, learning_rate, epochs, vector_size)

    # Create final dictionary
    dict_final = {key: W[word2id[key]] for key in word2id.keys()}

    return dict_final

In [None]:
model1 = train(texts)

  return 1 / (1 + np.exp(-x))


In [None]:
model1['hole']

In [8]:
model.wv['hole']

array([-0.00364718,  0.0114101 ,  0.00216129,  0.00318004, -0.00370108,
       -0.01124482,  0.01984751,  0.01653038, -0.0095235 , -0.00840776,
       -0.01029086, -0.02910182,  0.00525241,  0.00918678, -0.00335718,
       -0.01273139,  0.0155248 , -0.01784102, -0.00150149, -0.03703127,
        0.00355653, -0.00233406, -0.00379348, -0.00422645, -0.01526637,
       -0.00682587, -0.00961521, -0.01189812, -0.0089136 ,  0.00415363,
        0.00917382,  0.01876477,  0.0096279 , -0.00309083, -0.00613369,
        0.01139245,  0.00194349, -0.00701662,  0.00142381, -0.03124756,
       -0.00095404, -0.01266169, -0.00870354,  0.00075566,  0.02220594,
       -0.00089491, -0.00937902,  0.00529403,  0.01460433, -0.00393121,
        0.00481674, -0.01744236, -0.00675461, -0.01041857, -0.02255509,
        0.01236566,  0.00588562,  0.00314462, -0.00992667,  0.00072798,
        0.00764192, -0.00057403,  0.00081573, -0.01322731, -0.02228702,
        0.00893002,  0.00415908,  0.0079179 , -0.02328489,  0.01

In [7]:
model = Word2Vec(sentences=texts)