# Vectorization

## Import all needed libraries

In [1]:
# Data handling
import numpy as np
import pandas as pd

# Text processing
import re
import string
import emoji
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
df = pd.read_csv("preprocessed_text.csv")

In [3]:
df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...
1,Good,5,positive,good
2,👍👍,5,positive,thumb up thumb up
3,Good,3,neutral,good
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...


In [4]:
df.isnull().sum()

Content             0
Score               0
Sentiment           0
Content_cleaned    67
dtype: int64

In [5]:
# Only `Content_cleaned` contains missing values (67 rows, see the null counts
# above). Fill just that column so that no other column can be silently
# converted to strings, and assign the result instead of mutating in place.
df['Content_cleaned'] = df['Content_cleaned'].fillna('')

## Bag of Words

This method creates literally a bag of words, without taking into account the semantic meaning of the words or their position in the sentence. First, all the inputs are tokenized. Then from all the unique tokens, the algorithm creates a vocabulary in alphabetical order. For every input sequence, the algorithm creates a vector that has the length of the vocabulary, and the frequency of each token is assigned to the corresponding index. The Bag of Words algorithm is implemented with the CountVectorizer class.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned reviews and build the document-term
# count matrix: one row per review, one column per vocabulary term
bow = vectorizer.fit_transform(df['Content_cleaned'])

# Vocabulary size, followed by the (number of reviews, vocabulary size) shape
print(len(vectorizer.vocabulary_))
print(bow.shape)

# Convert sparse matrix to dense matrix (left disabled).
# NOTE(review): presumably disabled because a dense array of this shape would
# be too large to keep in memory - confirm.
#bow_dense = bow.toarray()

# Add BoW vectors as a new column in the DataFrame: iterating the matrix
# yields one sparse row per review, so each cell holds that review's vector
df['bow'] = list(bow)

# Show the DataFrame
df.head()

31451
(113292, 31451)


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)..."
1,Good,5,positive,good,"(0, 11478)\t1"
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2"
3,Good,3,neutral,good,"(0, 11478)\t1"
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t..."


In [7]:
print(df['Content_cleaned'][2])
print(df['bow'][2])

thumb up thumb up
  (0, 27621)	2
  (0, 29212)	2


In [8]:
# Column j of the Bag of Words matrix corresponds to entry j of the fitted
# vectorizer's feature names. Ask the vectorizer for that mapping directly
# instead of re-deriving it by sorting the vocabulary keys, which only matches
# the column order as long as the vocabulary happens to be stored sorted.
sorted_vocab_keys = vectorizer.get_feature_names_out()
print(f"27621 is {sorted_vocab_keys[27621]}.")
print(f"29212 is {sorted_vocab_keys[29212]}.")

27621 is thumb.
29212 is up.


We notice that the produced vocabulary is of size 31451, while our bag of words has 113292 vectors, each having the size of the vocabulary. 

In the example we see that both words "thumb" and "up" get a value of 2.

Positive: 
- Sequences have a fixed size.

Negative:
- Very high dimensions.
- Order of words or semantic meaning is not preserved.
- If a new sequence contains words that are not part of our vocabulary, those words are simply ignored, so the information they carry is lost.

## TF-IDF


TF-IDF, or Term Frequency- Inverse Document Frequency, is an algorithm that creates a frequency-based vocabulary, like Bag of Words, but unlike that, it takes word importance into consideration. Basically, it considers that if a word is part of a lot of sentences/sequences, then it must not be very important. However, if a word is present in only a few sentences/sequences, then it must be of high importance. This way words that get repeated too often don’t overpower less frequent but important words. The formula for words in a sentence/sequence is as follows:
- TF(x) = (frequency of word 'x' in a sequence)/(total number of words in the sequence).
- IDF(x) = log((total number of sequences)/(number of sequences that contain word 'x')).
- TF-IDF(x) = TF(x) * IDF(x).

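In IDF(x) the document frequency is inverted, so the more common a word is across all documents, the lower its importance is for the current document. Note that scikit-learn's `TfidfVectorizer`, which is used below, applies a variant of these formulas by default: TF is the raw count of the word in the sequence, IDF(x) = ln((1 + n) / (1 + df(x))) + 1 (with n the number of sequences and df(x) the number of sequences containing 'x'), and every resulting vector is scaled to unit L2 norm. This is why a one-word review such as "good" gets the value 1.0.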


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer with its default settings.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer fitted in the
# Bag of Words section; later cells that read `vectorizer` see this object.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the cleaned reviews and build the
# TF-IDF matrix: one row per review, one column per vocabulary term
tfidf = vectorizer.fit_transform(df['Content_cleaned'])

# Vocabulary size, followed by the (number of reviews, vocabulary size) shape
print(len(vectorizer.vocabulary_))
print(tfidf.shape)

# Store row i of the matrix in the DataFrame cell of review i
df['tfidf'] = [tfidf[i] for i in range(tfidf.shape[0])]

# Show the DataFrame
df.head()

31451
(113292, 31451)


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0"
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0"
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)..."


In [10]:
print(df['Content_cleaned'][2])
print(df['tfidf'][2])

thumb up thumb up
  (0, 29212)	0.5498571095671961
  (0, 27621)	0.8352587377923134


In [11]:
# Column j of the TF-IDF matrix corresponds to entry j of the fitted
# vectorizer's feature names. Ask the vectorizer for that mapping directly
# instead of re-deriving it by sorting the vocabulary keys, which only matches
# the column order as long as the vocabulary happens to be stored sorted.
sorted_vocab_keys = vectorizer.get_feature_names_out()
print(f"27621 is {sorted_vocab_keys[27621]}.")
print(f"29212 is {sorted_vocab_keys[29212]}.")

27621 is thumb.
29212 is up.


We see that just like Bag of Words, we have a vocabulary of 31451 size and 113292 vectors of the same size.

In the example we see that unlike Bag of Words, where both words got value 2, the word "thumb" gets a higher value than the word "up", meaning it is of more importance. The word "thumb" must exist in fewer sequences than the word "up", making it more significant.

Positive: 
- Sequences have a fixed size.
- Some word importance is considered, unlike Bag of Words.

Negative:
- Very high dimensions.
- Order of words is still not preserved.
- Again, if a new sequence contains words that are not part of our vocabulary, those words are simply ignored, so the information they carry is lost.

# Word2Vec

Word2Vec is a neural network-based model for learning word embeddings. Unlike the frequency-based vectorization algorithms, its vector representations of words are learned from the contexts in which the words appear. Since every word is represented as an n-dimensional vector, one can imagine that all of the words are mapped to this n-dimensional space in such a manner that words having similar meanings exist in close proximity to one another in this hyperspace.

There are two main ways to implement Word2Vec, CBoW and Skip-Gram.

### CBoW

In CBoW, or Continuous Bag of Words, a NN with a single hidden layer is trained. It takes the context (vicinity) words as input and its goal is to predict the current word. For example, given the sentence "the small kid ate a banana" and vicinity=2, an input to the model can be (small, kid, a, banana) and the output will be "ate". In this algorithm we choose a vicinity number m and then, for every word in our sequences, a dataset is prepared taking the m neighboring words on each side as inputs and the word itself as the target. All words are turned into one-hot encodings. Then a NN with a single hidden layer is trained. In the end, we will not use the actual NN anywhere, but we will use the input-to-hidden weight matrix as the word embeddings matrix. The number of columns of this matrix is the size of the hidden layer, which we can define as a hyperparameter. Let's say in our case the vocabulary is of size 34326. If we choose a hidden layer of size 300, then the word embeddings matrix will be of size 34326x300, since every word is a one-hot vector of size 1x34326. Then we multiply the one-hot vector of a word with the embedding matrix and we get a vector of size 1x300, which is our final goal.

### Skip-Gram

Skip-Gram is the exact mirrored process of CBoW, in the sense that instead of feeding the network context words and trying to predict the current word, we feed the network the current word and it tries to predict the context (vicinity) words. For example, given the sentence "the small kid ate a banana" and vicinity=2, an input to the model can be "ate" and the output will be (small, kid, a, banana). Again, in this algorithm we choose a vicinity number m and then, for every word in our sequences, a dataset is prepared taking the m neighboring words on each side as targets and the word itself as input. Then a NN is trained and the input-to-hidden weights are taken as word embeddings. Then the vectorizing of our dataset is done the same way as in CBoW.

### Differences

Skip-Gram is better when the dataset is small and emphasis on rare words is given. CBoW is better when the dataset is bigger, can better represent frequent words and it is faster to train.


There is the possibility to use pretrained word embeddings or train a new model ourselves. The pretrained usually used is provided by Google. In this notebook we will try both of them and see how they compare, both in vectorizing and later in our models.

In [12]:
from gensim import models

In [13]:
w2v = models.KeyedVectors.load_word2vec_format(
'../GoogleNews-vectors-negative300.bin', binary=True)

In [14]:
def get_average_word2vec(tokens_list, model, vector_size):
    """
    Return the mean embedding of the tokens in ``tokens_list`` known to ``model``.

    A token is used only when ``token in model`` is true; its vector is then
    read with ``model[token]``. Unknown tokens are ignored. When no token is
    known (including an empty ``tokens_list``), a zero vector of length
    ``vector_size`` is returned.
    """
    known_vectors = []
    for token in tokens_list:
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: fall back to the all-zero vector
    if len(known_vectors) == 0:
        return np.zeros(vector_size)

    # Element-wise mean over the collected token vectors
    return np.mean(known_vectors, axis=0)

# Tokenize the text data
df['tokens'] = df['Content_cleaned'].apply(lambda x: x.split())

# Compute the average Word2Vec for each row
vector_size = w2v.vector_size
df['word2vec_pretrained'] = df['tokens'].apply(lambda x: get_average_word2vec(x, w2v, vector_size))

df.head()

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf,tokens,word2vec_pretrained
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)...","[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)...","[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)...","[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535..."


In [15]:
import multiprocessing

def get_average_word2vec2(tokens_list, model, vector_size):
    """
    Return the mean embedding of the tokens in ``tokens_list`` known to ``model.wv``.

    Same contract as averaging over a plain vector lookup, except that the
    vectors are read from the ``wv`` attribute of ``model``. Tokens not in
    ``model.wv`` are ignored; when none is known, a zero vector of length
    ``vector_size`` is returned.
    """
    known_vectors = []
    for token in tokens_list:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Nothing to average: fall back to the all-zero vector
    if len(known_vectors) == 0:
        return np.zeros(vector_size)

    # Element-wise mean over the collected token vectors
    return np.mean(known_vectors, axis=0)

# Define model parameters (shared by the CBoW and the Skip-Gram model below)
vector_size = 300   # Dimensionality of the word vectors
window_size = 5     # Context window size
min_count = 1       # Minimum word frequency
workers = multiprocessing.cpu_count()  # Number of worker threads to use

# NOTE(review): no `seed` is passed to Word2Vec and several worker threads are
# used, so the trained vectors (and the nearest-neighbour lists printed in the
# following cells) will presumably differ between runs - confirm whether
# reproducibility is required.

# Train the CBoW model (sg=0) on the tokenized reviews
cbow = models.Word2Vec(df['tokens'].tolist(), vector_size=vector_size, sg=0, window=window_size, min_count=min_count, workers=workers)

# Save the CBoW model to disk
model_path = "cbow.model"
cbow.save(model_path)

print(f"Model saved at {model_path}")

# Represent each review by the mean of the CBoW vectors of its tokens
df['word2vec_cbow'] = df['tokens'].apply(lambda x: get_average_word2vec2(x, cbow, vector_size))

# Train the Skip-Gram model (sg=1) with the same parameters
skipgram = models.Word2Vec(df['tokens'].tolist(), vector_size=vector_size, sg=1, window=window_size, min_count=min_count, workers=workers)

# Save the Skip-Gram model to disk (`model_path` is reused for the new file)
model_path = "skipgram.model"
skipgram.save(model_path)

print(f"Model saved at {model_path}")

# Represent each review by the mean of the Skip-Gram vectors of its tokens
df['word2vec_skipgram'] = df['tokens'].apply(lambda x: get_average_word2vec2(x, skipgram, vector_size))

df.head()

Model saved at cbow.model
Model saved at skipgram.model


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf,tokens,word2vec_pretrained,word2vec_cbow,word2vec_skipgram
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)...","[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[0.20826791, 0.0046617766, -0.04649164, -0.117...","[0.1006552, 0.2239003, -0.07527001, -0.1536161..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[0.20965211, -0.06506327, -0.08436263, 0.04491..."
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)...","[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[0.12779036, 0.83761936, -0.8175876, 0.9235396...","[-0.019263022, 0.4243198, -0.3776014, -0.08818..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[0.20965211, -0.06506327, -0.08436263, 0.04491..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)...","[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.16519807, 0.1276928, 0.13958092, -0.040613...","[0.06709646, 0.07583677, -0.0061207726, -0.034..."


In [16]:
print(cbow.wv.most_similar("movie"))

[('film', 0.7696489691734314), ('stuff', 0.6302829384803772), ('show', 0.5975726246833801), ('drama', 0.5592541694641113), ('genre', 0.5579929947853088), ('category', 0.5558158755302429), ('series', 0.5527818202972412), ('program', 0.5504778027534485), ('programme', 0.5363008379936218), ('kdrama', 0.5327799916267395)]


In [17]:
print(skipgram.wv.most_similar("movie"))

[('flim', 0.8157294988632202), ('kdramas', 0.79478919506073), ('flick', 0.7657655477523804), ('oldie', 0.7626849412918091), ('sitcom', 0.7523657083511353), ('limitless', 0.7519016861915588), ('trendy', 0.7518705725669861), ('rerun', 0.7517079710960388), ('spectacular', 0.7492549419403076), ('binging', 0.7458791136741638)]


In [18]:
print(w2v.most_similar("movie"))

[('film', 0.8676770329475403), ('movies', 0.8013108372688293), ('films', 0.7363011837005615), ('moive', 0.6830360889434814), ('Movie', 0.6693680286407471), ('horror_flick', 0.6577848792076111), ('sequel', 0.6577793955802917), ('Guy_Ritchie_Revolver', 0.650975227355957), ('romantic_comedy', 0.6413198709487915), ('flick', 0.6321909427642822)]


When calculating the vectors of a sequence, we calculate the word embedding of every token separately and then we average over them to get the sequence vector. This way the overall semantic meaning of the text is captured, while each word contributes to the final vector, allowing the resulting vector to represent the combined meanings of the individual words.

In the above example of the word "movie" we see what similar words the 3 different models are giving. For all 3 of the different embeddings, we see similar words, however CBoW seems to be better than Skip-Gram. We can notice that CBoW gives more frequent and correct words, while Skip-Gram might be giving more rare words. For this reason we will continue with the pretrained version and CBoW.

Positive:
- Sequences have a fixed size.
- The dimensionality is exceptionally reduced, compared to the frequency-based methods.
- Semantics and context are the base element of the vectors.

Negative:
- Order of words is still not preserved.
- Again we work on a finite vocabulary, based on the corpus our model was trained on. If we have a new sequence with new words, they will not be taken into consideration, possibly losing important information.

In [19]:
df = df.drop(columns = ['word2vec_skipgram'])

# GloVe

GloVe, or Global Vectors, is an algorithm that, like Word2Vec, creates word embeddings from the contexts in which words appear. The difference is that while Word2Vec only considers local information according to the surroundings of a word, GloVe captures both local and global statistics. It relies on the word-word co-occurrence probabilities, counted over the whole training corpus with a set window size. Unlike Word2Vec, no predictive NN is trained: the word vectors are fitted directly so that their dot products approximate the logarithm of the co-occurrence counts, and the resulting matrix is taken as the word embedding matrix.

Usually pre-trained GloVe embeddings are used, which are trained on huge datasets. In this notebook we will use the **GloVe 6b**, which is trained on Wikipedia and Gigaword on 6 billion tokens and 400K vocabulary size, and the **GloVe Twitter**, which is trained with 2 billion tweets, 27 billion tokens and 1.2 million vocabulary size. Both embeddings are available in 50, 100 and 200 dimensions (among other sizes). We will use the 100d embeddings to reduce dimensionality. We use the first version as a general-purpose embedding and the second version as a more specialized embedding, since tweets and Netflix reviews can be similar and contain slang and informal language.

In [20]:
# Path to the GloVe embeddings file
glove_file = '../glove.6B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    """
    Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is expected to contain a token followed by its
    coefficients, separated by spaces (or tabs).

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array of its coefficients.

    Raises
    ------
    ValueError
        If a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on ASCII spaces/tabs only. A bare str.split() also splits
            # on Unicode whitespace (e.g. '\x85'), so a token consisting of
            # such a character would vanish, its first coefficient would be
            # stored as the "word" and the vector would be one value short.
            values = re.split(r'[ \t]+', line.strip(' \t\r\n'))
            if values == ['']:
                # Blank line (e.g. at the end of the file): nothing to parse
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_6b = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_6b)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    """
    Return the mean GloVe vector of the tokens in ``tokens_list``.

    Only tokens that are keys of ``embeddings`` contribute; all others are
    ignored. When no token is found (including an empty ``tokens_list``), a
    zero vector of length ``embedding_dim`` is returned.
    """
    known_vectors = []
    for token in tokens_list:
        if token in embeddings:
            known_vectors.append(embeddings[token])

    # Nothing to average: fall back to the all-zero vector
    if len(known_vectors) == 0:
        return np.zeros(embedding_dim)

    # Element-wise mean over the collected token vectors
    return np.mean(known_vectors, axis=0)

# Define the embedding dimension (e.g., 100 for 'glove.6B.100d.txt')
embedding_dim = 100

# Compute the average GloVe vector for each row
df['glove_6B'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_6b, embedding_dim))

df.head()

Loaded 400000 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf,tokens,word2vec_pretrained,word2vec_cbow,glove_6B
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)...","[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[0.20826791, 0.0046617766, -0.04649164, -0.117...","[-0.1198448, 0.12636456, 0.41294017, -0.217294..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)...","[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[0.12779036, 0.83761936, -0.8175876, 0.9235396...","[-0.22568002, 0.342005, 0.248815, -0.577975, -..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)...","[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.16519807, 0.1276928, 0.13958092, -0.040613...","[-0.21259494, -0.062381856, 0.21229614, 0.0178..."


In [21]:
# Path to the GloVe embeddings file
glove_file = '../glove.twitter.27B.100d.txt'

# Load the GloVe embeddings into a dictionary
def load_glove_embeddings(glove_file):
    """
    Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is expected to contain a token followed by its
    coefficients, separated by spaces (or tabs).

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array of its coefficients.

    Raises
    ------
    ValueError
        If a coefficient cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on ASCII spaces/tabs only. A bare str.split() also splits
            # on Unicode whitespace (e.g. '\x85'), so a token consisting of
            # such a character would vanish, its first coefficient would be
            # stored as the "word" and the vector would be one value short.
            values = re.split(r'[ \t]+', line.strip(' \t\r\n'))
            if values == ['']:
                # Blank line (e.g. at the end of the file): nothing to parse
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_twitter = load_glove_embeddings(glove_file)
print(f"Loaded {len(glove_twitter)} word vectors from GloVe.")

# Define a function to get the average GloVe vector for a list of tokens
def get_average_glove(tokens_list, embeddings, embedding_dim):
    """
    Return the mean GloVe vector of the tokens in ``tokens_list``.

    Only tokens that are keys of ``embeddings`` contribute; all others are
    ignored. When no token is found (including an empty ``tokens_list``), a
    zero vector of length ``embedding_dim`` is returned.
    """
    known_vectors = []
    for token in tokens_list:
        if token in embeddings:
            known_vectors.append(embeddings[token])

    # Nothing to average: fall back to the all-zero vector
    if len(known_vectors) == 0:
        return np.zeros(embedding_dim)

    # Element-wise mean over the collected token vectors
    return np.mean(known_vectors, axis=0)

# Embedding dimension; it has to match the file loaded in this cell
# (100 for 'glove.twitter.27B.100d.txt')
embedding_dim = 100

# Represent each review by the mean of the GloVe Twitter vectors of its tokens
df['glove_twitter'] = df['tokens'].apply(lambda x: get_average_glove(x, glove_twitter, embedding_dim))

df.head()

Loaded 1193514 word vectors from GloVe.


Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf,tokens,word2vec_pretrained,word2vec_cbow,glove_6B,glove_twitter
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)...","[plss, stopp, give, screen, limit, like, ur, w...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[0.20826791, 0.0046617766, -0.04649164, -0.117...","[-0.1198448, 0.12636456, 0.41294017, -0.217294...","[0.123258926, 0.09078163, -0.101420276, 0.2712..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)...","[thumb, up, thumb, up]","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[0.12779036, 0.83761936, -0.8175876, 0.9235396...","[-0.22568002, 0.342005, 0.248815, -0.577975, -...","[0.26894343, -0.28983998, 0.164455, -0.166473,..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0",[good],"[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)...","[app, useful, certain, phone, brand, except, p...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.16519807, 0.1276928, 0.13958092, -0.040613...","[-0.21259494, -0.062381856, 0.21229614, 0.0178...","[0.30852813, 0.06642222, -0.07303124, 0.210921..."


In [22]:
from scipy import spatial

def find_closest_embeddings(embeddings_dict, word, top_n=5):
    if word not in embeddings_dict:
        print(f"Word '{word}' not found in the embedding dictionary.")
        return []
    
    embedding = embeddings_dict[word]
    closest_words = sorted(
        embeddings_dict.keys(), 
        key=lambda w: spatial.distance.cosine(embeddings_dict[w], embedding) if len(embeddings_dict[w]) == len(embedding) else float('inf')
    )
    closest_words.remove(word)  # Remove the word itself from the results
    return closest_words[:top_n]


similar_words_6b = find_closest_embeddings(glove_6b, 'movie')
print(f"Words similar to 'movie' in GloVe 6B: {similar_words_6b}")

# Test with GloVe Twitter
similar_words_twitter = find_closest_embeddings(glove_twitter, 'movie')
print(f"Words similar to 'movie' in GloVe Twitter: {similar_words_twitter}")

Words similar to 'movie' in GloVe 6B: ['film', 'movies', 'films', 'hollywood', 'comedy']
Words similar to 'movie' in GloVe Twitter: ['movies', 'episode', 'story', 'trailer', 'watching']


When calculating the vectors of a sequence, we calculate the word embedding of every token separately and then we average over them to get the sequence vector. This way the overall semantic meaning of the text is captured, while each word contributes to the final vector, allowing the resulting vector to represent the combined meanings of the individual words.

In the above example of the word "movie" we see what similar words the 2 different embeddings are giving. For both of the embeddings, we see similar words.

Positive:
- Sequences have a fixed size.
- The dimensionality is exceptionally reduced, compared to the frequency-based methods.
- Semantics and context, both local and global, are the base element of the vectors.
- Big range of embedding sizes.

Negative:
- Order of words is still not preserved.
- Again we work on a finite vocabulary, based on the corpus our model was trained on. If we have a new sequence with new words, they will not be taken into consideration, possibly losing important information.

Finally, we drop the tokens column from our dataset and we have the dataframe with all the different vectorizations.

In [23]:
df = df.drop(columns=['tokens'])
df.head()
#df.to_csv('vectorized_text.csv', index=False)

Unnamed: 0,Content,Score,Sentiment,Content_cleaned,bow,tfidf,word2vec_pretrained,word2vec_cbow,glove_6B,glove_twitter
0,Plsssss stoppppp giving screen limit like when...,2,negative,plss stopp give screen limit like ur watch thi...,"(0, 20804)\t1\n (0, 26026)\t1\n (0, 11312)...","(0, 8867)\t0.18773482827989177\n (0, 18672)...","[0.08365452, 0.0579847, 0.11433671, -0.0025425...","[0.20826791, 0.0046617766, -0.04649164, -0.117...","[-0.1198448, 0.12636456, 0.41294017, -0.217294...","[0.123258926, 0.09078163, -0.101420276, 0.2712..."
1,Good,5,positive,good,"(0, 11478)\t1","(0, 11478)\t1.0","[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
2,👍👍,5,positive,thumb up thumb up,"(0, 27621)\t2\n (0, 29212)\t2","(0, 29212)\t0.5498571095671961\n (0, 27621)...","[0.08703613, 0.07147217, -0.00390625, 0.005859...","[0.12779036, 0.83761936, -0.8175876, 0.9235396...","[-0.22568002, 0.342005, 0.248815, -0.577975, -...","[0.26894343, -0.28983998, 0.164455, -0.166473,..."
3,Good,3,neutral,good,"(0, 11478)\t1","(0, 11478)\t1.0","[0.040527344, 0.0625, -0.017456055, 0.07861328...","[0.1703951, -0.9856324, -0.18964884, -0.521374...","[-0.030769, 0.11993, 0.53909, -0.43696, -0.739...","[0.091552, 0.093336, -0.028113, 0.3699, 0.1895..."
4,"App is useful to certain phone brand ,,,,it is...",1,negative,app useful certain phone brand except phone tr...,"(0, 1571)\t1\n (0, 29353)\t1\n (0, 4536)\t...","(0, 7121)\t0.43645434017309587\n (0, 31200)...","[0.0644662, -0.0806833, -0.0020926339, 0.02535...","[-0.16519807, 0.1276928, 0.13958092, -0.040613...","[-0.21259494, -0.062381856, 0.21229614, 0.0178...","[0.30852813, 0.06642222, -0.07303124, 0.210921..."
