The objective of this notebook is to build and save the vectorized representations of the messages using three different methods (BOW, TF-IDF and Word2Vec), so that later notebooks can load them from disk instead of recomputing them, which saves time and resources on my PC.

### Libraries

In [32]:
import pandas as pd
import numpy as np  
from tqdm.notebook import tqdm
tqdm.pandas()
import pickle
import os

In [2]:
# Load the preprocessed dataset from the silver layer; the vectorizers below use its 'Message' column.
df = pd.read_csv(filepath_or_buffer="../../data/silver/df_preprocessed.csv")
df

Unnamed: 0,Category,Message,word_count,char_count
0,0,go jurong point crazy available bugis n great ...,20,111
1,0,ok lar joking wif u oni,6,29
2,1,free entry number wkly comp win fa cup final t...,28,155
3,0,u dun say early hor u c already say,11,49
4,0,nah dont think go usf life around though,13,61
...,...,...,...,...
5149,1,numbernd time tried number contact u u poundnu...,30,160
5150,0,b going esplanade fr home,8,36
5151,0,pity mood soany suggestion,10,57
5152,0,guy bitching acted like id interested buying s...,26,125


## BOW

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Bag-of-words over unigrams, bigrams and trigrams.
# The dataset is small, so capping the vocabulary at 2000 features should be enough.
cv = CountVectorizer(
    max_features=2000,
    ngram_range=(1, 3),
)

# Learn the vocabulary and build the document-term count matrix in one step.
bow_matrix = cv.fit_transform(df["Message"])

# Feature names, in the same order as the columns of bow_matrix.
vocab = cv.get_feature_names_out()
print(vocab)

['abiola' 'able' 'abt' ... 'yr' 'yun' 'yup']


In [19]:
# Print the matrix in its text form: one "(row, column) count" entry per line.
print(bow_matrix)

  (0, 620)	1
  (0, 1339)	1
  (0, 348)	1
  (0, 90)	1
  (0, 173)	1
  (0, 650)	1
  (0, 1953)	1
  (0, 851)	1
  (0, 271)	1
  (0, 646)	1
  (0, 1880)	1
  (1, 1236)	1
  (1, 860)	1
  (1, 827)	1
  (1, 1925)	1
  (2, 570)	1
  (2, 498)	2
  (2, 1122)	3
  (2, 1942)	1
  (2, 312)	1
  (2, 1929)	1
  (2, 351)	1
  (2, 545)	1
  (2, 1225)	1
  (2, 986)	1
  :	:
  (5149, 1132)	1
  (5149, 278)	1
  (5149, 465)	1
  (5149, 1019)	1
  (5149, 1216)	1
  (5149, 1288)	1
  (5149, 327)	1
  (5150, 727)	1
  (5150, 629)	1
  (5150, 569)	1
  (5151, 1053)	1
  (5152, 570)	1
  (5152, 1896)	1
  (5152, 755)	1
  (5152, 901)	1
  (5152, 1099)	1
  (5152, 1576)	1
  (5152, 479)	1
  (5152, 602)	1
  (5152, 177)	1
  (5152, 666)	1
  (5152, 809)	1
  (5152, 1100)	1
  (5153, 1080)	1
  (5153, 1767)	1


In [20]:
# Persist the fitted vectorizer, the count matrix and the vocabulary.
# Create the target folder first so the cell also works on a fresh checkout
# where data/gold/bow does not exist yet (open(..., 'wb') does not create folders).
os.makedirs('../../data/gold/bow', exist_ok=True)

with open('../../data/gold/bow/count_vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)

with open('../../data/gold/bow/bow_matrix.pkl', 'wb') as f:
    pickle.dump(bow_matrix, f)

with open('../../data/gold/bow/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same n-gram range (1 to 3) and vocabulary cap (2000 features) as the BOW vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1, 3),
)

In [22]:
# Fit the TF-IDF vectorizer on the messages and transform them in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Message'])
# Print the matrix in its text form: one "(row, column) weight" entry per line.
print(tfidf_matrix)

  (0, 620)	0.191494314053214
  (0, 1339)	0.2955977793507492
  (0, 348)	0.33411339170167403
  (0, 90)	0.33411339170167403
  (0, 173)	0.3648205024014347
  (0, 650)	0.24292826678909346
  (0, 1953)	0.30540040800912066
  (0, 851)	0.3648205024014347
  (0, 271)	0.3648205024014347
  (0, 646)	0.20012025836823058
  (0, 1880)	0.24713295360076712
  (1, 1236)	0.33241835942254533
  (1, 860)	0.4855196293639528
  (1, 827)	0.6244913682218861
  (1, 1925)	0.5135944458787206
  (2, 570)	0.13978181638107978
  (2, 498)	0.4323670964433752
  (2, 1122)	0.2475453579166414
  (2, 1942)	0.2371970716136136
  (2, 312)	0.2371970716136136
  (2, 1929)	0.1813856717213151
  (2, 351)	0.2445025228016729
  (2, 545)	0.22392263218527264
  (2, 1225)	0.20233972407795353
  (2, 986)	0.1902141322075903
  :	:
  (5149, 1132)	0.20453228564135612
  (5149, 278)	0.20453228564135612
  (5149, 465)	0.2013065856494782
  (5149, 1019)	0.2124109759264187
  (5149, 1216)	0.2124109759264187
  (5149, 1288)	0.2124109759264187
  (5149, 327)	0.2124109

In [23]:
# Feature names learned by the TF-IDF vectorizer.
# Note: this rebinds `vocab`, which previously held the BOW vectorizer's feature names.
vocab = tfidf_vectorizer.get_feature_names_out()
vocab

array(['abiola', 'able', 'abt', ..., 'yr', 'yun', 'yup'], dtype=object)

In [24]:
# Persist the TF-IDF matrix and the fitted vectorizer.
# Create the target folder first so the cell also works on a fresh checkout
# where data/gold/tfidf does not exist yet (open(..., 'wb') does not create folders).
os.makedirs('../../data/gold/tfidf', exist_ok=True)

with open('../../data/gold/tfidf/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

with open('../../data/gold/tfidf/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

## Word2Vec

CountVectorizer and TfidfVectorizer already tokenize the text internally, but Word2Vec does not, so we tokenize the messages manually.

In [None]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# word_tokenize needs the "punkt" tokenizer data. Download it explicitly so the
# notebook does not depend on data left behind by a previous session
# (the call is a no-op when the package is already up to date).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/maldu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Word2Vec expects a list of token lists, so split every message into tokens.
corpus = df["Message"].tolist()

tokenized_corpus = list(map(word_tokenize, corpus))

In [30]:
# Train the Word2Vec embeddings on the tokenized messages.
# NOTE: training with more than one worker thread is not fully deterministic,
# so the vectors can differ slightly between runs.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,  # dimension of each word vector; 100 is fine for a small dataset
    window=5,         # max number of words considered as context of each word (library default)
    min_count=1,      # min frequency of a word in the corpus; 1 keeps every word for now
    # Leave one core free for the rest of the system. os.cpu_count() can return
    # None and is 1 on single-core machines, so clamp to at least one worker
    # (the previous `os.cpu_count() - 1` would raise or ask for zero workers).
    workers=max(1, (os.cpu_count() or 1) - 1),
)

In [35]:
def get_average_word2vec(tokens_list, model, vector_size=100):
    """Return the mean word vector of a tokenized message.

    Tokens that are missing from the model's vocabulary are ignored.

    Args:
        tokens_list: Iterable of tokens (strings) for one message.
        model: Object exposing a ``wv`` mapping from token to vector
            (for example a trained gensim ``Word2Vec`` model).
        vector_size: Length of the zero vector returned when no token is
            known. Only used as a fallback when ``model.wv`` does not expose
            its own ``vector_size``.

    Returns:
        A 1-D numpy array: the element-wise mean of the known tokens'
        vectors, or a zero vector when none of the tokens is known.
    """
    keyed_vectors = model.wv
    # Single pass: keep the vector of every token the model knows.
    known_vectors = [keyed_vectors[token] for token in tokens_list if token in keyed_vectors]
    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Take the dimension from the model when available so the zero vector always
    # has the same length as the averaged vectors, even if the model was trained
    # with a vector size other than the default argument.
    fallback_size = getattr(keyed_vectors, "vector_size", vector_size)
    return np.zeros(fallback_size)

# One averaged vector per message, in the same order as tokenized_corpus.
word2vec_avg = [get_average_word2vec(tokens, model) for tokens in tokenized_corpus]

# Show only a small preview; displaying the whole list floods the notebook output.
word2vec_avg[:3]

[array([-0.09247642,  0.21926787, -0.02439525,  0.05310179,  0.01911096,
        -0.34112933,  0.06791642,  0.484566  , -0.18140234, -0.21175769,
        -0.16163583, -0.33087164, -0.06598108,  0.1214963 ,  0.05770594,
        -0.22958773,  0.07184739, -0.19683032, -0.01506171, -0.47878206,
         0.16916038,  0.10197994,  0.09482161, -0.15696962, -0.00477509,
        -0.02441635, -0.15749584, -0.14842169, -0.19149464, -0.03759189,
         0.22314139,  0.13387924,  0.06865367, -0.21169572, -0.04616764,
         0.22667834,  0.10151024, -0.15224117, -0.1809207 , -0.41232127,
         0.01257502, -0.1699447 , -0.06571644,  0.02083102,  0.22211622,
        -0.11967766, -0.1232561 , -0.01275479,  0.11843196,  0.07870629,
         0.10501518, -0.20895934, -0.12019571, -0.00236538, -0.17507096,
         0.11496824,  0.09657328, -0.1060903 , -0.19053975,  0.1164138 ,
         0.06978011,  0.12432709, -0.02915205, -0.07602463, -0.2019801 ,
         0.18738547,  0.05842023,  0.21457739, -0.2

In [36]:


# Persist the trained model and the per-message averaged vectors.
# Create the target folder first so the cell also works on a fresh checkout
# where data/gold does not exist yet (open(..., 'wb') does not create folders).
os.makedirs('../../data/gold', exist_ok=True)

with open('../../data/gold/word2vec_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open("../../data/gold/word2vec_avg.pkl", "wb") as f:
    pickle.dump(word2vec_avg, f)