In [1]:
!pip install gensim



In [2]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [4]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

vec_king = wv['king']



In [54]:
import pandas as pd
messages = pd.read_csv("SMSSpamCollection.txt", sep = '\t', names = ["label", "messages"])

In [55]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [56]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
import re

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['messages'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [58]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [59]:
# Dependent Features
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 0].values

In [60]:
# Splitting the dataset into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.2, random_state = 0)

In [61]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess

In [14]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [62]:
words = []

for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))


In [63]:
# Training the Word2Vec Model
model = gensim.models.Word2Vec(words)

In [17]:
# Getting all the vocabulary
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [18]:
# Checking the total number of vocabulary
model.corpus_count

5569

In [19]:
model.epochs

5

In [22]:
# Checking for similarity by word
model.wv.similar_by_word('kid')

[('stuff', 0.9974166750907898),
 ('down', 0.9973229169845581),
 ('doe', 0.9972323775291443),
 ('abt', 0.9972049593925476),
 ('went', 0.9971786141395569),
 ('haha', 0.9971651434898376),
 ('even', 0.9971462488174438),
 ('best', 0.9971432685852051),
 ('eat', 0.9971388578414917),
 ('dat', 0.9971314668655396)]

In [23]:
model.wv.cosine_similarities

In [25]:
model.wv['good'].shape

(100,)

In [45]:
import numpy as np

def avg_word2vec(doc):
  # Filter out words not in the model's vocabulary
  word_vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]

  # Handle cases where no words from the document are in the vocabulary
  if not word_vectors:
    return np.zeros(model.vector_size) # Return a zero vector of the correct dimension
  else:
    return np.mean(word_vectors, axis = 0)

In [27]:
!pip install tqdm



In [46]:
from tqdm import tqdm

In [65]:
# Applying for the entire sentence
X_train = []
for i in tqdm(range(len(words))):
  X_train.append(avg_word2vec(words[i]))

100%|██████████| 5569/5569 [00:00<00:00, 7599.12it/s]


In [66]:
X_train

[array([-0.17083229,  0.23449802,  0.1288175 ,  0.08608662,  0.09821726,
        -0.5054944 ,  0.18064025,  0.45759478, -0.29838848, -0.13513835,
        -0.15023915, -0.36482316, -0.04860174,  0.10183544,  0.21124502,
        -0.1551855 ,  0.11506622, -0.30562016, -0.07421642, -0.5071162 ,
         0.20334572,  0.11420473,  0.07860596, -0.22613852, -0.04004314,
        -0.02470913, -0.21841279, -0.19238637, -0.2651412 ,  0.03151863,
         0.3045282 ,  0.01782894,  0.10501327, -0.1923128 , -0.12781893,
         0.40362817,  0.05581732, -0.1313614 , -0.12578249, -0.4852803 ,
         0.12662035, -0.25512594, -0.16949525,  0.00810995,  0.14293617,
         0.01147992, -0.12002583, -0.01420337,  0.20208415,  0.1368745 ,
         0.16642843, -0.1886221 , -0.05632623,  0.06069094, -0.05562893,
         0.04393887,  0.14085096,  0.01200528, -0.35866916,  0.17968644,
         0.00725678,  0.15046184,  0.00806006, -0.10612639, -0.30236906,
         0.28838527,  0.10038234,  0.22598657, -0.3

In [49]:
len(X_train)

5569

In [50]:
X_new = np.array(X)

In [51]:
X_new.shape

(5569, 100)

ValueError: Found input variables with inconsistent numbers of samples: [5569, 5572]