## Word2Vec
* Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality'

In [1]:
!pip install gensim



In [1]:
from gensim.models import Word2Vec, KeyedVectors

In [3]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')
# vec_King = wv['king']

### Instead of loading again and again from api, I stored the model in my local and i can load and from there and use in less time

In [5]:
# Save the model locally
wv.save("C:\\Users\\spurusho\\Downloads\\word2vec-google-news-300.model")


In [7]:
from gensim.models import KeyedVectors

# Load the saved model from the local file system
wv = KeyedVectors.load("C:\\Users\\spurusho\\Downloads\\word2vec-google-news-300.model")


In [9]:
wv.most_similar('happy')

[('glad', 0.7408890724182129),
 ('pleased', 0.6632170677185059),
 ('ecstatic', 0.6626912355422974),
 ('overjoyed', 0.6599286794662476),
 ('thrilled', 0.6514049172401428),
 ('satisfied', 0.6437949538230896),
 ('proud', 0.636042058467865),
 ('delighted', 0.627237856388092),
 ('disappointed', 0.6269949674606323),
 ('excited', 0.6247665286064148)]

In [10]:
import pandas as pd
messages = pd.read_csv("..\\smsSpamCollection\\SMSSpamCollection.txt", sep='\t', names=['label','message'])

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [12]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spurusho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)
    

In [14]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

### Just preprocessing of the data is done till now

In [16]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [17]:
simple_preprocess(corpus[0])

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [18]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [19]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [20]:
wv.index_to_key

['</s>',
 'in',
 'for',
 'that',
 'is',
 'on',
 '##',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have',
 'he',
 'will',
 'has',
 '####',
 'his',
 'an',
 'this',
 'or',
 'their',
 'who',
 'they',
 'but',
 '$',
 'had',
 'year',
 'were',
 'we',
 'more',
 '###',
 'up',
 'been',
 'you',
 'its',
 'one',
 'about',
 'would',
 'which',
 'out',
 'can',
 'It',
 'all',
 'also',
 'two',
 'after',
 'first',
 'He',
 'do',
 'time',
 'than',
 'when',
 'We',
 'over',
 'last',
 'new',
 'other',
 'her',
 'people',
 'into',
 'In',
 'our',
 'there',
 'A',
 'she',
 'could',
 'just',
 'years',
 'some',
 'U.S.',
 'three',
 'million',
 'them',
 'what',
 'But',
 'so',
 'no',
 'like',
 'if',
 'only',
 'percent',
 'get',
 'did',
 'him',
 'game',
 'back',
 'because',
 'now',
 '#.#',
 'before',
 'company',
 'any',
 'team',
 'against',
 'off',
 'This',
 'most',
 'made',
 'through',
 'make',
 'second',
 'state',
 'well',
 'day',
 'season',
 'says',
 'w

In [21]:
len(words)

5569

In [22]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [24]:
import numpy as np

In [28]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([wv[word] for word in doc if word in wv.index_to_key],axis=0)

In [25]:
!pip install tqdm



In [26]:
from tqdm import tqdm

In [29]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [04:47<00:00, 19.36it/s]


In [31]:
len(X)

5569

In [32]:
X[0]

array([ 0.00416946,  0.03563309,  0.03923035,  0.1819191 , -0.02314663,
        0.00548172,  0.08302689, -0.13270569,  0.06464005,  0.0870105 ,
       -0.07676697, -0.16712952, -0.0173912 , -0.0344162 , -0.11128235,
        0.11889648,  0.1491394 ,  0.07105255,  0.03854752, -0.05666542,
       -0.06646839, -0.00803041,  0.06407928,  0.02147675,  0.0188179 ,
        0.00799561, -0.12144899,  0.02794647,  0.05820036, -0.01321411,
       -0.04854202,  0.02028847, -0.03303909, -0.00585175, -0.01535797,
       -0.04593349, -0.0073806 , -0.0299654 ,  0.03400135,  0.03598022,
        0.07830048, -0.07444286,  0.13977051,  0.02076912,  0.01213837,
       -0.05368423, -0.05954361, -0.03142369,  0.04148102,  0.08683777,
       -0.03732395,  0.10115051, -0.03102303,  0.001091  , -0.0726366 ,
        0.03613997, -0.05967331, -0.1007309 ,  0.04097748, -0.08646393,
       -0.0866785 ,  0.08569336, -0.06284332, -0.10270691,  0.01132202,
       -0.08794498, -0.04538727,  0.0563736 , -0.02664852,  0.10

### Finally we acheived word to vector form

In [37]:
X

[array([ 0.00416946,  0.03563309,  0.03923035,  0.1819191 , -0.02314663,
         0.00548172,  0.08302689, -0.13270569,  0.06464005,  0.0870105 ,
        -0.07676697, -0.16712952, -0.0173912 , -0.0344162 , -0.11128235,
         0.11889648,  0.1491394 ,  0.07105255,  0.03854752, -0.05666542,
        -0.06646839, -0.00803041,  0.06407928,  0.02147675,  0.0188179 ,
         0.00799561, -0.12144899,  0.02794647,  0.05820036, -0.01321411,
        -0.04854202,  0.02028847, -0.03303909, -0.00585175, -0.01535797,
        -0.04593349, -0.0073806 , -0.0299654 ,  0.03400135,  0.03598022,
         0.07830048, -0.07444286,  0.13977051,  0.02076912,  0.01213837,
        -0.05368423, -0.05954361, -0.03142369,  0.04148102,  0.08683777,
        -0.03732395,  0.10115051, -0.03102303,  0.001091  , -0.0726366 ,
         0.03613997, -0.05967331, -0.1007309 ,  0.04097748, -0.08646393,
        -0.0866785 ,  0.08569336, -0.06284332, -0.10270691,  0.01132202,
        -0.08794498, -0.04538727,  0.0563736 , -0.0

In [39]:
type(X)
# lets convert it to np array
##independent Features
X_new=np.array(X, dtype=object)

In [41]:
X_new.shape

(5569,)

### The X_new have the data of 5569 rows but now columns, we have to reshape each and every data point

In [42]:
X_new[0].shape

(300,)

### We should make it 1 row 300 col eg. (1, 300)

In [45]:
X_new[0].reshape(1, -1).shape

(1, 300)

In [48]:
len(X)

5569

In [53]:
X[783].shape

()

In [67]:
# lets do it for each row
## this is the final independent features
data = []  # Create an empty list to store reshaped arrays
y = []
for i in range(len(X)):
    if X[i].shape !=():
        data.append(X[i].reshape(1, -1))  # Append reshaped arrays to the list
        y.append(messages['label'][i])

df = pd.DataFrame(np.vstack(data))  # Convert the list to a DataFrame at once


In [68]:
df.shape

(5563, 300)

In [69]:
len(y)

5563

In [70]:
y = np.array(y)
y.shape

(5563,)

In [71]:
y

array(['ham', 'ham', 'spam', ..., 'spam', 'spam', 'ham'], dtype='<U4')

In [72]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [73]:
y

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [78]:
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
      ..
295    0
296    0
297    0
298    0
299    0
Length: 300, dtype: int64

### Ok now all set to go from train_test_split the dataset

In [80]:
X = df

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31)

In [82]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [83]:
classifier.fit(X_train,y_train)

In [84]:
y_pred=classifier.predict(X_test)

In [85]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.8472596585804133


In [86]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.97      0.92       957
           1       0.29      0.06      0.11       156

    accuracy                           0.85      1113
   macro avg       0.58      0.52      0.51      1113
weighted avg       0.78      0.85      0.80      1113

