In [None]:
!pip install gensim



In [None]:
import gensim
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
wv = api.load('word2vec-google-news-300')

vec_king = wv['king']

In [None]:
vec_king

In [None]:
messages = pd.read_csv('/content/SMSSpamCollection.csv',
                    sep='\t',names=["label","message"])

In [None]:
messages

In [None]:
messages.shape

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

In [None]:
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [None]:
words

In [None]:
model = gensim.models.Word2Vec(words)


In [66]:
#To Get All the Vocabulary
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [67]:
model.corpus_count

5569

In [68]:
model.epochs

5

In [69]:
model.wv.similar_by_word('good')

[('all', 0.998698890209198),
 ('did', 0.998610258102417),
 ('day', 0.9985825419425964),
 ('night', 0.9984840750694275),
 ('well', 0.9984728097915649),
 ('morning', 0.9984573125839233),
 ('happy', 0.998445987701416),
 ('where', 0.9984095096588135),
 ('not', 0.9983347654342651),
 ('thing', 0.9982860684394836)]

In [70]:
model.wv['good'].shape

(100,)

In [71]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [82]:
def avg_word_2_vec(doc, model, vector_size=100):
    """
    doc: list of tokens
    model: gensim Word2Vec model
    vector_size: dimension of embeddings
    """
    #
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]

    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # return zero vector if doc has no known words
        return np.zeros(vector_size)

X = []
for i in tqdm(range(len(words))):
    X.append(avg_word_2_vec(words[i], model, vector_size=model.vector_size))

X_New = np.vstack(X)  # this ensures a proper 2D NumPy array


100%|██████████| 5569/5569 [00:00<00:00, 6608.32it/s]


In [83]:
X_New

array([[-0.17268461,  0.16004689,  0.06094725, ..., -0.22659102,
         0.15794179, -0.08324222],
       [-0.16024444,  0.14277247,  0.05348037, ..., -0.2077112 ,
         0.144483  , -0.08364917],
       [-0.1860953 ,  0.17813535,  0.06896932, ..., -0.26444229,
         0.18955745, -0.06264431],
       ...,
       [-0.23850845,  0.22047998,  0.07862368, ..., -0.29790747,
         0.2056367 , -0.11961544],
       [-0.21432175,  0.1966221 ,  0.07370853, ..., -0.27974331,
         0.18678641, -0.10554572],
       [-0.23596148,  0.20793068,  0.08145277, ..., -0.30734187,
         0.20214884, -0.11455157]])

In [84]:
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y = pd.get_dummies(y['label'])
y = y.iloc[:,0].values

In [85]:
y

array([ True,  True, False, ...,  True,  True,  True])

In [87]:
X[0].reshape(1,-1).shape

(1, 100)

In [89]:
import pandas as pd

dfs = []
for i in range(len(X)):
    dfs.append(pd.DataFrame(X[i].reshape(1, -1)))

df = pd.concat(dfs, ignore_index=True)


In [90]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.172685,0.160047,0.060947,0.026473,0.100024,-0.522003,0.143077,0.499449,-0.236194,-0.113846,...,0.341925,0.099916,0.032935,0.023022,0.473451,0.145668,0.117684,-0.226591,0.157942,-0.083242
1,-0.160244,0.142772,0.053480,0.026031,0.092599,-0.461546,0.117459,0.446520,-0.210171,-0.094877,...,0.309334,0.084228,0.026790,0.016079,0.411174,0.125783,0.106708,-0.207711,0.144483,-0.083649
2,-0.186095,0.178135,0.068969,0.032445,0.086707,-0.568107,0.138594,0.496196,-0.245894,-0.141751,...,0.340390,0.103737,0.021391,-0.006330,0.489754,0.132682,0.071067,-0.264442,0.189557,-0.062644
3,-0.235263,0.213820,0.076514,0.043473,0.133363,-0.697674,0.188056,0.675452,-0.322030,-0.144682,...,0.460185,0.130167,0.044031,0.039886,0.627414,0.198965,0.173620,-0.306396,0.209113,-0.123747
4,-0.211427,0.177326,0.072355,0.034069,0.122227,-0.600260,0.159187,0.584245,-0.279732,-0.131392,...,0.400794,0.107458,0.036058,0.035048,0.543581,0.172282,0.140013,-0.276538,0.176782,-0.102825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5564,-0.208424,0.207317,0.091573,0.035888,0.112628,-0.641430,0.173831,0.588146,-0.287091,-0.155019,...,0.407871,0.124332,0.040306,0.005965,0.576381,0.159589,0.103749,-0.291530,0.210456,-0.080908
5565,-0.221839,0.186774,0.070735,0.043098,0.119505,-0.632688,0.164609,0.598906,-0.297274,-0.135008,...,0.415139,0.120493,0.018632,0.021070,0.566270,0.172283,0.122981,-0.294570,0.195600,-0.090119
5566,-0.238508,0.220480,0.078624,0.026358,0.125329,-0.702383,0.197466,0.682841,-0.321827,-0.160492,...,0.464662,0.144775,0.055480,0.046682,0.642215,0.203855,0.175013,-0.297907,0.205637,-0.119615
5567,-0.214322,0.196622,0.073709,0.031029,0.115841,-0.640072,0.171235,0.611064,-0.288930,-0.148368,...,0.412071,0.123259,0.046041,0.031453,0.578274,0.179709,0.137887,-0.279743,0.186786,-0.105546


In [91]:
df['Output'] = y

In [92]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.172685,0.160047,0.060947,0.026473,0.100024,-0.522003,0.143077,0.499449,-0.236194,-0.113846,...,0.099916,0.032935,0.023022,0.473451,0.145668,0.117684,-0.226591,0.157942,-0.083242,True
1,-0.160244,0.142772,0.05348,0.026031,0.092599,-0.461546,0.117459,0.44652,-0.210171,-0.094877,...,0.084228,0.02679,0.016079,0.411174,0.125783,0.106708,-0.207711,0.144483,-0.083649,True
2,-0.186095,0.178135,0.068969,0.032445,0.086707,-0.568107,0.138594,0.496196,-0.245894,-0.141751,...,0.103737,0.021391,-0.00633,0.489754,0.132682,0.071067,-0.264442,0.189557,-0.062644,False
3,-0.235263,0.21382,0.076514,0.043473,0.133363,-0.697674,0.188056,0.675452,-0.32203,-0.144682,...,0.130167,0.044031,0.039886,0.627414,0.198965,0.17362,-0.306396,0.209113,-0.123747,True
4,-0.211427,0.177326,0.072355,0.034069,0.122227,-0.60026,0.159187,0.584245,-0.279732,-0.131392,...,0.107458,0.036058,0.035048,0.543581,0.172282,0.140013,-0.276538,0.176782,-0.102825,True


In [93]:
df.dropna(inplace=True)

In [98]:
X = df.drop(columns='Output', axis = 1)
y = df['Output']

In [99]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [100]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5403,-0.240557,0.19031,0.079198,0.026915,0.131924,-0.638425,0.167194,0.623642,-0.299583,-0.139811,...,0.44392,0.113861,0.036062,0.04794,0.573712,0.210825,0.163027,-0.282901,0.178022,-0.109391
309,-0.205453,0.187167,0.071096,0.026093,0.105095,-0.602969,0.148759,0.549948,-0.266941,-0.14236,...,0.380042,0.112099,0.027949,0.009943,0.529917,0.15449,0.101924,-0.277075,0.192014,-0.082057
1311,-0.236333,0.212511,0.075454,0.035373,0.129273,-0.697759,0.187458,0.679845,-0.316605,-0.157372,...,0.459529,0.129882,0.043706,0.034645,0.629837,0.198435,0.153972,-0.308118,0.195833,-0.124083
1661,-0.20826,0.164982,0.065801,0.030092,0.115387,-0.567664,0.144751,0.546517,-0.265574,-0.126209,...,0.378451,0.100107,0.025504,0.026487,0.509187,0.164821,0.123849,-0.25806,0.165913,-0.088665
3967,-0.115605,0.280363,0.103285,0.001861,0.049781,-0.676926,0.273099,0.67883,-0.311512,-0.187306,...,0.389798,0.187445,0.156829,0.072633,0.645559,0.184337,0.212715,-0.19116,0.19829,-0.159099


In [101]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

In [102]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9640933572710951
              precision    recall  f1-score   support

       False       0.92      0.81      0.86       155
        True       0.97      0.99      0.98       959

    accuracy                           0.96      1114
   macro avg       0.95      0.90      0.92      1114
weighted avg       0.96      0.96      0.96      1114

