## Project

In [1]:
# importing libraries and dataset
import pandas as pd

# Tab seperated data set
messages = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t', names=["label","message"])

In [2]:
print(messages.shape)
messages.head()

(5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Data cleaning and Preprocessing
import re
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Stemming
ps = PorterStemmer()

In [5]:
messages['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
# data cleaning and stemming
corpus = []

for i in range(len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [7]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

# Create Bag Of Words

In [9]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer

## for Binary BOW enable binary=True
cv = CountVectorizer(max_features = 2500, binary = True, ngram_range = (1,2))

## independent features
X = cv.fit_transform(corpus).toarray()

In [10]:
print(X.shape)
X[1]

(5572, 2500)


array([0, 0, 0, ..., 0, 0, 0])

In [11]:
cv.vocabulary_

{'go': 802,
 'point': 1615,
 'crazi': 451,
 'avail': 122,
 'bugi': 229,
 'great': 860,
 'world': 2428,
 'la': 1083,
 'cine': 339,
 'got': 848,
 'wat': 2336,
 'ok': 1478,
 'lar': 1093,
 'joke': 1054,
 'wif': 2393,
 'free': 716,
 'entri': 614,
 'wkli': 2417,
 'comp': 395,
 'win': 2398,
 'cup': 466,
 'final': 678,
 'st': 1975,
 'may': 1265,
 'text': 2077,
 'receiv': 1701,
 'question': 1669,
 'std': 1988,
 'txt': 2196,
 'rate': 1681,
 'appli': 86,
 'free entri': 723,
 'entri wkli': 616,
 'std txt': 1989,
 'txt rate': 2204,
 'rate appli': 1682,
 'dun': 579,
 'say': 1798,
 'earli': 585,
 'alreadi': 55,
 'nah': 1389,
 'think': 2096,
 'goe': 823,
 'usf': 2271,
 'live': 1162,
 'around': 98,
 'though': 2107,
 'freemsg': 736,
 'hey': 918,
 'darl': 486,
 'week': 2362,
 'word': 2423,
 'back': 139,
 'like': 1148,
 'fun': 753,
 'still': 1990,
 'tb': 2057,
 'xxx': 2464,
 'send': 1827,
 'even': 623,
 'brother': 219,
 'speak': 1958,
 'treat': 2174,
 'per': 1546,
 'request': 1737,
 'set': 1849,
 'callert

In [12]:
## Output Features
y = pd.get_dummies(messages['label'])

y = y.iloc[:,0].values

y.shape

(5572,)

In [13]:
## Train Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20)

In [14]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X_train,y_train)


In [15]:
y_pred = spam_detect_model.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score,classification_report

In [17]:
accuracy_score(y_test,y_pred)

0.9802690582959641

In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       142
           1       0.98      0.99      0.99       973

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# Creating The TF-IDF Model

In [19]:
## Train Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(corpus, y, test_size=0.20)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features = 2500, ngram_range = (1,2))

X_train = tv.fit_transform(X_train).toarray()
X_test = tv.transform(X_test).toarray()

In [21]:
tv.vocabulary_

{'lol': 1196,
 'differ': 523,
 'go': 809,
 'tri': 2167,
 'find': 694,
 'everi': 635,
 'real': 1699,
 'life': 1165,
 'photo': 1559,
 'ever': 634,
 'took': 2153,
 'hi': 946,
 'girl': 799,
 'wait': 2313,
 'text': 2072,
 'great': 866,
 'night': 1432,
 'chat': 311,
 'send': 1831,
 'stop': 1985,
 'servic': 1847,
 'chat send': 313,
 'send stop': 1838,
 'stop stop': 1991,
 'thank': 2083,
 'rington': 1758,
 'order': 1509,
 'refer': 1715,
 'number': 1461,
 'mobil': 1336,
 'charg': 306,
 'tone': 2146,
 'arriv': 92,
 'pleas': 1582,
 'call': 225,
 'custom': 460,
 'thank rington': 2086,
 'rington order': 1760,
 'refer number': 1716,
 'number mobil': 1464,
 'mobil charg': 1337,
 'charg tone': 308,
 'tone arriv': 2147,
 'arriv pleas': 93,
 'pleas call': 1583,
 'call custom': 230,
 'custom servic': 462,
 'mth': 1381,
 'updat': 2233,
 'latest': 1129,
 'orang': 1506,
 'camera': 262,
 'video': 2285,
 'phone': 1554,
 'free': 732,
 'save': 1799,
 'weekend': 2368,
 'ye': 2470,
 'opt': 1504,
 'mobil mth': 134

In [22]:
from sklearn.naive_bayes import MultinomialNB

spam_tfidf_model = MultinomialNB().fit(X_train, y_train)

In [23]:
#prediction
y_pred = spam_tfidf_model.predict(X_test)

In [24]:
score = accuracy_score(y_test,y_pred)
print(score)

0.9811659192825112


In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       132
           1       1.00      0.98      0.99       983

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [26]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [27]:
#prediction
y_pred = classifier.predict(X_test)

In [28]:
score = accuracy_score(y_test,y_pred)
print(score)

0.9829596412556054


In [29]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       132
           1       1.00      0.98      0.99       983

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.99      0.98      0.98      1115



# Word2vec Implementation

In [30]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [None]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [31]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [33]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [34]:
corpus = []

for i in range(len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
  review = review.lower()
  review = review.split()
  review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words("english"))]
  review = ' '.join(review)
  corpus.append(review)

In [35]:
from nltk import sent_tokenize

from gensim.utils import simple_preprocess

In [36]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
words = []

for sent in corpus:
  sent_token = sent_tokenize(sent)
  for sent in sent_token:
    words.append(simple_preprocess(sent))

In [41]:
print(len(words))

5564


In [42]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [43]:
# Lets create word2vec from scratch

In [44]:
import gensim

In [45]:
## Lets train Word2vec from scratch
model = gensim.models.Word2Vec(words, window = 5, min_count = 2)

In [46]:
## To Get All the Vocabulary
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [47]:
model.corpus_count

5564

In [48]:
model.epochs

5

In [49]:
model.wv.similar_by_word('good')

[('give', 0.9997060298919678),
 ('day', 0.9996912479400635),
 ('much', 0.9996862411499023),
 ('got', 0.9996612668037415),
 ('like', 0.9996504783630371),
 ('night', 0.999642014503479),
 ('one', 0.9996417164802551),
 ('going', 0.999639093875885),
 ('go', 0.9996389746665955),
 ('get', 0.999637246131897)]

In [50]:
model.wv['good'].shape

(100,)

In [51]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [52]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [53]:
from tqdm import tqdm

In [77]:
#apply for the entire sentences
import numpy as np
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5564/5564 [00:00<00:00, 6582.85it/s]


In [78]:
len(X)

5564

In [79]:
##independent Features
X_new = np.array(X)

  X_new = np.array(X)


In [80]:
messages.shape

(5572, 2)

In [81]:
X[1]

array([-0.06000417,  0.17480853,  0.08391489,  0.03008646,  0.03148688,
       -0.20819724,  0.03857632,  0.35904008, -0.10235371, -0.09693635,
       -0.10715   , -0.2468966 , -0.02655935,  0.09084232,  0.04891178,
       -0.18516213,  0.01367468, -0.23612802, -0.01435464, -0.28996032,
        0.06287982,  0.10808828,  0.06650194, -0.07178103, -0.06503949,
        0.00754649, -0.13274163, -0.10457119, -0.16845009,  0.02926088,
        0.18024585,  0.04043346,  0.0943795 , -0.18530585, -0.07126309,
        0.17923017,  0.02775534, -0.13969597, -0.09765707, -0.25921217,
        0.03925882, -0.1618865 , -0.06941628,  0.05183046,  0.16971016,
       -0.09624703, -0.12065281,  0.019271  ,  0.10974934,  0.14762606,
        0.11005421, -0.19650471, -0.02098496, -0.02440118, -0.09780005,
        0.13815211,  0.09586562, -0.04781207, -0.19507553,  0.03682303,
        0.06639253,  0.09631218, -0.0834561 ,  0.00287221, -0.1776041 ,
        0.11655562,  0.06398328,  0.1431535 , -0.1784984 ,  0.22

In [82]:
X_new.shape

(5564,)

In [83]:
X_new[0].shape

(100,)

In [84]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y = pd.get_dummies(y['label'])
y = y.iloc[:,0].values

In [85]:
y.shape

(5564,)

In [86]:
X[0].reshape(1,-1).shape

(1, 100)

In [87]:
## this is the final independent features
df = pd.DataFrame()
for i in range(0,len(X)):
    df = df.append(pd.DataFrame(X[i].reshape(1,-1)),ignore_index=True)


  df = df.append(pd.DataFrame(X[i].reshape(1,-1)),ignore_index=True)


In [88]:
print(df.shape)
df.head()

(5564, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.092711,0.254042,0.128723,0.046018,0.048242,-0.310111,0.054843,0.53338,-0.147772,-0.14308,...,0.368055,0.147404,0.09382,0.147857,0.330351,0.294077,0.1207,-0.292646,0.087633,-0.077827
1,-0.060004,0.174809,0.083915,0.030086,0.031487,-0.208197,0.038576,0.35904,-0.102354,-0.096936,...,0.246665,0.103528,0.065869,0.097159,0.223594,0.199411,0.083386,-0.197747,0.058268,-0.046604
2,-0.092755,0.245701,0.126208,0.053259,0.052615,-0.294078,0.05123,0.513839,-0.140817,-0.132156,...,0.358745,0.144456,0.096623,0.13744,0.322882,0.286104,0.110742,-0.281927,0.085249,-0.074802
3,-0.114003,0.311637,0.160702,0.06231,0.062999,-0.383238,0.066432,0.662414,-0.181954,-0.177855,...,0.452389,0.178611,0.117638,0.18044,0.40408,0.366662,0.144482,-0.357207,0.106561,-0.095669
4,-0.112336,0.303542,0.155837,0.051323,0.05479,-0.365751,0.066892,0.633269,-0.176314,-0.16768,...,0.433445,0.175359,0.115749,0.169817,0.393709,0.348526,0.145418,-0.34566,0.104364,-0.092157


In [89]:
df['Output'] = y
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.092711,0.254042,0.128723,0.046018,0.048242,-0.310111,0.054843,0.53338,-0.147772,-0.14308,...,0.147404,0.09382,0.147857,0.330351,0.294077,0.1207,-0.292646,0.087633,-0.077827,1
1,-0.060004,0.174809,0.083915,0.030086,0.031487,-0.208197,0.038576,0.35904,-0.102354,-0.096936,...,0.103528,0.065869,0.097159,0.223594,0.199411,0.083386,-0.197747,0.058268,-0.046604,1
2,-0.092755,0.245701,0.126208,0.053259,0.052615,-0.294078,0.05123,0.513839,-0.140817,-0.132156,...,0.144456,0.096623,0.13744,0.322882,0.286104,0.110742,-0.281927,0.085249,-0.074802,0
3,-0.114003,0.311637,0.160702,0.06231,0.062999,-0.383238,0.066432,0.662414,-0.181954,-0.177855,...,0.178611,0.117638,0.18044,0.40408,0.366662,0.144482,-0.357207,0.106561,-0.095669,1
4,-0.112336,0.303542,0.155837,0.051323,0.05479,-0.365751,0.066892,0.633269,-0.176314,-0.16768,...,0.175359,0.115749,0.169817,0.393709,0.348526,0.145418,-0.34566,0.104364,-0.092157,1


In [92]:
y = df['Output']

In [91]:
df.shape

(5564, 101)

In [106]:
## Independent Feature
X = df.drop(['Output'], axis=1)
X.shape

(5564, 100)

In [104]:
df.isnull().sum()

0         23
1         23
2         23
3         23
4         23
          ..
96        23
97        23
98        23
99        23
Output     0
Length: 101, dtype: int64

In [107]:
X.isnull().sum()

0     23
1     23
2     23
3     23
4     23
      ..
95    23
96    23
97    23
98    23
99    23
Length: 100, dtype: int64

In [108]:
## Train Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20)

In [109]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2777,-0.135224,0.382656,0.196717,0.070709,0.062414,-0.468889,0.082317,0.805238,-0.230562,-0.214858,...,0.556438,0.21797,0.141251,0.217492,0.494665,0.445437,0.18795,-0.441202,0.136054,-0.121135
665,-0.06337,0.163429,0.084648,0.031704,0.031279,-0.197475,0.03029,0.345246,-0.095651,-0.091257,...,0.236726,0.091942,0.060538,0.100276,0.213571,0.184952,0.076244,-0.183151,0.058973,-0.051769
2110,-0.103365,0.286279,0.144542,0.061785,0.060524,-0.34185,0.059221,0.600283,-0.16434,-0.161144,...,0.414792,0.170296,0.114587,0.172546,0.382992,0.337189,0.122404,-0.32632,0.100975,-0.092938
3111,-0.090193,0.239082,0.12356,0.041514,0.04108,-0.287619,0.045747,0.503722,-0.140941,-0.131172,...,0.345385,0.140464,0.090264,0.13557,0.3112,0.277317,0.115546,-0.272292,0.082719,-0.070147
4335,-0.145116,0.359396,0.173936,0.061556,0.07619,-0.421952,0.076487,0.741047,-0.199822,-0.195703,...,0.503308,0.20236,0.147002,0.191121,0.478662,0.399704,0.179769,-0.397658,0.117525,-0.109039


In [110]:
y_train

2777    1
665     1
2110    0
3111    1
4335    1
       ..
1229    1
4486    1
4968    1
2842    1
2588    1
Name: Output, Length: 4451, dtype: uint8

In [111]:
from sklearn.ensemble import RandomForestClassifier

classifier=RandomForestClassifier()


In [112]:
X_train.shape,y_train.shape

((4451, 100), (4451,))

In [None]:
classifier.fit(X_train,y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

In [None]:
print(classification_report(y_test,y_pred))