In [None]:
#!pip install pandas

# 1. Import data

In [11]:
# importing the Dataset

import pandas as pd

messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [5]:
messages.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [12]:
messages.shape

(5572, 2)

In [13]:
messages['message'].loc[451]

'hanks lotsly!'

# 2. Data Preprocessing

In [14]:
#Data cleaning and preprocessing
import re
import nltk
#nltk.download('stopwords')

In [15]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [16]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [17]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

# 3. Text to vectors

## 3.1 BOW

In [25]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()
X.shape

(5572, 2500)

In [26]:
cv.vocabulary_

{'ok lar': 1459,
 'free entri': 641,
 'entri wkli': 552,
 'wkli comp': 2426,
 'cup final': 417,
 'std txt': 1933,
 'txt rate': 2193,
 'rate appli': 1656,
 'freemsg hey': 660,
 'like fun': 1128,
 'treat like': 2138,
 'per request': 1531,
 'request mell': 1722,
 'mell mell': 1260,
 'mell oru': 1261,
 'oru minnaminungint': 1492,
 'minnaminungint nurungu': 1289,
 'nurungu vettam': 1438,
 'vettam set': 2288,
 'set callertun': 1821,
 'callertun caller': 206,
 'caller press': 204,
 'press copi': 1622,
 'copi friend': 389,
 'friend callertun': 663,
 'winner valu': 2414,
 'valu network': 2286,
 'network custom': 1391,
 'reward claim': 1735,
 'claim call': 266,
 'call claim': 159,
 'claim code': 267,
 'valid hour': 2283,
 'entitl updat': 549,
 'updat latest': 2222,
 'latest colour': 1077,
 'colour mobil': 308,
 'free call': 636,
 'call mobil': 181,
 'mobil updat': 1317,
 'updat co': 2221,
 'co free': 287,
 'want talk': 2344,
 'chanc win': 241,
 'win cash': 2401,
 'txt csh': 2177,
 'repli hl': 17

In [24]:
y=pd.get_dummies(messages['label']) #returns false or true
y=y.iloc[:,1].values
y

array([False, False,  True, ..., False, False, False])

In [27]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [28]:
X_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([False, False, False, ...,  True, False, False]))

In [29]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [30]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.9730941704035875


In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       985
        True       0.81      1.00      0.90       130

    accuracy                           0.97      1115
   macro avg       0.91      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



## 3.2 TF-IDF

In [42]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
tv.vocabulary_

{'ok lar': 1485,
 'free entri': 631,
 'entri wkli': 548,
 'wkli comp': 2411,
 'cup final': 383,
 'std txt': 1946,
 'txt rate': 2177,
 'rate appli': 1680,
 'freemsg hey': 653,
 'like fun': 1129,
 'per request': 1541,
 'request mell': 1739,
 'mell mell': 1268,
 'mell oru': 1269,
 'oru minnaminungint': 1518,
 'minnaminungint nurungu': 1306,
 'nurungu vettam': 1464,
 'vettam set': 2273,
 'set callertun': 1839,
 'callertun caller': 211,
 'caller press': 209,
 'press copi': 1646,
 'copi friend': 347,
 'friend callertun': 656,
 'winner valu': 2399,
 'valu network': 2270,
 'network custom': 1416,
 'custom select': 396,
 'reward claim': 1752,
 'claim call': 265,
 'call claim': 164,
 'claim code': 266,
 'valid hour': 2267,
 'entitl updat': 545,
 'updat latest': 2203,
 'latest colour': 1098,
 'free call': 624,
 'call mobil': 186,
 'mobil updat': 1339,
 'updat co': 2202,
 'co free': 280,
 'want talk': 2329,
 'chanc win': 246,
 'win cash': 2386,
 'csh send': 378,
 'repli hl': 1726,
 'hl info': 927,

In [44]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [45]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [46]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [47]:
score=accuracy_score(y_test,y_pred)
print(score)

0.957847533632287


In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1002
        True       0.71      1.00      0.83       113

    accuracy                           0.96      1115
   macro avg       0.85      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115



In [49]:
from sklearn.ensemble import RandomForestClassifier
cls=RandomForestClassifier()
cls.fit(X_train,y_train)

In [51]:
#prediction
y_pred=cls.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9695067264573991


In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       987
        True       0.79      0.99      0.88       128

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.93      1115
weighted avg       0.98      0.97      0.97      1115



## 3.3 Word2vec Implementation

Word2vec can be:
1. CBOW
2. Skip gram

but also:

1. Pretrained model (like google new 300)
2. Train the model from scratch


Word2vec convert every word in a 300 dimension vector, if we have a sentence of 5 words we will have 5 vectors of 300 dimensions, how to solve this issue? **Avg Word2vec**, here we obtain the average of all words for each dimension

### 3.3.1 Pretrained model

In [None]:
#!pip install gensim

In [53]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [54]:
vec_king = wv['king']

In [55]:
vec_king.shape

(300,)

### 3.3.2 Word2vec model from scratch

In [60]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [61]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [59]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [62]:
simple_preprocess(corpus[0])

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [63]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [65]:
words[:10]

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [66]:
### Lets train Word2vec from scratch
import gensim
model=gensim.models.Word2Vec(words,window=5, vector_size=100,min_count=2) #window used for cbow and skipgram , min_count= words that appear at least n times

In [67]:
model.wv.index_to_key #vocabulary

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [68]:
model.corpus_count

5564

In [69]:
model.epochs

5

In [72]:
model.wv.similar_by_word('prize')

[('claim', 0.9994703531265259),
 ('line', 0.9991841912269592),
 ('call', 0.9991782307624817),
 ('draw', 0.9990257024765015),
 ('cash', 0.9990105032920837),
 ('land', 0.9989708065986633),
 ('show', 0.9989550113677979),
 ('guaranteed', 0.9988994002342224),
 ('service', 0.9988865256309509),
 ('please', 0.998782753944397)]

In [70]:
model.wv.similar_by_word('kid')

[('work', 0.9976378083229065),
 ('much', 0.9975926876068115),
 ('money', 0.9974920153617859),
 ('going', 0.9974744319915771),
 ('went', 0.997450053691864),
 ('really', 0.9974471926689148),
 ('would', 0.9974337816238403),
 ('said', 0.9974328279495239),
 ('sent', 0.9974233508110046),
 ('day', 0.9974201917648315)]

In [71]:
model.wv['kid'].shape

(100,)

### 3.3.3 Average word2vec

In [73]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)
        
    
    

In [None]:
#!pip install tqdm

In [85]:
from tqdm import tqdm #to see a progress bar in a loop
import numpy as np

In [82]:
#words , contains all the sentences that are already divided intro tokens
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [83]:
len(words)

5564

In [80]:
type(model.wv.index_to_key)

list

In [86]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    print("Hello",i)
    X.append(avg_word2vec(words[i]))

    

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


100%|██████████| 5564/5564 [00:00<00:00, 13195.31it/s]

Hello 2684
Hello 2685
Hello 2686
Hello 2687
Hello 2688
Hello 2689
Hello 2690
Hello 2691
Hello 2692
Hello 2693
Hello 2694
Hello 2695
Hello 2696
Hello 2697
Hello 2698
Hello 2699
Hello 2700
Hello 2701
Hello 2702
Hello 2703
Hello 2704
Hello 2705
Hello 2706
Hello 2707
Hello 2708
Hello 2709
Hello 2710
Hello 2711
Hello 2712
Hello 2713
Hello 2714
Hello 2715
Hello 2716
Hello 2717
Hello 2718
Hello 2719
Hello 2720
Hello 2721
Hello 2722
Hello 2723
Hello 2724
Hello 2725
Hello 2726
Hello 2727
Hello 2728
Hello 2729
Hello 2730
Hello 2731
Hello 2732
Hello 2733
Hello 2734
Hello 2735
Hello 2736
Hello 2737
Hello 2738
Hello 2739
Hello 2740
Hello 2741
Hello 2742
Hello 2743
Hello 2744
Hello 2745
Hello 2746
Hello 2747
Hello 2748
Hello 2749
Hello 2750
Hello 2751
Hello 2752
Hello 2753
Hello 2754
Hello 2755
Hello 2756
Hello 2757
Hello 2758
Hello 2759
Hello 2760
Hello 2761
Hello 2762
Hello 2763
Hello 2764
Hello 2765
Hello 2766
Hello 2767
Hello 2768
Hello 2769
Hello 2770
Hello 2771
Hello 2772
Hello 2773
Hello 2774




In [87]:
type(X)

list

In [95]:
X[0]

array([-0.09063411,  0.2739008 ,  0.13847165,  0.01910704,  0.06158456,
       -0.31982502,  0.07493274,  0.5158162 , -0.16303189, -0.13175058,
       -0.15525693, -0.3552866 , -0.04278528,  0.12632532,  0.08431912,
       -0.26551464,  0.02103498, -0.34110245,  0.00399906, -0.4253128 ,
        0.08590871,  0.15084715,  0.0877763 , -0.10872332, -0.09372752,
        0.00686074, -0.19961768, -0.1549912 , -0.24619122,  0.03278942,
        0.3056211 ,  0.038424  ,  0.12931514, -0.2595466 , -0.13065143,
        0.28192523,  0.00629835, -0.23300491, -0.15449692, -0.42923102,
        0.04149439, -0.2297587 , -0.06233951,  0.0665985 ,  0.2520597 ,
       -0.13262689, -0.19926365,  0.01802987,  0.11240046,  0.22651273,
        0.1544586 , -0.26643223, -0.05799798, -0.02493971, -0.15594769,
        0.19469912,  0.15700436, -0.05459278, -0.2718977 ,  0.03329977,
        0.07524724,  0.14381206, -0.10323048,  0.02478808, -0.2781459 ,
        0.17792554,  0.10522729,  0.20124882, -0.27476576,  0.31

In [100]:
X_new=np.asarray(X, dtype="object")

In [101]:
X_new[0].shape

(100,)

In [102]:
X_new.shape

(5564,)