## 1. Importing the dataset

In [41]:
import pandas as pd 
messages = pd.read_csv('SMSSpamCollection.txt',sep='\t',names =['label','message'])

In [42]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [43]:
messages.shape

(5572, 2)

In [44]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

### Data Cleaning and Preprocessing 

In [45]:
import re 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

Stemming

In [36]:
corpus = []
for i in range(0, len(messages)):
    # Fill NaN values in the 'message' column with an empty string
    message = messages['message'][i]
    if pd.isna(message):
        message = ''  # Replace NaN with an empty string
    # Remove non-alphanumeric characters
    review = re.sub('[^a-zA-Z0-9]', ' ', message)
    # Convert to lowercase
    review = review.lower()
    # Tokenize the text
    review = nltk.word_tokenize(review)
    # Remove stopwords and perform stemming
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    # Join the tokens back into a string
    review = ' '.join(review)
    # Append the preprocessed text to the corpus
    corpus.append(review)

In [48]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [49]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

## Bag of words 

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
X= cv.fit_transform(corpus).toarray()

In [81]:
X.shape

(5572, 2500)

In [82]:
y

array([False, False,  True, ..., False, False, False])

In [83]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values 

In [84]:
y

array([False, False,  True, ..., False, False, False])

## Train-Test Split

In [85]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Assuming you have X and y as your features and target labels
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [86]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Multinomial Naive Bayes 

In [87]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

## Prediction

In [88]:
y_pred = spam_detect_model.predict(X_test)

In [89]:
from sklearn.metrics import accuracy_score , classification_report

In [90]:
score = accuracy_score(y_test,y_pred)

In [91]:
score*100
print(f"{round(score*100,2)}%")

97.22%


In [92]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       986
        True       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



## TF-IDF

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500)
X= tv.fit_transform(corpus).toarray()

## Train Test Split

In [94]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [95]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [96]:

y_pred=spam_detect_model.predict(X_test)

## Prediction

In [97]:
from sklearn.metrics import accuracy_score , classification_report

In [98]:
score = accuracy_score(y_test,y_pred)
print(f"{round(score*100,2)}%")

98.48%


In [99]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       972
        True       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



RandomForest


In [101]:
from sklearn.ensemble import RandomForestClassifier
Rf = RandomForestClassifier()


In [103]:
Rf.fit(X_train, y_train)

In [106]:
y_pred = Rf.predict(X_test)

In [107]:
score = accuracy_score(y_test,y_pred)
print(f"{round(score*100,2)}%")

98.3%


In [108]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       972
        True       0.89      0.99      0.94       143

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Word2vec

In [109]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [110]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [111]:
vec_king = wv['king']

In [112]:
vec_king 

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [113]:

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [114]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [115]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [150]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', '645'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

In [151]:

from nltk import sent_tokenize #function is used to tokenize text into sentences, 
from gensim.utils import simple_preprocess #function is used for tokenization of text into words and performing basic preprocessing.

In [152]:

words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [119]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [153]:
import gensim

## Train the model from Scratch

In [154]:
model = gensim.models.Word2Vec(words,window=5,min_count=2) # a `min_count=10` means to only keep words with at least 10 training-examples. 

In [155]:
model.wv.index_to_key #retrieves a list of all the words (vocabulary) in the word embeddings model.

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [156]:
model.corpus_count

5564

In [157]:
model.epochs

5

In [158]:
model.wv.similar_by_word('happy') # similar words from the dataset

[('year', 0.9994856715202332),
 ('day', 0.999478280544281),
 ('make', 0.9994494915008545),
 ('hello', 0.9994196891784668),
 ('like', 0.9994103312492371),
 ('amp', 0.9993985295295715),
 ('keep', 0.9993963241577148),
 ('love', 0.9993930459022522),
 ('money', 0.9993928670883179),
 ('one', 0.9993844032287598)]

In [159]:
model.wv['kid'].shape

(100,)

## AvgWord2Vec

In [189]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [190]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [191]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

vec_king = wv['king']

In [193]:
import pandas as pd
messages=pd.read_csv('SMSSpamCollection.txt',
                    sep='\t',names=["label","message"])

In [194]:
messages.shape

(5572, 2)

In [196]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [197]:

import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [198]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [199]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [200]:

corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [201]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess


In [202]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [203]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [204]:

model.corpus_count

5564

In [205]:
model.epochs

5

In [206]:
model.wv.similar_by_word('good')

[('day', 0.9996791481971741),
 ('give', 0.9996708035469055),
 ('much', 0.9996586441993713),
 ('got', 0.9996504187583923),
 ('going', 0.9996432065963745),
 ('night', 0.9996276497840881),
 ('get', 0.999627411365509),
 ('like', 0.999626636505127),
 ('want', 0.9996259212493896),
 ('one', 0.9996163845062256)]

In [207]:

model.wv['good'].shape

(100,)

In [208]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [209]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable


In [210]:
from tqdm import tqdm

In [211]:

words[75]

['am', 'waiting', 'machan', 'call', 'me', 'once', 'you', 'free']

In [137]:
type(model.wv.index_to_key)

list

In [212]:
import numpy as np

In [219]:


#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))


100%|██████████| 5569/5569 [00:01<00:00, 4393.89it/s]


In [220]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [221]:
len(X)

5569

In [222]:
type(X)

list

In [223]:
X_new = np.array(X)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5569,) + inhomogeneous part.

In [169]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [170]:

y.shape

(5564,)

In [171]:
X[0].reshape(1,-1).shape

(1, 100)

In [172]:
## this is the final independent features
df=pd.DataFrame()
for i in range(0,len(X)):
    df=df.append(pd.DataFrame(X[i].reshape(1,-1)),ignore_index=True)

AttributeError: 'DataFrame' object has no attribute 'append'

In [173]:

df.head()

In [174]:

df['Output']=y

In [175]:
df.head()

Unnamed: 0,Output
0,True
1,True
2,False
3,True
4,True


In [176]:
df.dropna(inplace=True)

In [177]:

df.isnull().sum()

Output    0
dtype: int64

In [178]:

## Independent Feature
X=df

In [179]:

X.isnull().sum()

Output    0
dtype: int64

In [180]:

y=df['Output']

In [181]:

## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [182]:

X_train.head()

Unnamed: 0,Output
35,True
4162,True
295,False
5032,True
4646,True


In [183]:

y_train

35       True
4162     True
295     False
5032     True
4646     True
        ...  
5247     True
5164     True
4468     True
1477     True
5530     True
Name: Output, Length: 4451, dtype: bool

In [184]:

from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [185]:
classifier.fit(X_train,y_train)

In [186]:
y_pred=classifier.predict(X_test)

In [187]:

from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

1.0


In [188]:

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       145
        True       1.00      1.00      1.00       968

    accuracy                           1.00      1113
   macro avg       1.00      1.00      1.00      1113
weighted avg       1.00      1.00      1.00      1113

