In [292]:
# Import the libaries
import pandas as pd
import numpy as np
import string
import re
import nltk

In [293]:
df = pd.read_csv(r"E:\\A.I course\\Datasets\SMS spam classification - NLP\\SMSSpamCollection", sep = '\t', names =['labels','messages'])

In [294]:
df.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [295]:
df.shape

(5572, 2)

In [296]:
# Clean the dataset

In [297]:
# Apply stemming
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [298]:
ps = PorterStemmer()

In [299]:
corpus = []
for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]',' ', df['messages'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ''.join(review)
    corpus.append(review)

In [300]:
corpus

['gojurongpointcraziavailbugingreatworldlaebuffetcinegotamorwat',
 'oklarjokewifuoni',
 'freeentriwklicompwinfacupfinaltktstmaytextfareceiventriquestionstdtxtratecappli',
 'udunsayearlihorucalreadisay',
 'nahthinkgoeusflivearoundthough',
 'freemsgheydarlweekwordbacklikefunstilltbokxxxstdchgsendrcv',
 'evenbrotherlikespeaktreatlikeaidpatent',
 'perrequestmellmelloruminnaminungintnurunguvettamsetcallertuncallerpresscopifriendcallertun',
 'winnervalunetworkcustomselectreceiveaprizerewardclaimcallclaimcodeklvalidhour',
 'mobilmonthurentitlupdatlatestcolourmobilcamerafreecallmobilupdatcofree',
 'gonnahomesoonwanttalkstuffanymortonightkcrienoughtoday',
 'sixchancwincashpoundtxtcshsendcostpdaydaytsandcapplireplihlinfo',
 'urgentweekfreemembershipprizejackpottxtwordclaimcwwwdbuknetlccltdpoboxldnwrw',
 'searchrightwordthankbreatherpromiswonttakehelpgrantfulfilpromiswonderblesstime',
 'datesunday',
 'xxxmobilemovieclubusecreditclickwaplinknexttxtmessagclickhttpwapxxxmobilemovieclubcomnqjkgighjjg

In [301]:
# Bag of words

In [302]:
from sklearn.feature_extraction.text import CountVectorizer

In [303]:
cv = CountVectorizer()

In [304]:
X = cv.fit_transform(corpus).toarray()

In [305]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [306]:
y = pd.get_dummies(df['labels'])

In [307]:
y = y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [308]:
# Implement train_test_split

In [309]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33, random_state = 123)

In [310]:
# Training the model with Naive bayes classifier

In [311]:
from sklearn.naive_bayes import MultinomialNB

In [312]:
spam_detector = MultinomialNB().fit(X_train,y_train)

In [313]:
y_pred = spam_detector.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [314]:
# Check the accuracy using confusion matrix

In [315]:
from sklearn.metrics import confusion_matrix

In [316]:
confusion_m = confusion_matrix(y_test,y_pred)
confusion_m

array([[1587,    0],
       [ 249,    3]], dtype=int64)

In [317]:
from sklearn.metrics import accuracy_score

In [318]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8646003262642741

In [319]:
from sklearn.metrics import classification_report

In [320]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      1587
           1       1.00      0.01      0.02       252

   micro avg       0.86      0.86      0.86      1839
   macro avg       0.93      0.51      0.48      1839
weighted avg       0.88      0.86      0.80      1839



In [321]:
from nltk.stem import WordNetLemmatizer

In [322]:
lemma = WordNetLemmatizer()

In [323]:
new = []
for i in range(0,len(df)):
    review = re.sub('[a-zA-Z]',' ', df['messages'][i])
    review = review.lower()
    review = review.split()
    review = [lemma.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ''.join(review)
    new.append(review)

In [324]:
new

[',........',
 '......',
 "2212005.87121()&'0845281007518'",
 '......',
 "',",
 "'3'!'?!,£1.50",
 '..',
 "'()'.*9",
 '!!£900!09061701461.341.12.',
 '11?!08002986030',
 "'',?'.",
 '!10020,000>1187575.150/,6,16+4',
 '!1£100,000!::81010&..44031718',
 "'...",
 '!!',
 ':,>>://..?=',
 "...':)",
 '2.....',
 '\x92.\x92',
 '-/.8707787077:,4/ú1.20365044516+',
 '?',
 '‘2',
 'ü......',
 '.3.?',
 '.?',
 ".'...'.",
 '.',
 "???'??",
 "'&;',''",
 '..!?',
 "',''",
 "2..2!'!.?",
 '.',
 '?',
 '£5/.',
 '...ü...28',
 ",''",
 '',
 '......',
 "!'?'.'!",
 '...',
 '?,,............',
 '07732584351--=+.08000930705',
 '?',
 '!.&;#&;...',
 '....',
 "'.",
 ',?',
 ",'",
 "'.'.'..",
 '..',
 '&;#&;,&;#&;',
 "'",
 '..,"".\'\'\'.\'.\'.',
 '.:.??',
 '?@&;&;',
 '!12.09061209465!,3,3,4!420-4-5.150.!',
 ",'.",
 '',
 '...',
 "$1.'..",
 '..',
 '',
 ',?',
 '..',
 ',.£1500,09066364589',
 '".."?...',
 ',&41£1000.8712118+6*£1.50(.)10,13',
 '""?\'!',
 '.',
 '.........',
 '.',
 '?',
 '..:)??',
 '...',
 '..',
 '..',
 ':).',
 '&;#&;'

In [325]:
# perform Tf-idf

In [326]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [327]:
tvec = TfidfVectorizer()

In [328]:
X = tvec.fit_transform(new).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [329]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33, random_state = 123)

In [330]:
# Use Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [331]:
spam_detector_model = MultinomialNB().fit(X_train,y_train)

In [332]:
y_pred = spam_detector_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [333]:
# use confusion matrix

In [334]:
confusion_m = confusion_matrix(y_test,y_pred)
confusion_m

array([[1586,    1],
       [ 231,   21]], dtype=int64)

In [335]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8738444806960305

In [336]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1587
           1       0.95      0.08      0.15       252

   micro avg       0.87      0.87      0.87      1839
   macro avg       0.91      0.54      0.54      1839
weighted avg       0.88      0.87      0.83      1839

