In [3]:
import pandas as pd

#Loading spam collection data to pandas dataframe
dataset = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None)
dataset

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Adding headers to this df
dataset.columns = ["label", "message"]
dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Data Cleaning and text preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apm1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Stopwords in english
from nltk.corpus import stopwords
stopwords_nltk = stopwords.words('english')
stopwords_nltk

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [7]:
# Convert the list to a set for set operations
stopwords_nltk = set(stopwords_nltk)
#Remove required words from stopwords
stopwords = stopwords_nltk - {'no','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"}
#Convert back to list
stopwords = list(stopwords)

In [8]:
#Import PorterStemmer
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

In [9]:
# Cleaning 'message' in dataset using regular expressions and stemming
corpus=[]
for i in range(len(dataset)):
    review=re.sub('[^a-zA-Z0-9]',' ',dataset["message"][i])
    review=review.lower()
    review=review.split()
    review=[stemmer.stem(word) for word in review if not word in stopwords]
    review=' '.join(review)
    corpus.append(review)

In [10]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week no word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent won 1 week f

In [11]:
# Train Test Split

X=corpus #independent feature
y=pd.get_dummies(dataset['label'], drop_first=True, dtype=int) #dependent feature (We need to see only spam)

In [12]:
y

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [13]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20, random_state=0)

In [14]:
# Applying Bag Of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X_train=cv.fit_transform(X_train).toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 6488))

In [15]:
X_test=cv.transform(X_test).toarray()
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(1115, 6488))

In [16]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4457, 6488)
(4457, 1)
(1115, 6488)
(1115, 1)


In [17]:
# So from this vocabulary size is 6488 which is very huge.
# So inorder to restrict the vocabulary size, we can use max_features
# for eg: max_features = 2500 means it will keep only 2500 most frequent occuring vocabularies

In [18]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20, random_state=0)

# Applying Bag Of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500) # Added max_features to CountVectorizer
X_train=cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [19]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4457, 2500)
(4457, 1)
(1115, 2500)
(1115, 1)


In [20]:
cv.vocabulary_

{'no': np.int64(1544),
 'good': np.int64(1032),
 'movi': np.int64(1469),
 'ok': np.int64(1589),
 'leav': np.int64(1279),
 'free': np.int64(971),
 'give': np.int64(1020),
 'otherwis': np.int64(1622),
 'nalla': np.int64(1503),
 'someth': np.int64(2020),
 'mayb': np.int64(1386),
 '30': np.int64(87),
 'bit': np.int64(409),
 'hope': np.int64(1117),
 'got': np.int64(1037),
 'home': np.int64(1113),
 'babe': np.int64(352),
 'still': np.int64(2075),
 'awak': np.int64(347),
 'sinc': np.int64(1976),
 'alreadi': np.int64(270),
 'workin': np.int64(2445),
 'get': np.int64(1013),
 'job': np.int64(1221),
 'said': np.int64(1878),
 'matter': np.int64(1380),
 'mind': np.int64(1414),
 'say': np.int64(1890),
 'oh': np.int64(1585),
 'yeah': np.int64(2476),
 'diet': np.int64(708),
 'window': np.int64(2424),
 'sorri': np.int64(2030),
 'thing': np.int64(2180),
 'may': np.int64(1385),
 'pub': np.int64(1761),
 'later': np.int64(1268),
 'ill': np.int64(1154),
 'call': np.int64(487),
 'even': np.int64(854),
 'idea

In [21]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20, random_state=0)

# Applying Bag Of Words (BOW) and NGrams
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500, ngram_range=(1,2)) # Added max_features and ngrams to CountVectorizer
X_train=cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [22]:
cv.vocabulary_

{'no': np.int64(1511),
 'good': np.int64(940),
 'movi': np.int64(1447),
 'ok': np.int64(1555),
 'leav': np.int64(1222),
 'free': np.int64(839),
 'give': np.int64(907),
 'otherwis': np.int64(1588),
 'someth': np.int64(1974),
 'mayb': np.int64(1360),
 '30': np.int64(93),
 'bit': np.int64(332),
 'hope': np.int64(1062),
 'got': np.int64(956),
 'home': np.int64(1059),
 'babe': np.int64(291),
 'still': np.int64(2023),
 'awak': np.int64(281),
 'sinc': np.int64(1935),
 'alreadi': np.int64(212),
 'workin': np.int64(2454),
 'get': np.int64(892),
 'job': np.int64(1148),
 'said': np.int64(1835),
 'matter': np.int64(1353),
 'mind': np.int64(1391),
 'say': np.int64(1850),
 'oh': np.int64(1553),
 'yeah': np.int64(2485),
 'diet': np.int64(656),
 'sorri': np.int64(1984),
 'thing': np.int64(2138),
 'may': np.int64(1359),
 'pub': np.int64(1725),
 'later': np.int64(1203),
 'ill': np.int64(1097),
 'call': np.int64(395),
 'even': np.int64(746),
 'idea': np.int64(1091),
 'dear': np.int64(631),
 'reach': np.i

In [23]:
# Applying ML Algorithm Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
rf_classifier=RandomForestClassifier().fit(X_train,y_train)

y_pred = rf_classifier.predict(X_test)

accuracy_score(y_test, y_pred)
print("Accuracy score is:")
print(accuracy_score(y_test, y_pred))

confusion_matrix(y_test, y_pred)
print("confusion matrix is:")
print(confusion_matrix(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


Accuracy score is:
0.9847533632286996
confusion matrix is:
[[955   0]
 [ 17 143]]


In [24]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.98      0.98      1115



In [25]:
# Overall accuracy is 98% and Confusion Matrix shows 1 FP (False Positive) and 17 FN (False Negative) which is negligible
# So the overall model looks very good and need not check other algorithms. 
# So lets save this Random Forest model & CountVectorizer to pickle files and deploy

import pickle
pickle.dump(rf_classifier, open('../models/spam_model.pkl', 'wb')) # Save ML Model
pickle.dump(cv, open('../cv/count_vectorizer.pkl', 'wb')) # Save Vectorizer Model (Bag of Words (BOW) processor)

In [26]:
#Load ML Model and Vectorizer Model
pickled_model=pickle.load(open('../models/spam_model.pkl', 'rb'))
vectorizer_model=pickle.load(open('../cv/count_vectorizer.pkl', 'rb'))

In [None]:
# Testing
test_word="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Text preprocessing using NLP
from sklearn.feature_extraction.text import CountVectorizer
test_word = [test_word]
print(test_word)
test_word_transformed=vectorizer_model.transform(test_word).toarray()
prediction=pickled_model.predict(test_word_transformed)
prediction="Spam" if prediction[0] else "Not Spam"
print(prediction)

["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
Spam
