In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# import spam dataset:

sms_spam= pd.read_csv('SMS_spam_DF/SMSSpamCollection', sep= '\t', 
                     names= ['label', 'message'])

In [3]:
sms_spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms_spam.shape

(5572, 2)

# Data Cleaning and Preprocessing

In [5]:
#  we can do stemming or lemmatization on the given dataframe column 'message'. I prefer doing Lemmatization on any dataset.
# and I'm performing data cleaning as well here.

lemma= WordNetLemmatizer()

corpus=[]

for i in range(len(sms_spam['message'])):
    review= re.sub('[^a-zA-Z0-9]',' ', sms_spam['message'][i])
    review= review.lower()
    review= review.split()
    review= [lemma.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review= ' '.join(review)
    corpus.append(review)


In [6]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

In [7]:
# Here I'm defining corpus to some variable X for easy convention purpose.

X= corpus

In [8]:
# I'm defining categorical 'label' column as y and I'm one_hot encoding it.

y= pd.get_dummies(sms_spam['label'])

In [9]:
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [10]:
# Here I'm droping one column as well as making this pandas dataframe in to an array.

y= y.iloc[:,0].values

In [11]:
y

array([1, 1, 0, ..., 1, 1, 1], dtype=uint8)

In [12]:
# Now I have X and y values and I am gonna perform train_test_split on this data before applying word embeddings to avoid data leakage issue.

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 17)

In [13]:
len(X_train), len(y_train), len(X_test), len(y_test)

(4457, 4457, 1115, 1115)

In [14]:
#let see X_train

X_train

['ok thanx',
 'urgent mobile awarded 2 000 bonus caller prize 02 09 03 2nd attempt contact call 0871 872 9755 box95qu',
 'never blame day ur life good day give u happiness bad day give u experience essential life god blessing good morning',
 'hi da today class',
 'neshanth tel r u',
 'computer fried essential part keep spare fucking idiot roommate looovvve leaving thing running full lt gt 7',
 'gas station go',
 'huh late fr dinner',
 'watching surya movie 6 pm vijay movie pokkiri',
 'yeah try scrounge something',
 'crucify c told earlier',
 'hey babe friend cancel still visit',
 'sir waiting letter',
 'omg one thing another cat worm bad day end',
 'ugh long day exhausted want cuddle take nap',
 'date sunday',
 'mind ask happened dont say uncomfortable',
 'find way include detail',
 'way ur home',
 'yeah probs last night obviously catching speak soon',
 'dating service asked 2 contact u someone shy call 09058091870 revealed pobox84 m26 3uz 150p',
 'jus finish bathing',
 '1 finish meeti

In [15]:
y_train

array([1, 0, 1, ..., 1, 1, 1], dtype=uint8)

# from here we start word embeddings (Frequence basedtechniques like BOW & TF-IDF)

# 1. Bag of Words(BOW)

In [16]:
# I'm performing BOW on X_train and X_test

from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=3000, ngram_range=(3,3), binary=True) # I'm taking top 3k features. 

In [17]:
# transform the X_train and X_test in to CV:

count_vects_train= cv.fit_transform(X_train)
count_vects_test= cv.fit_transform(X_test)

In [18]:
# lets see the shape of obtained vectors:

count_vects_train.shape

(4457, 3000)

In [19]:
count_vects_test.shape

(1115, 3000)

In [20]:
y_train.shape

(4457,)

In [21]:
y_test.shape

(1115,)

In [47]:
# lets see the vectors(BOW) of X_train and X_test:(each word in X_traina & X_test will have 3000 dims)

count_vects_train.toarray()[1].shape

(3000,)

In [23]:
count_vects_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
# lets apply the Classification Algorithm on these BOWs:(I'm applying 'MultinomialNB' Algorithm):

spam_classifier= MultinomialNB().fit(count_vects_train, y_train)

In [25]:
# model Prediction:

y_pred= spam_classifier.predict(count_vects_test)

In [26]:
# let see the accuracy and classification report

accuracy= accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [27]:
accuracy

0.8260089686098655

In [28]:
print(report)

              precision    recall  f1-score   support

           0       0.34      0.35      0.34       145
           1       0.90      0.90      0.90       970

    accuracy                           0.83      1115
   macro avg       0.62      0.62      0.62      1115
weighted avg       0.83      0.83      0.83      1115



# 2. TF-IDF

In [29]:
# performing TF-IDF on X_train and X_test:

from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf= TfidfVectorizer(max_features= 3000, ngram_range=(3,3),binary=True)

In [30]:
# lets fit and transform the model:

tf_idf_train= tf_idf.fit_transform(X_train)
tf_idf_test= tf_idf.fit_transform(X_test)

In [31]:
# lets see the shape of vectors:

tf_idf_train.shape

(4457, 3000)

In [32]:
tf_idf_test.shape

(1115, 3000)

In [33]:
# lets see the TF-IDF vectors:

tf_idf_train.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.26435088, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [34]:
tf_idf_test.toarray().shape

(1115, 3000)

In [35]:
# lets apply MultinomialNB on these TF-IDF vectors:

tf_spam_classifier= MultinomialNB().fit(tf_idf_train, y_train)

In [36]:
# prediction:

tf_y_pred= tf_spam_classifier.predict(tf_idf_test)

In [37]:
# lets see the accuracy and classification report:

tf_accuracy= accuracy_score(y_test, tf_y_pred)
tf_report= classification_report(y_test, tf_y_pred)

In [38]:
tf_accuracy  # accuracy has gone up a bit than BOW

0.8717488789237668

In [39]:
print(tf_report)

              precision    recall  f1-score   support

           0       1.00      0.01      0.03       145
           1       0.87      1.00      0.93       970

    accuracy                           0.87      1115
   macro avg       0.94      0.51      0.48      1115
weighted avg       0.89      0.87      0.81      1115

