# **SMS Spam Classification:**

### NLP Final Project:

In [116]:
#Loading the libraries
import pandas as pd
import numpy as np

In [117]:
#Reading the csv file
messages = pd.read_csv('Dataset.csv',names = ['Category','Message'])

#Specifying the names of the columns while reading csv file (tsv--tab separated values)

In [118]:
messages.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [119]:
#Info about the data
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [120]:
#Finding missing values
messages.isnull().sum()

Category    0
Message     0
dtype: int64

In [121]:

#Shape of the dataframe
messages.shape

(5572, 2)

In [122]:
#Target variables counts
messages['Category'].value_counts()

#Data is imbalanced but for now we will continue with this

ham     4825
spam     747
Name: Category, dtype: int64

In [123]:
#Calculating length of message
mes_len=0
length=[]
for i in range(len(messages)):
    length.append(len(messages['Message'][i]))

In [124]:
#Adding Length column to the dataframe
messages['Length']=length

In [125]:
messages.head()

Unnamed: 0,Category,Message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [126]:
#Calculating Punctuations in each message

import string
count=0
sent=""
punct=[]
for i in range(len(messages)):
    for j in messages['Message'][i]:
        if j in string.punctuation:
            count+=1
    sent+=j
    #print(count)
    punct.append(count)
if(count!=0):
  print("Punctuations are present")
  

Punctuations are present


In [127]:
#Adding punctuation length column to dataframe
messages["Punctuation"]=punct

In [128]:
#Regex
import re

#Stopwords
import nltk
nltk.download('stopwords')


nltk.download('wordnet')

nltk.download('omw-1.4')

#Lemmatization
from nltk.stem import WordNetLemmatizer
#Creating object for Lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [129]:
from nltk.corpus import stopwords
#Removal of extra characters and stop words and lemmatization
corpus = []
corpus_exp = []

#Skipping the 0th index (it's of Category)
for i in range(0,len(messages)):

    words = re.sub('[^a-zA-Z]',' ',messages['Message'][i])

    words = words.lower()
    #Splits into list of words 
    words = words.split()


    #Lemmatizing the word and removing the stopwords
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    #print(words[0])
    #Again join words to form sentences
    words = ' '.join(words)
    
    corpus.append(words)

In [130]:
#What's in Corpus
corpus[0]


'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [131]:
#Replacing Original Message with the Transformed Messages
messages['Message'] = corpus

In [132]:
# calculating punctuations after cleaning the data
import string
count=0
punct=[]
for i in range(len(messages)):
    for j in messages['Message'][i]:
        if j in string.punctuation:
            count+=1
if(count==0):
  print("No punctuations")

No punctuations


### Analysing difference between ham and spam

In [133]:
spam_messages = messages[messages['Category'] == 'spam']
ham_messages = messages[messages['Category'] == 'ham']

In [134]:
spam_messages.head()

Unnamed: 0,Category,Message,Length,Punctuation
2,spam,free entry wkly comp win fa cup final tkts st ...,155,21
5,spam,freemsg hey darling week word back like fun st...,147,37
8,spam,winner valued network customer selected receiv...,157,51
9,spam,mobile month u r entitled update latest colour...,154,53
11,spam,six chance win cash pound txt csh send cost p ...,136,67


In [135]:
ham_messages.head()

Unnamed: 0,Category,Message,Length,Punctuation
0,ham,go jurong point crazy available bugis n great ...,111,9
1,ham,ok lar joking wif u oni,29,15
3,ham,u dun say early hor u c already say,49,27
4,ham,nah think go usf life around though,61,29
6,ham,even brother like speak treat like aid patent,77,39


In [136]:
spam_messages['Length'].mean()

137.9892904953146

In [137]:
ham_messages['Length'].mean()

71.44829015544042

We can see that spam has more words than ham

In [138]:
spam_messages['Punctuation'].mean()

11330.265060240963

In [139]:
ham_messages['Punctuation'].mean()

11568.445803108809

spam messages has more punctuations than ham message as you can see in above reuslts.

### **Model Building**

In [140]:
X = messages['Message']

In [141]:
X.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts st ...
3                  u dun say early hor u c already say
4                  nah think go usf life around though
Name: Message, dtype: object

In [142]:
y = messages['Category']

In [143]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

### Train Test Split

In [144]:
from sklearn.model_selection import train_test_split

In [145]:
X_train , X_test , y_train , y_test = train_test_split(X , y, test_size = 0.33, random_state = 42)

In [146]:
X_train.head()

3235                                            yup comin
945     sent score sophas secondary application school...
5319                              kothi print marandratha
5528                             effect irritation ignore
247                                         asked call ok
Name: Message, dtype: object

### Demonstration of Count Vectorizer

(Bag of Words)

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

In [148]:
count_vect=CountVectorizer()

In [149]:
X_train_count_vect=count_vect.fit_transform(X_train).toarray()

In [150]:
X_train_count_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [151]:
# 3733 are the sentences and 5772 are the words in total sentences
X_train_count_vect.shape

(3733, 5772)

**Note:-**<br>
There might be that, some words in 5772 words are not frequently present and are just appearing 1-2 times, we can reduce them using cv = CountVectorizer(max_features = 4000) (an approach)

This will only take 4000 words leading to coming of most frequent words

    We can change the max_features, according to what we want

### Demonstration of TF-IDF Vectorizer

(Term Frequency - Inverse Document Frequency)


CountVectorizer(Bag of Words) + TFIDF Transformer, Scikit-Learn has provided with a method of TFIDF vectorizer (combining two steps into one)

In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [153]:
tfidf=TfidfVectorizer()

In [154]:
X_train_tfidf_vect=count_vect.fit_transform(X_train).toarray()

In [155]:
X_train_tfidf_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [156]:
X_train_tfidf_vect.shape

(3733, 5772)

## Pipelining

We are doing pipelining as we need to perform the same procedures for the test data to get predictions, that may be tiresome.

However what convenient about this pipeline object is that it actually can perform all these steps for you in a single cell, that means you can directly provide the data and it will be both vectorized and run the classifier on it in a single step.

Pipeline takes list of tuple.

In [157]:
from sklearn.pipeline import Pipeline

### Naive Bayer Classifier

In [158]:
from sklearn.naive_bayes import MultinomialNB

In [159]:
#each tuple takes the name you decide , next you call what you want to occur
text_mnb=Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [160]:
#Now u can directly pass the X_train dataset.
text_mnb.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [161]:
X_test.head()

3245    squeeeeeze christmas hug u lik frndshp den hug...
944     also sorta blown couple time recently id rathe...
1044    mmm thats better got roast b better drink good...
2484                  mm kanji dont eat anything heavy ok
812     ring come guy costume gift future yowifes hint...
Name: Message, dtype: object

In [162]:
#It will take the X_test and do all the steps, vectorize it and predict it
y_preds_mnb=text_mnb.predict(X_test)

In [163]:
#Predictions of the test data
y_preds_mnb

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [164]:
#Training score
text_mnb.score(X_train,y_train)

0.975354942405572

In [165]:
#Testing score
text_mnb.score(X_test,y_test)

0.9690048939641109

**Evaluation Metrics**

In [166]:
from sklearn.metrics import confusion_matrix

In [167]:
print(confusion_matrix(y_test,y_preds_mnb))

[[1592    1]
 [  56  190]]


In [168]:
from sklearn.metrics import classification_report

In [169]:
print(classification_report(y_test,y_preds_mnb))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1593
        spam       0.99      0.77      0.87       246

    accuracy                           0.97      1839
   macro avg       0.98      0.89      0.93      1839
weighted avg       0.97      0.97      0.97      1839



# SVM Classifier

In [170]:
from sklearn.svm import LinearSVC

In [171]:
#each tuple takes the name you decide , next you call what you want to occur
text_svm=Pipeline([('tfidf',TfidfVectorizer()),('svm',LinearSVC())])

In [172]:
#Now u can directly pass the X_train dataset.
text_svm.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svm', LinearSVC())])

In [173]:
X_test.head()

3245    squeeeeeze christmas hug u lik frndshp den hug...
944     also sorta blown couple time recently id rathe...
1044    mmm thats better got roast b better drink good...
2484                  mm kanji dont eat anything heavy ok
812     ring come guy costume gift future yowifes hint...
Name: Message, dtype: object

In [174]:
#It will take the X_test and do all the steps, vectorize it and predict it
y_preds_svm=text_svm.predict(X_test)

In [175]:
#Predictions of the test data
y_preds_svm

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [176]:
#Training score
text_svm.score(X_train,y_train)

1.0

In [None]:
#Testing score
text_svm.score(X_test,y_test)

In [178]:
from sklearn.metrics import confusion_matrix

In [179]:
print(confusion_matrix(y_test,y_preds_svm))

[[1589    4]
 [  20  226]]


In [180]:
from sklearn.metrics import classification_report

In [181]:
print(classification_report(y_test,y_preds_svm))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.98      0.92      0.95       246

    accuracy                           0.99      1839
   macro avg       0.99      0.96      0.97      1839
weighted avg       0.99      0.99      0.99      1839



## Predicting on the message 

In [182]:
text = ' Congratulations, you have won a lottery of $5000. To Won Text on,555500 '

In [183]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [184]:
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'[[0-9]*]', ' ', text)
article_text = re.sub(r's+', ' ', text)

In [185]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
formatted_article_text = re.sub(r's+', ' ', formatted_article_text)

In [186]:
# Tokenizing the sentences
sentence_list = nltk.sent_tokenize(article_text)

In [187]:
#removing stopwords from the text.
stopwords = nltk.corpus.stopwords.words('english')
word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

To find the weighted frequency, divide the frequency of the word by the frequency of the most occurring word.

In [188]:
#below is to find weights frequency
maximum_frequncy = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [189]:
#below is to get score for each sentence
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [190]:
#Here the heapq library has been used to pick the top 7 sentences to summarize the article.
import heapq
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
summary = ' '.join(summary_sentences)
print(summary)

 Congratulation , you have won a lottery of $5000.


In [192]:
#def to convert string to list
def Convert(summary):
    li = list(summary.split("-"))
    return li

In [193]:
ref=Convert(summary)
print(ref)

[' Congratulation , you have won a lottery of $5000.']


In [194]:
# Directly predicting on the summary of the message
type(ref)
text_mnb.predict(ref)

array(['spam'], dtype='<U4')

As you can see that the given text is predicted as Spam.