In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

In [2]:
# Load the SMS corpus with read_table's default tab separator. Because names= is given,
# pandas treats the file as header-less: the first line is data and the two columns
# are labelled Target / Text.
# NOTE(review): the relative path assumes the file sits in the notebook's working directory - confirm.
# NOTE(review): default quote handling is active; messages containing unbalanced double
# quotes could be merged across lines - TODO verify the row count against the dataset's documented size.
df=pd.read_table("SMSSpamCollection",names=["Target","Text"])

In [3]:
df

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Percentage of missing values per column (count of nulls over row count, times 100).
missing_counts=df.isnull().sum()
missing_counts/len(df)*100

Target    0.0
Text      0.0
dtype: float64

# TEXT CLEANING.

In [5]:
# Encode the string labels as integers: spam -> 1, ham -> 0.
map_dict={"spam":1,"ham":0}
# Series.map turns every value that is not a key of map_dict into NaN, so running the
# mapping a second time on the already-encoded column would wipe all labels.
# Only encode while the column is not yet fully numeric, and fail loudly on any
# label that map_dict does not know instead of silently producing NaN.
if not df["Target"].isin(list(map_dict.values())).all():
    encoded_target=df["Target"].map(map_dict)
    if encoded_target.isna().any():
        unknown_labels=df.loc[encoded_target.isna(),"Target"].unique().tolist()
        raise ValueError(f"Unexpected Target labels: {unknown_labels}")
    df["Target"]=encoded_target.astype("int64")

In [6]:
df.head(4)

Unnamed: 0,Target,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...


# KEEPING ONLY THOSE WORDS WITH LENGTH GREATER THAN 2.

In [7]:
# Split each message on single spaces and keep only the tokens longer than two characters.
df["Text"]=df['Text'].apply(
    lambda message:" ".join(token for token in message.split(" ") if len(token)>2)
)

# STEP1:- CLEANING HTML STRINGS FROM DATA

In [8]:
import re
def clean_html(x):
    """Return ``x`` with every ``<...>`` span removed.

    The match is non-greedy, and ``.`` does not cross newlines, so only spans
    that open and close on the same line are stripped.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', x)

In [9]:
# Strip <...> spans from every message, overwriting the Text column.
df["Text"]=df["Text"].apply(clean_html)

# STEP2:- CONVERTING DATA TO LOWER CASE

In [10]:
def convert_lower(X):
    """Return the lower-cased form of ``X`` (delegates to ``X.lower()``)."""
    lowered = X.lower()
    return lowered

In [11]:
# Lower-case every message, overwriting the Text column.
df["Text"]=df["Text"].apply(convert_lower)

In [12]:
#step3:- Function to remove special characters.

In [13]:
def remove_special(y):
    """Replace every non-alphanumeric character of ``y`` with a single space.

    Alphanumeric characters (as decided by ``str.isalnum``, so accented letters
    and digits count) are kept unchanged. The result always has the same
    length as the input; runs of special characters become runs of spaces.

    Parameters
    ----------
    y : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Build the result in one join instead of growing a string one character at a time.
    return "".join(ch if ch.isalnum() else " " for ch in y)

In [14]:
# Replace non-alphanumeric characters with spaces in every message, overwriting the Text column.
df["Text"]=df["Text"].apply(remove_special)

# STEP4:- REMOVE THE STOP WORDS

In [15]:
import nltk
# Fetch the NLTK resources this notebook relies on:
#   "stopwords" - read by remove_stopwords via stopwords.words("english")
#   "wordnet"   - required by WordNetLemmatizer; without it lemmatizing raises LookupError
#   "punkt"     - kept from the original cell; no visible cell tokenizes with it
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
from nltk.corpus import stopwords

In [17]:
type(df["Text"][0])

str

In [18]:
# Cache for the NLTK stop-word list so the corpus file is read only once per kernel.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop every token that is a stop word.

    Parameters
    ----------
    text : str
        Message to tokenize; it is split with ``str.split()``.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        loaded once, stored as a frozenset and reused on later calls.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    if stop_words is None:
        # Loading the list once and using set membership avoids re-reading
        # the corpus and scanning a list for every single token.
        if "english" not in _STOPWORD_CACHE:
            _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
        stop_words = _STOPWORD_CACHE["english"]
    return [token for token in text.split() if token not in stop_words]

In [19]:
# Text changes type here: each message string becomes a list of its non-stop-word tokens.
# Re-running this cell without reloading df fails, because remove_stopwords calls .split() on its input.
df["Text"]=df["Text"].apply(remove_stopwords)

In [20]:
df

Unnamed: 0,Target,Text
0,0,"[jurong, point, crazy, available, bugis, great..."
1,0,"[lar, joking, wif, oni]"
2,1,"[free, entry, wkly, comp, win, cup, final, tkt..."
3,0,"[dun, say, early, hor, already, say]"
4,0,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,1,"[2nd, time, tried, contact, 750, pound, prize,..."
5568,0,"[going, esplanade, home]"
5569,0,"[pity, mood, suggestions]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


# STEP 5:- PERFORM LEMMATIZING.

In [21]:
#from nltk.stem import PorterStemmer
#ps=PorterStemmer()

In [22]:
#def stem_words(text):
#    y=[]
#    for i in text:
#        y.append(ps.stem(i))
#    z=y[:]
#    y.clear()
#    return z

In [23]:
#df["Text"]=df["Text"].apply(stem_words)

In [24]:
#df

In [25]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, referenced as a global by lematize_words.
# NOTE(review): WordNetLemmatizer reads the NLTK "wordnet" corpus when lemmatizing - make sure it is installed.
lematizer=WordNetLemmatizer()

In [26]:
def lematize_words(text):
    """Return a new list holding the lemma of each item in ``text``.

    Each item is passed to the module-level ``lematizer.lemmatize`` with its
    default arguments; order and length of the input are preserved.
    """
    return [lematizer.lemmatize(token) for token in text]

In [27]:
# Lemmatize each token list item by item; Text stays a column of lists.
df["Text"]=df["Text"].apply(lematize_words)

In [28]:
df

Unnamed: 0,Target,Text
0,0,"[jurong, point, crazy, available, bugis, great..."
1,0,"[lar, joking, wif, oni]"
2,1,"[free, entry, wkly, comp, win, cup, final, tkt..."
3,0,"[dun, say, early, hor, already, say]"
4,0,"[nah, think, go, usf, life, around, though]"
...,...,...
5567,1,"[2nd, time, tried, contact, 750, pound, prize,..."
5568,0,"[going, esplanade, home]"
5569,0,"[pity, mood, suggestion]"
5570,0,"[guy, bitching, acted, like, interested, buyin..."


# JOINING BACK.

In [29]:
def join_back(list_input):
    """Concatenate the strings in ``list_input`` into one string, separated by single spaces."""
    separator = " "
    return separator.join(list_input)
    

In [30]:
# Collapse each token list back into one space-separated string for the vectorizer.
# Run once only: applied to an already-joined string, " ".join would insert a space between every character.
df["Text"]=df["Text"].apply(join_back)

In [31]:
df

Unnamed: 0,Target,Text
0,0,jurong point crazy available bugis great world...
1,0,lar joking wif oni
2,1,free entry wkly comp win cup final tkts 21st m...
3,0,dun say early hor already say
4,0,nah think go usf life around though
...,...,...
5567,1,2nd time tried contact 750 pound prize claim e...
5568,0,going esplanade home
5569,0,pity mood suggestion
5570,0,guy bitching acted like interested buying some...


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
vect=CountVectorizer()

In [34]:
# Learn the vocabulary from the cleaned messages and build the dense document-term count matrix.
# NOTE(review): the vocabulary is fit on every row of df before the train/test split further down,
# so words that occur only in test messages still shape the feature space - consider fitting on the training rows only.
# NOTE(review): .toarray() materialises the full dense matrix; presumably only GaussianNB needs
# dense input among the models fitted later - confirm before switching to the sparse matrix.
X=vect.fit_transform(df["Text"]).toarray()

In [47]:
X.shape

(5572, 7952)

In [35]:
# Target vector: the encoded spam/ham label, which is the first column of df.
y=df["Target"]

In [48]:
y.shape

(5572,)

In [36]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
# NOTE(review): no stratify= is passed, so the spam/ham ratio may differ between the two splits - consider stratify=y.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=18)

In [37]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [38]:
# One instance of each Naive Bayes variant, all with default hyper-parameters, for a side-by-side comparison.
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [39]:
# Fit all three classifiers on the same training split.
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [40]:
# GaussianNB (clf1) predictions for the training rows and the held-out rows.
y_train_predict1=clf1.predict(X_train)
y_test_predict1=clf1.predict(X_test)    

In [41]:
# MultinomialNB (clf2) predictions for the training rows and the held-out rows.
y_train_predict2=clf2.predict(X_train)
y_test_predict2=clf2.predict(X_test)    

In [42]:
# BernoulliNB (clf3) predictions for the training rows and the held-out rows.
y_train_predict3=clf3.predict(X_train)
y_test_predict3=clf3.predict(X_test)    

In [43]:
from sklearn.metrics import classification_report

In [44]:
# Per-class precision/recall/F1 for GaussianNB on the training split and on the held-out split.
# The banner previously misspelled the estimator name as "GuassionNB".
print("*****************************GaussianNB***************************\n")
print("Train Data")
print(classification_report(y_train,y_train_predict1))
print("Test Data")
print(classification_report(y_test,y_test_predict1))

*****************************GuassionNB***************************

Train Data
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      3848
           1       0.67      1.00      0.81       609

    accuracy                           0.93      4457
   macro avg       0.84      0.96      0.88      4457
weighted avg       0.96      0.93      0.94      4457

Test Data
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       977
           1       0.51      0.89      0.65       138

    accuracy                           0.88      1115
   macro avg       0.75      0.88      0.79      1115
weighted avg       0.92      0.88      0.89      1115



In [45]:
# Per-class precision/recall/F1 for MultinomialNB: training split first, then the held-out split.
print("*****************************MultinomialNB***************************\n")
for split_name,y_true,y_pred in (
    ("Train Data",y_train,y_train_predict2),
    ("Test Data",y_test,y_test_predict2),
):
    print(split_name)
    print(classification_report(y_true,y_pred))

*****************************MultinomialNB***************************

Train Data
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3848
           1       0.97      0.98      0.97       609

    accuracy                           0.99      4457
   macro avg       0.98      0.99      0.98      4457
weighted avg       0.99      0.99      0.99      4457

Test Data
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       977
           1       0.88      0.94      0.91       138

    accuracy                           0.98      1115
   macro avg       0.94      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [46]:
# Per-class precision/recall/F1 for BernoulliNB: training split first, then the held-out split.
print("*****************************BernoulliNB***************************\n")
for split_name,y_true,y_pred in (
    ("Train Data",y_train,y_train_predict3),
    ("Test Data",y_test,y_test_predict3),
):
    print(split_name)
    print(classification_report(y_true,y_pred))

*****************************BernoulliNB***************************

Train Data
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3848
           1       1.00      0.90      0.95       609

    accuracy                           0.99      4457
   macro avg       0.99      0.95      0.97      4457
weighted avg       0.99      0.99      0.99      4457

Test Data
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       977
           1       0.97      0.87      0.92       138

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

