In [1]:
# Importing Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
# Load the tab-separated dataset; the file has no header row, so the two
# columns are named explicitly: the class label and the raw message text.
message = pd.read_csv(
    'spams',
    sep='\t',
    names=["label", "message"],
)

# By using Stemming

In [4]:
# Porter stemmer instance; the preprocessing loop below calls ps.stem() on every token.
ps = PorterStemmer()

In [5]:
# Accumulator for the cleaned, stemmed messages (one string per message);
# it is filled by the preprocessing loop in the next cell.
corpus = list()
# Make sure the NLTK stop-word lists are available locally. Kept as the
# cell's last expression so the download status is displayed.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Build the stemmed corpus: one cleaned, space-joined string per message.

# Build the stop-word set once. The original rebuilt it for every single
# token, which made this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append a
# second copy of every message to the corpus.
corpus = []
for raw_text in message['message']:
    # Replace everything except ASCII letters with spaces, lower-case the
    # text, and split it into whitespace-separated tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))


In [7]:
# Bag-of-words vectorizer with the vocabulary capped at 5000 terms.
cv = CountVectorizer(
    max_features=5000,
)

In [8]:
# Learn the vocabulary from the stemmed corpus and count term occurrences
# per message, then expand the sparse result into a dense array.
stem_counts_sparse = cv.fit_transform(corpus)
X_stem = stem_counts_sparse.toarray()

In [9]:
# One-hot encode the label column and keep only the second indicator column
# as the target vector.
# NOTE(review): which label ends up in column index 1 depends on the label
# values in the file — confirm it is the spam class.
y_stem = pd.get_dummies(message['label']).iloc[:, 1].values

In [10]:
# Hold out 20% of the stemmed data for evaluation; the fixed seed keeps the
# split reproducible across runs.
(
    X_stem_train,
    X_stem_test,
    y_stem_train,
    y_stem_test,
) = train_test_split(
    X_stem,
    y_stem,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Multinomial Naive Bayes classifier trained on the stemmed bag-of-words features.
spam_detect_model_stem = MultinomialNB()
spam_detect_model_stem.fit(X_stem_train, y_stem_train)

In [12]:
# Predict the held-out stemmed samples and tabulate true vs. predicted labels.
y_stem_pred = spam_detect_model_stem.predict(X_stem_test)
cfmtrx_stem = confusion_matrix(
    y_true=y_stem_test,
    y_pred=y_stem_pred,
)

In [13]:
# Display the confusion matrix of the stemming pipeline.
cfmtrx_stem

array([[946,   9],
       [  8, 152]])

In [14]:
# Fraction of held-out messages the stemming model classified correctly.
acc_stem = accuracy_score(
    y_true=y_stem_test,
    y_pred=y_stem_pred,
)
acc_stem

0.9847533632286996

# By using Lemmatization

In [15]:
# WordNet lemmatizer instance; the preprocessing loop below calls wl.lemmatize() on every token.
wl = WordNetLemmatizer()

In [16]:
# Accumulator for the cleaned, lemmatized messages (one string per message);
# it is filled by the preprocessing loop in the next cell.
corpus2 = list()
# Make sure the WordNet data used by the lemmatizer is available locally.
# Kept as the cell's last expression so the download status is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
# Build the lemmatized corpus: one cleaned, space-joined string per message.

# Build the stop-word set once. The original rebuilt it for every single
# token, which made this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append a
# second copy of every message to the corpus.
corpus2 = []
for raw_text in message['message']:
    # Replace everything except ASCII letters with spaces, lower-case the
    # text, and split it into whitespace-separated tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words, then map each remaining token to its WordNet lemma.
    lemmas = [wl.lemmatize(token) for token in tokens if token not in english_stopwords]
    corpus2.append(' '.join(lemmas))


In [18]:
# TF-IDF vectorizer with default settings.
# NOTE(review): no cell visible in this notebook calls `tfidf`; the lemmatized
# features below are produced with a count vectorizer instead — confirm
# whether TF-IDF weighting was intended.
tfidf = TfidfVectorizer()

In [19]:
# Vectorize the lemmatized corpus with its own CountVectorizer instead of
# re-fitting `cv`. Re-fitting `cv` would replace the vocabulary that the
# stemming model's features were built from, so `cv` could no longer be used
# to vectorize new text for that model.
# The settings match `cv`, so the resulting feature matrix is the same as before.
# NOTE(review): the `tfidf` vectorizer created in the previous cell is not
# used here — confirm whether TF-IDF features were intended for this pipeline.
cv_lemm = CountVectorizer(max_features=5000)
X_lemm = cv_lemm.fit_transform(corpus2).toarray()

In [20]:
# One-hot encode the label column and keep only the second indicator column
# as the target vector for the lemmatization pipeline.
# NOTE(review): which label ends up in column index 1 depends on the label
# values in the file — confirm it is the spam class.
y_lemm = pd.get_dummies(message['label']).iloc[:, 1].values

In [21]:
# Hold out 20% of the lemmatized data for evaluation; the fixed seed keeps
# the split reproducible across runs.
(
    X_lemm_train,
    X_lemm_test,
    y_lemm_train,
    y_lemm_test,
) = train_test_split(
    X_lemm,
    y_lemm,
    test_size=0.2,
    random_state=0,
)


In [22]:
# Multinomial Naive Bayes classifier trained on the lemmatized bag-of-words features.
spam_detect_model_lemm = MultinomialNB()
spam_detect_model_lemm.fit(X_lemm_train, y_lemm_train)


In [23]:
# Predict labels for the held-out lemmatized samples.
y_lemm_pred = spam_detect_model_lemm.predict(X_lemm_test)


In [24]:
# Tabulate true vs. predicted labels for the lemmatization pipeline.
cfmtrx_lemm = confusion_matrix(
    y_true=y_lemm_test,
    y_pred=y_lemm_pred,
)


In [25]:
# Display the confusion matrix of the lemmatization pipeline.
cfmtrx_lemm

array([[944,  11],
       [  9, 151]])

In [26]:
# Fraction of held-out messages the lemmatization model classified correctly.
# NOTE(review): the name has one more "c" than its counterpart `acc_stem`; it
# is kept as-is because the following cell displays it under this name.
accc_lemm = accuracy_score(
    y_true=y_lemm_test,
    y_pred=y_lemm_pred,
)

In [27]:
# Display the accuracy of the lemmatization pipeline.
accc_lemm

0.9820627802690582

In [28]:
# Re-display the stemming accuracy so it can be compared with the lemmatization accuracy.
acc_stem

0.9847533632286996