In [47]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer , PorterStemmer
from wordcloud import WordCloud

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bibek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
df = pd.read_csv('DataSet/dataset.csv')

In [49]:

%%time
lemmatizer = WordNetLemmatizer()

df['text_lemmatized'] = df['text'].map(lambda text: ' '.join(lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text.lower())))

Wall time: 4.35 s


In [50]:
df.spam.replace(('ham', 'spam'), (0, 1), inplace=True)

In [51]:
df

Unnamed: 0,spam,text,text_lemmatized
0,0,"Go until jurong point, crazy.. Available only ...","go until jurong point , crazy.. available only..."
1,0,Ok lar... Joking wif u oni...,ok lar ... joking wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor ... u c already then sa...
4,0,"Nah I don't think he goes to usf, he lives aro...","nah i do n't think he go to usf , he life arou..."
...,...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5570,0,Will ü b going to esplanade fr home?,will ü b going to esplanade fr home ?
5571,0,"Pity, * was in mood for that. So...any other s...","pity , * wa in mood for that . so ... any othe..."
5572,0,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i '...


In [52]:

X = df["text_lemmatized"]

y = df['spam'] # 1D targer vector

In [53]:
df = df.sample(frac=1)
def split_trainTest(X,y,t):
    train_size = int((1-t) * X.shape[0])   
    return X[:train_size],X[train_size:],y[:train_size],y[train_size:]

X_train, X_test, y_train, y_test = split_trainTest(X,y,t=0.2)

In [54]:
count_vect = CountVectorizer(lowercase=True, stop_words='english',binary = False)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [55]:
model = MultinomialNB(alpha=0.1)
model.fit(X_train_counts, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [56]:
y_test_predicted = model.predict(X_test_counts)


print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

precision_test = precision_score(y_test, y_test_predicted) 
print("\nTest Precision = %f" % precision_test)

recall_test = recall_score(y_test, y_test_predicted)
print("Test Recall = %f" % recall_test)


f1_test = f1_score(y_test, y_test_predicted)
print("Test F1 Score = %f" % f1_test)

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted, target_names = ["Ham", "Spam"]))


Test Confusion Matrix:
[[961   9]
 [  7 138]]

Test Precision = 0.938776
Test Recall = 0.951724
Test F1 Score = 0.945205

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       970
        Spam       0.94      0.95      0.95       145

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

