In [None]:
from google.colab import files
files.upload()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
df = pd.read_csv("SMSSpamCollection.txt", sep = '\t', header = None, names = ['label', 'message'])
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.message = df.message.str.lower()

In [32]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [6]:
for i in range(len(df.message)):
  df.message[i] = word_tokenize(df.message[i])

In [34]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [8]:
for i in range(len(df.message)):
  df.message[i] = [word for word in df.message[i] if not word in stopwords.words("english")]

In [35]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [10]:
for i in range(len(df.message)):
  df.message[i] = [word for word in df.message[i] if not word in string.punctuation]

In [36]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [12]:
lemmatizer = WordNetLemmatizer()

for i in range(len(df.message)):
  df.message[i] = [lemmatizer.lemmatize(word) for word in df.message[i]]

In [38]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [14]:
for i in range(len(df.message)):
  df.message[i] = " ".join(str(word) for word in df.message[i])

In [39]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy.. available bugis n grea...
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [16]:
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size = 0.2, random_state = 1)

In [45]:
print(x_train[0])

  (0, 4236)	0.2876189868729611
  (0, 6494)	0.3212002072917597
  (0, 2673)	0.2596495237226787
  (0, 3745)	0.3351376730371095
  (0, 3006)	0.3824572730845757
  (0, 3716)	0.42194386854815547
  (0, 5250)	0.5050939154933757
  (0, 3253)	0.23439401178071376


In [46]:
print(y_train.value_counts())
print('\n', y_test.value_counts())

ham     3857
spam     600
Name: label, dtype: int64

 ham     968
spam    147
Name: label, dtype: int64


In [19]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [49]:
feature_names = vectorizer.get_feature_names()
print(feature_names[1000:1020])

['apologise', 'apologize', 'apology', 'app', 'apparently', 'appeal', 'appear', 'appendix', 'applausestore', 'applebees', 'apples', 'application', 'apply', 'applying', 'appointment', 'appreciate', 'appropriate', 'approve', 'approved', 'approx']


In [21]:
print(len(feature_names))

7237


In [22]:
tf_idf_df = pd.DataFrame(x_train.T.todense(), index = feature_names)
print(tf_idf_df[2000:2010])

            0     1     2     3     4     ...  4452  4453  4454  4455  4456
cricketer    0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
crickiting   0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
cried        0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
crisis       0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
cro1327      0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
crore        0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
cross        0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
crossing     0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
crowd        0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0
croydon      0.0   0.0   0.0   0.0   0.0  ...   0.0   0.0   0.0   0.0   0.0

[10 rows x 4457 columns]


In [23]:
naive_bayes = MultinomialNB()
naive_bayes.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
y_predict = naive_bayes.predict(x_test)

In [25]:
accuracy = metrics.accuracy_score(y_test, y_predict)
print("accuracy= ",accuracy)

accuracy=  0.9775784753363229


In [26]:
print(metrics.classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       968
        spam       1.00      0.83      0.91       147

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [30]:
print(y_test)

1078     ham
4028     ham
958      ham
4642     ham
4674     ham
        ... 
324      ham
1163    spam
86       ham
4214     ham
90       ham
Name: label, Length: 1115, dtype: object


In [31]:
print('Confusion Matrix: ')
print(metrics.confusion_matrix(y_test, y_predict))

Confusion Matrix: 
[[968   0]
 [ 25 122]]
