# Assignment 4


In [13]:
import chardet
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt


---

- Checking encoding of dataset


In [2]:
with open('./spam.csv', 'rb') as raw_data:
    result = chardet.detect(raw_data.read(100000))
result


{'encoding': 'Windows-1252', 'confidence': 0.7272080023536335, 'language': ''}

---

- Reading dataset with specified encoding


In [3]:
sms_df = pd.read_csv('./spam.csv', names=['Class', 'Text'], header=None, encoding='Windows-1252', sep="/t", engine='python')
sms_df.drop(0, inplace=True)  # Removing first header row


---

- Splitting column by , to segregate class and text of SMS


In [4]:
temp_df = sms_df['Class'].str.split(',', n=1, expand=True)
sms_df['Class'] = temp_df[0]
sms_df['Text'] = temp_df[1]


---

- Applying label encoding to convert 'SPAM'-'HAM' into binary


In [46]:
def convert(a):
    return 0 if (a == "spam") else 1


sms_df['Class'] = sms_df.apply(lambda row: convert(row[0]), axis=1)
np.where(sms_df['Class'] != 1 | 0)  # confirming is encoding is done properly


(array([], dtype=int64),)

---

- Extracting feature from text using CountVectorizer


In [70]:
# Initializing vectorizer object, stop_words parameter in the Counter Vectorizer function to remove the common words
vectorizer = CountVectorizer(stop_words='english')
# Parsing the training data text into Sparse Matrix of unique strings
sms_words_vectors = vectorizer.fit_transform(sms_df['Text'])
# print("Vocabulary: ", vectorizer.vocabulary_) # Printing the identified Unique words along with their indices
# in each message, each word has been labelled a number, and the count of each labelled word is also provided
print(sms_words_vectors[:2])


  (0, 4260)	1
  (0, 5788)	1
  (0, 2283)	1
  (0, 1278)	1
  (0, 1714)	1
  (0, 3561)	1
  (0, 8296)	1
  (0, 4386)	1
  (0, 1712)	1
  (0, 2006)	1
  (0, 3521)	1
  (0, 1058)	1
  (0, 8095)	1
  (1, 5388)	1
  (1, 4422)	1
  (1, 4229)	1
  (1, 8203)	1
  (1, 5414)	1


---

- Splitting dataset into Train-Test


In [71]:
x_train, x_test, y_train, y_test = train_test_split(sms_words_vectors, sms_df['Class'], test_size=0.33, random_state=42)


---

- Applying Multinomial Naive Bayes on the vectors


In [73]:
model = MultinomialNB().fit(x_train, y_train)


---

- Evaluating the model accuracy


In [87]:
model.score(x_test, y_test)


1.0

---

- Using the model to identify given email text


In [86]:
emails = ['Congrats!!! You got a spam O.O', 'Hi Naive Bayes sir! ;)']
# transforming the plain text into vector so that the model can test it numerically
vectorized_emails = vectorizer.transform(emails)

model_predicted = model.predict(vectorized_emails)

for i in range(len(model_predicted)):
    print('[ {} ] => [{}]'.format(emails[i], 'SPAM' if (
        model_predicted[i] == 1) else 'HAM'))


[ Congrats!!! You got a spam O.O ] => [SPAM]
[ Hi Naive Bayes sir! ;) ] => [SPAM]
