In [None]:
import numpy as np
import pandas as pd 
import chardet
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Sniff the character encoding of the input CSV from its first 100000 bytes.
# Passing the detected encoding to read_csv() avoids decode errors caused by
# an encoding mismatch.
with open('spam.csv', 'rb') as rawdata:
    leading_bytes = rawdata.read(100000)
encode_style = chardet.detect(leading_bytes)
print(encode_style)

In [None]:
# Load the raw CSV into a DataFrame, decoding it as Windows-1252.
# NOTE(review): presumably this is the encoding reported by the chardet check - confirm.
data = pd.read_csv('spam.csv',encoding='Windows-1252')

In [None]:
# Inspect column dtypes and non-null counts.
# The last three columns (Unnamed: 2, Unnamed: 3, Unnamed: 4) are mostly null;
# this is confirmed with a separate isnull() check.
data.info()

In [None]:
# Count missing values per column. The first two columns have none, while the
# last three are almost entirely missing, so those three will be dropped.
data.isna().sum()


In [None]:
# Drop the three mostly-empty trailing columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

In [None]:
# Give the first two columns (v1 and v2) readable names.
# rename() maps by name rather than by position, so the cell can be re-run
# safely even after extra columns (e.g. Message_length) have been added;
# a positional `data.columns = [...]` assignment would then fail with a
# length mismatch.
data = data.rename(columns={'v1': 'Label', 'v2': 'Message'})
# Preview the first rows to confirm the new names.
data.head(3)

In [None]:
# Summary statistics for the columns currently in the frame.
data.describe()

In [None]:
# Add the character count of each message as a new feature column so the two
# label groups can be compared by message length.
data['Message_length'] = data['Message'].map(len)

In [None]:
# Summary statistics of message length, computed separately for each label.
length_by_label = data.groupby('Label')['Message_length']
length_by_label.describe()

The mean length of spam messages is larger than the mean length of ham messages; spam messages are usually longer than non-spam messages. To verify this, we will look at the distribution plots.

In [None]:
# One histogram of message length per label (hist(by=...) returns one axis per group).
dist_message = data['Message_length'].hist(bins=100,by=data['Label'],figsize=(10,6))
# Label every sub-plot the same way; the y-label spelling is fixed ("Frequency").
for ax in dist_message:
    ax.set_xlabel("Message Length")
    ax.set_ylabel("Frequency")

From this basic EDA it is clear that spam messages tend to be longer. In the distribution for label = ham, there is one message whose length is much greater than that of the other ham messages. It is difficult to read the actual length of this long message from the plot, but the output of our previous data.groupby('Label')['Message_length'].describe() shows that the maximum length is 910. We can now check which message in the ham group this is.



In [None]:
# Retrieve the text of the first message whose length is exactly 910 characters.
# A single .loc call selects rows and column together instead of chained indexing.
data.loc[data['Message_length'] == 910, 'Message'].iloc[0]

# Data pre-processing

In [None]:
import string
from nltk.corpus import stopwords

In [None]:
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Cache for the NLTK English stop-word list so it is loaded only once
# instead of once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def text_clean(message, stop_words=None):
    """Tokenise a message after stripping punctuation and stop words.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list, which is loaded once and cached.

    Returns
    -------
    list of str
        The remaining words in their original case and order. Runs of
        whitespace never produce empty-string tokens.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # removing all punctuation
    punc_filtered = message.translate(_PUNCT_TABLE)

    # removing all stopwords; split() (rather than split(" ")) collapses the
    # whitespace runs left behind by removed punctuation, so '' is never a token
    return [word for word in punc_filtered.split() if word.lower() not in stop_words]

As an example, we can see how text_clean() works on the messages:


In [None]:
# Illustrate text_clean() on the first few messages only; cleaning the whole
# column here is unnecessary because the vectorizer applies it again later.
data['Message'].head().apply(text_clean)

Before performing vectorization, we will divide the dataset into a training set and a test set to avoid data leakage. Once this split is done, each of the two sets (training and test) will be converted into vectors.

In [None]:
from sklearn.model_selection import train_test_split

Here, we use only the 'Message' column to perform the classification, taking 70% of the dataset as training data and the remaining 30% as the test set.

In [None]:
# 70/30 train/test split on the raw message text.
# A fixed random_state makes the split (and every score reported below)
# reproducible across kernel restarts.
x_Train,x_Test,y_Train,y_Test = train_test_split(data['Message'],data['Label'],test_size=0.3,random_state=42)

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the bag-of-words vocabulary from the training messages only,
# using text_clean as the tokeniser.
train_bow = CountVectorizer(analyzer=text_clean)
train_bow.fit(x_Train)

In [None]:
# Number of distinct words in the learned vocabulary.
vocab_size = len(train_bow.vocabulary_)
print(vocab_size)

In [None]:
# To see the entire vocabulary and the index position of each word,
# uncomment and execute the following line:
#train_bow.vocabulary_

In [None]:
# Transform all training messages into a document-term count matrix
# using the vocabulary learned by train_bow.
train_matrix = train_bow.transform(x_Train)

In [None]:
# Report the dimensions of the count matrix and how many entries are non-zero.
matrix_shape = train_matrix.shape
non_zero_count = train_matrix.nnz
print('Shape of Sparse Matrix: ', matrix_shape)
print('Amount of Non-Zero occurences: ', non_zero_count)

To assign a weight to each word of the vocabulary, we will use TF-IDF. Words that have a higher frequency across the messages are assigned a lower weight, and words that are rare and have a lower frequency are assigned a higher weight.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the training counts and re-weight those counts
# in a single step (equivalent to fit() followed by transform()).
tf_idf_train = TfidfTransformer()
messages_tf_idf_train = tf_idf_train.fit_transform(train_matrix)

To confirm that a word with higher frequency is given a lower weight than a word with lower frequency, we can compare two words from the corpus: 'want' and 'come' (the more frequent of the two).

In [None]:
# Print the learned IDF weight of each example word, in the order 'want', 'come'.
for example_word in ('want', 'come'):
    print(tf_idf_train.idf_[train_bow.vocabulary_[example_word]])

As the word 'come' is more frequent than 'want' in the training messages (on which the IDF weights were fitted), it receives a lower weight.

In [None]:
# Vectorise the test messages with the vocabulary learned on the training set.
test_matrix = train_bow.transform(x_Test)
# Re-weight with the IDF values learned on the TRAINING set. Fitting a second
# TfidfTransformer on the test data would give the test features a different
# weighting from the one the models were trained on, and would leak test-set
# statistics into the features.
messages_tf_idf_test = tf_idf_train.transform(test_matrix)

Here two scikit-learn models will be used: Naive Bayes and KNN. The accuracy of these models will be compared


# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
nb = MultinomialNB()
nb.fit(X=messages_tf_idf_train, y=y_Train)

In [None]:
# Predict labels for the TF-IDF test features with the fitted Naive Bayes model.
y_pred = nb.predict(messages_tf_idf_test)

In [None]:
from sklearn.metrics import classification_report,accuracy_score

# Per-class precision/recall/F1 followed by overall accuracy (3 decimals)
# of the Naive Bayes predictions on the test set.
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score of Naive Bayes model is: ", round(accuracy_score(y_Test, y_pred), 3), sep="\n")

# KNN:

GridSearchCV can be used to identify the best value of K for the KNN model. Instead of using GridSearchCV, we can also estimate the best value of K in the following way:

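Assuming that the best value of K lies between 1 and 39, we create a KNN instance for each K and measure the prediction error it makes on the test set. Note that, because K is chosen using the test set, the reported test accuracy for the selected K is an optimistic estimate.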



In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Misclassification rate on the test set for every k in 1..39.
error_rate = []
for i in range(1,40):
    # fit() returns the estimator itself, so construction and fitting can be chained
    knn = KNeighborsClassifier(n_neighbors=i).fit(messages_tf_idf_train,y_Train)
    y_pred_elbow = knn.predict(messages_tf_idf_test)
    # fraction of test messages whose predicted label differs from the true one
    error = (y_Test != y_pred_elbow).mean()
    error_rate.append(error)

In [None]:
# Plot error rate against k. The x values are derived from the number of
# recorded errors (k starts at 1), so the plot stays aligned with error_rate
# even if the range of k tried above is changed.
k_values = range(1, len(error_rate) + 1)
plt.figure(figsize=(10,6))
plt.plot(k_values,error_rate,linestyle='--',marker='o',markersize=8,markerfacecolor='red')
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

From the above plot, it is clear that the error rate increases after K=5. The minimum error that we can get in this task is at K=3 or K=5; the error rate for K=4 is higher than for K=3 or K=5. We will check the performance for these K values.

In [None]:
# k = 3: fit, predict on the test set, then report per-class metrics and accuracy.
knn_3 = KNeighborsClassifier(n_neighbors=3).fit(messages_tf_idf_train, y_Train)
y_pred = knn_3.predict(messages_tf_idf_test)
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score of KNN(K=3): ", round(accuracy_score(y_Test, y_pred), 3), sep="\n")

In [None]:
# k = 4: fit, predict on the test set, then report per-class metrics and accuracy.
knn_4 = KNeighborsClassifier(n_neighbors=4).fit(messages_tf_idf_train, y_Train)
y_pred = knn_4.predict(messages_tf_idf_test)
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score of KNN(K=4): ", round(accuracy_score(y_Test, y_pred), 3), sep="\n")

In [None]:
# k = 5: fit, predict on the test set, then report per-class metrics and accuracy.
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_5.fit(messages_tf_idf_train,y_Train)
y_pred = knn_5.predict(messages_tf_idf_test)
print("Classification report is: ")
print(classification_report(y_Test,y_pred))
# The label previously said K=4 although this cell evaluates the K=5 model.
print("Accuracy Score of KNN(K=5): ")
print(round(accuracy_score(y_Test,y_pred),3))

The accuracy score for K=4 is lower than for K=3 or K=5 (as is already visible in the error rate vs. K plot). Between K=3 and K=5, K=3 is the better choice in terms of model accuracy.