In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn import metrics



In [None]:
# Load the tab-separated SMS spam collection into a DataFrame
# and preview its first ten rows.
data = pd.read_csv(
    "/content/smsspamcollection.tsv",
    sep="\t",
)
data.head(10)



In [None]:
# Check for missing data: count the null entries in each column.

data.isnull().sum()

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Target column: number of rows for each distinct value of 'label'.
data['label'].value_counts()

In [None]:
# Overlaid histograms of the 'length' column for ham and spam rows.
# The x-axis is logarithmic and the bin edges grow geometrically
# (1.15**k for k = 0..49), so the bars appear evenly spaced on that axis.
bins = 1.15 ** np.arange(0, 50)

fig, ax = plt.subplots()
ax.set_xscale('log')

# Attach the label to each histogram directly so the legend cannot get out
# of sync with the order in which the histograms are drawn.
for label_name in ('ham', 'spam'):
    ax.hist(
        data[data['label'] == label_name]['length'],
        bins=bins,
        alpha=0.8,
        label=label_name,
    )

# A figure should be readable on its own when the notebook is skimmed.
ax.set_xlabel('length (log scale)')
ax.set_ylabel('count')
ax.set_title("'length' distribution: ham vs. spam")
ax.legend()

plt.show()


In [None]:
# Spam messages tend to be longer than ham messages.



In [None]:
# Overlaid histograms of the 'punct' column for ham and spam rows.
# The x-axis is logarithmic and the bin edges grow geometrically
# (1.15**k for k = 0..49), so the bars appear evenly spaced on that axis.
# NOTE(review): the first bin edge is 1.15**0 == 1, so any row whose 'punct'
# value is below 1 (e.g. zero) falls outside the bins and is not drawn --
# confirm this is intended.
bins = 1.15 ** np.arange(0, 50)

fig, ax = plt.subplots()
ax.set_xscale('log')

# Attach the label to each histogram directly so the legend cannot get out
# of sync with the order in which the histograms are drawn.
for label_name in ('ham', 'spam'):
    ax.hist(
        data[data['label'] == label_name]['punct'],
        bins=bins,
        alpha=0.8,
        label=label_name,
    )

# A figure should be readable on its own when the notebook is skimmed.
ax.set_xlabel('punct (log scale)')
ax.set_ylabel('count')
ax.set_title("'punct' distribution: ham vs. spam")
ax.legend()

plt.show()

In [None]:
# Spam tends to fall in a higher range of values.
# Overall, spam messages also tend to be longer.

# Ham covers quite a large range of lengths.
#



In [None]:
# Feature matrix: the two numeric columns used as model inputs.
X = data[['length', 'punct']]

# Target vector: the class label of each row.
y = data['label']

# Preview both, separated by an empty line.
print(X.head(), "", y.head(), sep="\n")

In [None]:

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)



In [None]:
# Shape of the training features returned by train_test_split.
X_train.shape


In [None]:
# Shape of the held-out test features returned by train_test_split.
X_test.shape

In [None]:
# Shape of the training labels returned by train_test_split.
y_train.shape

In [None]:
# Shape of the held-out test labels returned by train_test_split.
y_test.shape

In [None]:
# Logistic-regression classifier using the L-BFGS solver.
logistic_model = LogisticRegression(
    solver='lbfgs',
)



In [None]:
# Train the classifier on the training split.
logistic_model.fit(
    X_train,
    y_train,
)

In [None]:
# Predict a label for every row of the held-out test features.
pred = logistic_model.predict(X_test)
pred

In [None]:
# Confusion matrix of the test labels against the current predictions.
# Keyword arguments make the (true, predicted) argument order explicit.
cm = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=pred,
)
cm

In [None]:
# Wrap the matrix in a labelled frame so it is easier to read.
# NOTE(review): the hard-coded labels assume the matrix rows/columns are
# ordered 'ham' then 'spam' -- confirm those are the only two classes.
cm = pd.DataFrame(
    data=cm,
    index=['ham', 'spam'],
    columns=['ham', 'spam'],
)
cm

In [None]:
# Text summary of the classification metrics for the current predictions.
report = metrics.classification_report(y_test, pred)

# Print rather than echo: the report is a multi-line string, and a bare
# cell echo would show its repr with literal "\n" escapes instead of a table.
print(report)

In [None]:
# Same report as a nested dict, so it can be loaded into a DataFrame.
report = metrics.classification_report(
    y_true=y_test,
    y_pred=pred,
    output_dict=True,
)
report

In [None]:
# Turn the report dict into a table and transpose it for display.
report_1 = pd.DataFrame(data=report).transpose()
report_1

In [None]:
# For ham, recall and precision are good.

# For spam, recall and precision are really low.

# Overall, the model is good at detecting ham but poor at detecting spam.

# The model is 84.27% accurate.



# Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split and
# evaluate it on the held-out test split.
navie_bayes_model = MultinomialNB()
navie_bayes_model.fit(X_train, y_train)

pred = navie_bayes_model.predict(X_test)
print(pred)

print(" ")

# Recompute the confusion matrix from THIS model's predictions. Without this
# step `cm` would still hold the matrix left over from an earlier cell, and
# the table printed below would describe a different model.
# `labels` pins the row/column order to match the index/columns used below.
cm = metrics.confusion_matrix(y_test, pred, labels=['ham', 'spam'])

print(" ")

cm = pd.DataFrame(cm, index=['ham', 'spam'], columns=['ham', 'spam'])
print(cm)

print(" ")

report = metrics.classification_report(y_test, pred)
print(report)




In [None]:
# For ham, recall and precision are good.

# For spam, recall and precision are poor:
# we are not able to identify any spam messages.

# The model is 86% accurate.


# Support vector Classifier

In [None]:
# Fit a support vector classifier (default settings) on the training split
# and evaluate it on the held-out test split.
svc_model = SVC()
svc_model.fit(X_train, y_train)

pred = svc_model.predict(X_test)
print(pred)

print(" ")

# Recompute the confusion matrix from THIS model's predictions. Without this
# step `cm` would still hold the matrix left over from an earlier cell, and
# the table printed below would describe a different model.
# `labels` pins the row/column order to match the index/columns used below.
cm = metrics.confusion_matrix(y_test, pred, labels=['ham', 'spam'])

print(" ")

cm = pd.DataFrame(cm, index=['ham', 'spam'], columns=['ham', 'spam'])
print(cm)

print(" ")

report = metrics.classification_report(y_test, pred)
print(report)

In [None]:
# For ham, recall and precision are good.

# For spam, recall and precision are better compared to
# logistic regression and Naive Bayes.

# Using the support vector classifier, we get better results
# at identifying spam and ham messages.

# The model is 86% accurate.