# Name: Aishwarya Khairnar

# Task 4: Email Spam detection with machine learning

In [46]:
import numpy as np
import pandas as pd
from numpy import random
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
data = pd.read_csv("spam.csv" ,encoding_errors= 'replace')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [48]:
data = data.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])

In [49]:
data = data.rename(columns={'v1': 'category','v2':'message'})
data.head(3)

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [50]:
data.shape

(5572, 2)

In [51]:
data = data[["message", "category"]]
data.head()

Unnamed: 0,message,category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [52]:
X = data['message']
y = data['category']

# Encode X using CountVectorizer()

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv

In [54]:
X_vec = cv.fit_transform(X)
X_vec

<5572x8625 sparse matrix of type '<class 'numpy.int64'>'
	with 73734 stored elements in Compressed Sparse Row format>

In [55]:
X_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [56]:
cv.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zoom', 'zouk', 'zyada'], dtype=object)

In [57]:
pd.DataFrame(X_vec.toarray(), columns = cv.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split the data into train and test sets

In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size = 0.3, random_state = 0)

# Apply Logistic Regression on the train set

In [59]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr

In [60]:
lr.fit(X_train, y_train)

# Performing predictions on X_test

In [61]:
y_pred = lr.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

# Check Accuracy

In [62]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9778708133971292

In [63]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1432,    2],
       [  35,  203]], dtype=int64)

# observations:
- The number of correct classifications are high in this case 1445 and 187
- The number of incorrect classifications are low in this case 34 and 6
- 1432 - False Negative. Message is a spam and predictions is also a spam
- 203 - True Positive. Message is a ham and prediction is also a ham
- 35 - True Negative. Message is a spam but prediction is a ham
- 2 - False Positive. Message is a ham but prediction is a spam