EMAIL SPAM DETECTION

In [25]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Load the raw dataset from the zipped CSV, decoding it as ISO-8859-1.
df = pd.read_csv(
    "spam.csv.zip",
    encoding="ISO-8859-1",
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [27]:
# Count the missing (NaN) entries in each column of the raw frame.
df.isnull().sum(axis=0)

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [28]:
# Remove the placeholder "Unnamed: N" columns by name.
# dropna(axis=1) would drop *any* column containing a single NaN, so one
# missing message would silently delete the whole text column, and a fully
# populated stray column would be kept. Selecting by name avoids both and
# keeps the cell safe to re-run (nothing left to drop on a second run).
unnamed_columns = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [29]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# Preview the last five rows.
df.tail(n=5)

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [31]:
# Summary of both text columns: row count, number of distinct values,
# most frequent value and how often it occurs.
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [32]:
# (number of rows, number of columns) after the sparse columns were removed.
df.shape

(5572, 2)

In [33]:
# Confirm that no missing values remain in the two kept columns.
df.isnull().sum(axis=0)

v1    0
v2    0
dtype: int64

In [34]:
# Give the columns descriptive names: v1 -> label, v2 -> email.
df = df.rename(mapper={'v1': 'label', 'v2': 'email'}, axis='columns')
df

Unnamed: 0,label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [35]:
# Spell out the class names: 'ham' becomes 'non-spam', 'spam' is kept as is.
df['label'] = df['label'].replace(to_replace={'ham': 'non-spam', 'spam': 'spam'})

In [36]:
df.head()

Unnamed: 0,label,email
0,non-spam,"Go until jurong point, crazy.. Available only ..."
1,non-spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,non-spam,U dun say so early hor... U c already then say...
4,non-spam,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
#converting categorical variables into numerical variables
# Encoding: spam -> 0, non-spam -> 1.
# A single mapping followed by astype(int) yields a proper integer column
# instead of writing Python ints into a string column one class at a time.
# dict.get(value, value) leaves already-encoded values untouched, so the cell
# can be re-run safely; any unexpected non-numeric label makes astype(int)
# raise instead of slipping through to training.
label_codes = {'spam': 0, 'non-spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value)).astype(int)

In [38]:
# Model input is the message text; the target is the encoded label.
x, y = df['email'], df['label']

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Make sure both target vectors are plain integers before training and scoring.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# TF-IDF vectoriser: lower-cases the text and ignores English stop words.
tfidf = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectoriser on the training messages only, then reuse that fitted
# vectoriser to encode the test messages.
x_train_feature = tfidf.fit_transform(x_train)
x_test_feature = tfidf.transform(x_test)

In [42]:
# Build a support vector classifier with its default settings...
svm = SVC()

# ...and train it on the TF-IDF features of the training messages.
svm.fit(X=x_train_feature, y=y_train)

SVC()

In [43]:
# Predict the encoded label of every held-out message.
y_pred = svm.predict(X=x_test_feature)

In [44]:
# Share of held-out messages whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy:", accuracy)

accuracy: 0.9766816143497757


In [45]:
#print the classification report
# labels/target_names spell out the encoding used earlier in the notebook
# (0 = spam, 1 = non-spam) so the rows are readable without looking it up.
rp = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['spam', 'non-spam'],
)
# The report is a multi-line, column-aligned string: print the caption on its
# own line so the header row is not pushed out of alignment with the table.
print("classification report:")
print(rp)

classification report:               precision    recall  f1-score   support

           0       0.99      0.83      0.91       150
           1       0.97      1.00      0.99       965

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Classify one message typed by the user with the fitted vectoriser and model.
mail_message = [input("enter a message: ").strip()]

if not mail_message[0]:
    # A blank message has no tokens to score, so a predicted label would be
    # meaningless; report that instead of classifying it.
    print("no message entered")
else:
    # Encode the message with the vocabulary learned from the training data.
    mail_feature = tfidf.transform(mail_message)
    pred = svm.predict(mail_feature)

    # Label encoding used by the notebook: 1 = non-spam, 0 = spam.
    if pred[0] == 1:
        print("not-spam")
    else:
        print("spam")