<a href="https://colab.research.google.com/github/Abhisek-Tiwari/Email-Spam-Detection/blob/main/Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled messages (columns: Category, Message) into a DataFrame
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Number of messages per label, to see how balanced the two classes are.
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype the same as before.
df['spam'] = (df['Category'] == 'spam').astype('int64')

df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Seed used for every stochastic step in this notebook.
SEED = 42
np.random.seed(SEED)  # kept so code relying on NumPy's global RNG is still seeded

# Hold out 25% of the messages for evaluation. Passing random_state explicitly
# makes the split reproducible on its own, instead of depending on whatever
# state the global NumPy RNG happens to be in when this cell runs.
# NOTE(review): with the same seed this should give the same partition as the
# earlier global-seed approach -- confirm by re-running the evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.25, random_state=SEED
)

In [9]:
# import count vectorizer to make bag of words
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Learn the vocabulary from the training messages and turn them into a sparse
# document-term count matrix in one pass.
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.to_numpy())

# Show the sparse-matrix summary (shape, dtype, number of stored entries).
X_train_cv

<4179x7489 sparse matrix of type '<class 'numpy.int64'>'
	with 55858 stored elements in Compressed Sparse Row format>

In [11]:
# Peek at a few dense rows only. Calling .toarray() on the full matrix would
# allocate every (document, term) cell -- almost all of them zero -- just to
# print a truncated preview.
X_train_cv[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# Dimensions of the document-term matrix: (training messages, vocabulary terms).
X_train_cv.shape

(4179, 7489)

In [15]:
# Look at ten consecutive entries of the learned vocabulary as a sanity check
# on what the vectorizer treats as a token.
feature_names = v.get_feature_names_out()
feature_names[3000:3010]

array(['getiing', 'geting', 'gets', 'getsleep', 'getstop', 'gettin',
       'getting', 'getzed', 'gf', 'ghodbandar'], dtype=object)

In [16]:
# Fit a multinomial Naive Bayes classifier on the bag-of-words counts.
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB().fit(X_train_cv, y_train)

# Display the fitted estimator.
nb

In [17]:
# Vectorize the held-out messages with the vocabulary already fitted on the
# training set: transform() only, so nothing is learned from the test data.
X_test_cv = v.transform(X_test)

In [18]:
# Evaluate on the held-out set: per-class precision, recall and F1.
from sklearn.metrics import classification_report

y_pred = nb.predict(X_test_cv)

# classification_report returns a formatted string, so print it to keep the
# table layout instead of showing the escaped repr.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.98      0.94      0.96       186

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [23]:
# Bundle vectorization and classification into a single estimator so raw text
# can be passed straight to fit()/predict() without a manual transform step.
from sklearn.pipeline import Pipeline

clf = Pipeline([
    # Step names are the keys used by clf.named_steps and by
    # set_params('<step>__<param>'); the spelling was previously 'vectorzer'.
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [24]:
# Fit the whole pipeline on the raw training text; the vectorizer step is
# fitted and applied internally before the classifier is trained.
clf.fit(X_train, y_train)

In [25]:
# Score the pipeline on the raw held-out messages; it vectorizes them
# internally before predicting.
y_preds = clf.predict(X_test)

# Print the per-class precision / recall / F1 table for the pipeline.
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.98      0.94      0.96       186

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393

