In [None]:
import pandas as pd
import numpy as np
import spacy
import re

In [None]:
# Read the mail dataset CSV into a DataFrame.
emails = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Load spaCy's small English pipeline.
# The notebook only reads lemmas and lexical flags (is_punct / is_stop /
# is_digit) from the processed tokens, so the dependency parser and the
# named-entity recogniser are disabled: they add per-message cost without
# contributing to those attributes. The tagger, attribute ruler and lemmatizer
# stay enabled because the lemmas depend on them.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Discard rows that are exact duplicates of an earlier row (first occurrence kept).
# The original index labels are kept, so the index may contain gaps afterwards.
emails = emails.drop_duplicates()

In [None]:
def preprocessing(message):
    """Normalise one message into a space-separated string of lemmas.

    The text is stripped of URLs, run through the module-level spaCy pipeline
    ``nlp``, and reduced to the lower-cased lemma of every token that is not
    punctuation, a stop word, or made up only of digits.

    Args:
        message: The message text (a ``str``).

    Returns:
        The remaining lower-cased lemmas joined by single spaces.
    """
    # Raw string literal: in a normal literal "\S" is an invalid escape
    # sequence, which recent Python versions warn about at compile time.
    text = re.sub(r"(http|https)\S+", "", message)
    doc = nlp(text)
    newText = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_digit)
    ]
    return ' '.join(newText)

In [None]:
# Replace every message with its cleaned, lemmatised form.
# NOTE: this overwrites the raw text, so re-running the cell preprocesses the
# already-preprocessed messages again.
emails['Message'] = emails['Message'].map(preprocessing)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Features (message text) and target (category label) columns.
X, Y = emails['Message'], emails['Category']

In [None]:
# Build the TF-IDF vectorizer and learn its vocabulary / IDF weights.
# NOTE: X is the whole message column here, i.e. this fit happens before the
# train/test split further down.
vectorizer = TfidfVectorizer(min_df=1).fit(X)
vectorizer

In [None]:
# Encode the labels numerically: spam -> 0, ham -> 1.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on emails['Category']: that form is a chained
# in-place update on an intermediate Series, which pandas does not guarantee
# will reach the DataFrame (with Copy-on-Write it never does).
#
# Values that are not keys of label_codes are passed through unchanged, so
# re-running this cell on already-encoded labels is harmless; anything that
# cannot be converted to an integer makes astype(int) raise instead of silently
# flowing into the model.
label_codes = {'spam': 0, 'ham': 1}
emails['Category'] = (
    emails['Category']
    .map(lambda label: label_codes.get(label, label))
    .astype(int)
)

# Y was bound to the column before it was encoded; re-bind it so the
# train/test split and the metrics operate on the numeric labels.
Y = emails['Category']

In [None]:
from sklearn.metrics import f1_score , confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Hold out 20% of the rows for evaluation; stratify on the label so both splits
# keep the same class proportions, and fix the seed for a repeatable split.
X_train, X_test, Y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
    stratify=Y,
)

In [None]:
# Multinomial Naive Bayes classifier with its default settings.
classifier = MultinomialNB()

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from the training split only and
# encode it in the same step. The vectorizer was previously fitted on every
# message, including the rows that end up in the test split; re-fitting here
# keeps test documents from influencing the features the model is trained and
# evaluated on.
X_train_enCoding = vectorizer.fit_transform(X_train)

# Train the classifier on the encoded training messages.
classifier.fit(X_train_enCoding, Y_train)

In [None]:
# Encode the held-out messages with the already-fitted vectorizer (transform
# only, no fitting) and predict their labels.
X_test_enCoding = vectorizer.transform(X_test)
Y_predaction = classifier.predict(X_test_enCoding)

In [None]:
# F1 score on the held-out split. No pos_label is passed, so scikit-learn's
# default positive class (label 1, which is ham in this notebook's encoding)
# is the one being scored.
f1_score(y_true=y_test, y_pred=Y_predaction)

In [None]:
# Confusion matrix on the held-out split. labels=[1, 0] fixes the row/column
# order to label 1 first, then label 0.
cm = confusion_matrix(y_true=y_test, y_pred=Y_predaction, labels=[1, 0])

In [None]:
# Plot the confusion matrix as an annotated heatmap.
# cm was computed with labels=[1, 0], so index 0 of each axis is label 1 (ham)
# and index 1 is label 0 (spam); the tick names must follow that order or the
# two classes are shown swapped.
class_names = ['ham', 'spam']

plt.figure(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
# Rows of a scikit-learn confusion matrix are true labels, columns are predictions.
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

In [None]:
# Classification demo
# Spam --> 0
# Ham  --> 1

# Pick the sample by position: duplicates were dropped without resetting the
# index, so the label 10 is not guaranteed to exist, whereas position 10 is
# valid for any frame with at least 11 rows.
sample_message = emails['Message'].iloc[10]

# emails['Message'] already holds preprocessed text, so it is vectorised as-is;
# running preprocessing() a second time would no longer match the form the
# model was trained on. Raw, unseen text must go through preprocessing() first.
sample_features = vectorizer.transform([sample_message])

predactValue = classifier.predict(sample_features)
if predactValue[0] == 1:
    print('this is Ham Email')
else:
    print('this is Spam Email')
