In [1]:
import sklearn
import pandas as pd
import joblib

In [2]:
df = pd.read_csv(r"D:\Datasets\Machine Learning\spam.csv", encoding = 'ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
df.rename(columns={'v1':'Target','v2':'Features'},inplace=True)

In [8]:
df.head()

Unnamed: 0,Target,Features
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['Target'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
df['Target'] = Encoder.fit_transform(df['Target'])

In [11]:
X = df['Features']
y = df['Target']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
Vectorizer = CountVectorizer()
X_train_vec = Vectorizer.fit_transform(X_train)
X_test_vec = Vectorizer.transform(X_test)

In [14]:
from sklearn.naive_bayes import MultinomialNB
spam_detector = MultinomialNB()

In [15]:
spam_detector.fit(X_train_vec,y_train)

In [16]:
y_pred = spam_detector.predict(X_test_vec)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.95      0.97      0.96       147

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[960   8]
 [  5 142]]


In [18]:
df.head()

Unnamed: 0,Target,Features
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
import re
def textcleaning(text):
    text = text.lower()  # For Making It In Lower Case
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = text.strip()
    return text

def prediction(email):
    cleaned_email = textcleaning(email) # Using The Above Function To Clean the Text of Email
    vectorized_email = Vectorizer.transform([cleaned_email]) # Vectorizing The Email Content
    prediction = spam_detector.predict(vectorized_email) # Making Predictions From The Model
    return 'spam' if prediction[0] == 1 else 'ham' # If Predictions = 1 Then 'Spam' Or else 'Ham'
     
email_content = "Hey Buddy How Are You Doing Wanna Join Tonight For Dinner"
result = prediction(email_content)
print(f"The email content is classified as: {result}")

The email content is classified as: ham


In [53]:
joblib.dump(spam_detector,"spam_detectiom.h5")

['spam_detectiom.h5']