In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the raw dataset from the working directory, decoding the file as latin-1.
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

# Preview the first rows to check which columns were parsed.
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Dimensions of the freshly loaded frame as (rows, columns).
data.shape

(5572, 5)

In [5]:
# Keep only `v1` (the ham/spam label) and `v2` (the message text); the
# remaining `Unnamed: *` columns from the CSV are discarded.
data = data.loc[:, ['v1', 'v2']]

In [6]:
# Confirm that only the two retained columns are left.
data.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Show the frame itself; for a frame this long pandas elides the middle rows
# and renders only the first and last few.
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
# Per-column count of missing values.
data.isna().sum()

v1    0
v2    0
dtype: int64

# Encoding the labels

In [9]:
# One-hot encode the label column `v1` into one indicator column per class
# (named `v1_<class>` by get_feature_names_out).
# NOTE: this cell rebinds `data` without its `v1` column, so re-running it
# without first re-running the cells above fails on the `data[['v1']]` lookup.
ohe = OneHotEncoder()

# fit_transform is densified here so the result can be wrapped in a DataFrame.
encoded = ohe.fit_transform(data[['v1']]).toarray()

# Carry over `data`'s own index: the column-wise concat below aligns on index
# labels, so a frame built with a fresh 0..n-1 index would be mis-aligned (and
# padded with NaN) as soon as `data` has any other index, e.g. after rows
# have been filtered out.
encoded_df = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(['v1']),
    index=data.index,
)

# Replace the raw label column with its indicator columns.
data = pd.concat([data.drop(['v1'], axis=1), encoded_df], axis=1)
data.head()


Unnamed: 0,v2,v1_ham,v1_spam
0,"Go until jurong point, crazy.. Available only ...",1.0,0.0
1,Ok lar... Joking wif u oni...,1.0,0.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0.0,1.0
3,U dun say so early hor... U c already then say...,1.0,0.0
4,"Nah I don't think he goes to usf, he lives aro...",1.0,0.0


In [10]:
# # Text Vectorization

# tfidf = TfidfVectorizer()
# X_tfidf = tfidf.fit_transform(data['v2'])

# # Splitting the dataset into training and testing sets
# # X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data.drop(['v2'], axis=1), test_size=0.2, random_state=42)
# vector_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out(['v2']))
# data = pd.concat([data.drop(['v2'], axis=1), vector_df], axis=1)
# data.head()

# Model training and evaluation

In [11]:
# --- Features / target -------------------------------------------------------
# X keeps everything except the two indicator columns, i.e. the text column `v2`.
X = data.drop(columns=['v1_ham', 'v1_spam'])
# y collapses the indicator columns back into a single label: take the name of
# the column holding the row maximum and strip its 'v1_' prefix.
y = data[['v1_ham', 'v1_spam']].idxmax(axis='columns').str.replace('v1_', '')

# --- Train / test split ------------------------------------------------------
# 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Multinomial Naive Bayes classification model ----------------------------
# TF-IDF vectorisation (English stop words removed) feeding the classifier.
# NOTE(review): the step name 'ifidf' looks like a typo for 'tfidf'; it is left
# unchanged because the name is a key of the fitted pipeline that gets saved.
model_pipeline = Pipeline(steps=[
    ('ifidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

# Fit on the raw training messages, then score the held-out messages.
model_pipeline.fit(X_train['v2'], y_train)
y_pred = model_pipeline.predict(X_test['v2'])

# NOTE(review): accuracy alone can hide weak spam detection if the classes are
# imbalanced - consider per-class metrics as well.
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy_score: {accuracy:.2f}")

accuracy_score: 0.97


In [12]:
# Classify one hand-written message with the fitted pipeline; predict() takes
# an iterable of raw strings, so the message is wrapped in a list.
sample_messages = [
    'Hay how are you doing today? I hope you are doing well. I have something for you totally free free free.',
]
new_pred = model_pipeline.predict(sample_messages)
print(f"New prediction: {new_pred[0]}")

New prediction: ham


In [14]:
# Persist the fitted pipeline (vectoriser + classifier) to disk.
# `joblib` is imported in the notebook's import cell, not here.
# The file is pickle-based: only load it back from a trusted source.
joblib.dump(model_pipeline, 'spam_detector.pkl')

['spam_detector.pkl']