In [471]:
#Email spam detective model
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer #for converting text to digits
from sklearn.naive_bayes import GaussianNB #Naive_bayes model for a continuos values
from sklearn.naive_bayes import MultinomialNB #Naive_bayes model for discrete values
from sklearn.model_selection import cross_val_score #For checking the average of the best performing model
from sklearn.model_selection import train_test_split #For splitting the dataframe into training and testing

In [473]:
df = pd.read_csv("spam.csv")

In [475]:
df.shape

(5572, 2)

In [477]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [479]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [481]:
df.duplicated().sum()

415

In [483]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [485]:
#Import a pipeline
df.Category.nunique() #Cheking for the number of unique valuesmin our 'Category' column

2

In [487]:
df.Category.values #Checking the names of those two unique values

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [489]:
from sklearn.pipeline import Pipeline #Using a pipeline to to create a channel for the models

In [491]:
#Category has two values which can be changed into bools 0 and 1 using lamda function
df["spam"] = df["Category"].apply(lambda x : 1 if x== 'spam' else 0)

In [493]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [495]:
# Dividing my dataframe into features(independent) 'X' vars and target(dependent) vars 'y'
X = df.Message
y = df.spam

In [573]:
#Using a spliting model by giving my testing data size a 20% and training data as 80%
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [575]:
#Building a pipeline for my CountVectorizer() and MultinomialNB() models
pipe = Pipeline([
    ("count_v",CountVectorizer()),
    ("nb",MultinomialNB())
    
    
    
])

In [577]:
pipe.fit(X_train,y_train)

In [579]:
#fitting my pipeline
pipe.score(X_test,y_test)

0.990667623833453

In [581]:
# checking my model performance using cross_val_score
cross_val_score(pipe,X_test,y_test)

array([0.98566308, 0.99283154, 0.97132616, 0.98920863, 0.98201439])

In [583]:
#Average of my model performance
sum(cross_val_score(pipe,X_test,y_test))/5

0.98420876202264

In [606]:
#Create a list of emails to check for either spam or not
emails = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "Nah I don't think he goes to usf, he lives around here though",
    "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
    "Even my brother is not like to speak with me. They treat me like aids patent",
    "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.",
    "Is that seriously how you spell his name?",
    "FRIENDS OF AAFIA SIDDIQUI FREE DR AAFIA SIDDIQUI: A CALL FOR JUSTICE AND CLEMENCY kampanyasıyla ilgili bir güncelleme paylaştı. Güncellemeye göz at ve kampanyayı paylaş",
    "And as a thank you, use WELCOMETOFLUER to get 40% off AI credits when you check out on the website, or tap on this link on mobile to claim the offer."
]

In [608]:
#prediction of my kind of email
pipe.predict(emails)

array([1, 0, 0, 0, 1, 0, 0, 1], dtype=int64)