## Import the dependencies

In [1]:
import pandas as pd #for data manipulation
from sklearn.feature_extraction.text import TfidfVectorizer #for vectorizing text data
from sklearn.naive_bayes import MultinomialNB #for Naive Bayes classification
from sklearn.model_selection import train_test_split #to split data into training and testing sets
from sklearn.pipeline import make_pipeline #to chain preprocessing and classification steps
from sklearn.metrics import classification_report, roc_auc_score #import evaluation metrics
from sklearn.metrics import accuracy_score

## Data collection and pre-processing

In [2]:
#loading the data from csv file to a pandas DataFrame
mail=pd.read_csv("mail_data.csv")

In [3]:
#print the first 5 rows of the DataFrame
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#check for null values in the DataFrame
mail.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
#label spam mail as 0; ham mail as 1;
mail["Category"]=mail["Category"].map({"ham":1,"spam":0})

In [6]:
#create input variable X containing the email text
X=mail["Message"]
#create target variable y containing the label (spam or ham)
y=mail["Category"]

#### Instantiate pre-processors

In [7]:
# instantiate a TfidfVectorizer object
vect=TfidfVectorizer(min_df=5,stop_words="english",lowercase="True")
#instantiate a MultinomialNB object for classification
clf=MultinomialNB()

In [8]:
#create a pipeline that chains the preprocessing and classification steps
pipe=make_pipeline(vect,clf)

## Split data into training and testing set

In [9]:
#Using stratification to ensure balanced classes
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.4,random_state=42)

In [10]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(3343,) (2229,) (3343,) (2229,)


## Model training and prediction

In [11]:
#Train the model on the training set using the fit() method
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(lowercase='True', min_df=5,
                                 stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [12]:
#Predict the labels for the training set
y_train_pred=pipe.predict(X_train)
#Evaluate the accuracy of the model on the training set
train_accuracy=accuracy_score(y_train,y_train_pred)
print(f"Train accuracy_score: {train_accuracy}")

Train accuracy_score: 0.9877355668561173


In [13]:
#Predict the labels for the testing set
y_test_pred=pipe.predict(X_test)
#Evaluate the accuracy of the model on the testing set
test_accuracy=accuracy_score(y_test,y_test_pred)
print(f"Train accuracy_score: {test_accuracy}")

Train accuracy_score: 0.9753252579632122


In [14]:
#Generate a classification report for the training set
train_class_report=classification_report(y_train, y_train_pred,output_dict=True)
pd.DataFrame(train_class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.908482,0.952047,448.0
1,0.986035,1.0,0.992969,2895.0
accuracy,0.987736,0.987736,0.987736,0.987736
macro avg,0.993018,0.954241,0.972508,3343.0
weighted avg,0.987907,0.987736,0.987485,3343.0


In [15]:
#Generate a classification report for the testing set
test_class_report=classification_report(y_test, y_test_pred,output_dict=True)
pd.DataFrame(test_class_report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.980315,0.832776,0.900542,299.0
1,0.974684,0.997409,0.985915,1930.0
accuracy,0.975325,0.975325,0.975325,0.975325
macro avg,0.977499,0.915093,0.943229,2229.0
weighted avg,0.975439,0.975325,0.974463,2229.0
