In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, classification_report
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/divinity/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/divinity/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [5]:
df = pd.read_csv('mail_spam.csv')
df.head(5)

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [6]:
df1 = df.drop('title', axis=1)

In [7]:
df1

Unnamed: 0,text,type
0,"Hi James,\n\nHave you claim your complimentary...",spam
1,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam
...,...,...
79,"Dear Maryam, \n\n \n\nI would like to thank yo...",not spam
80,"Dear Customer,\n\nWelcome to Kilimall, Thanks ...",not spam
81,"Dear vladis163rus,\nHere is the Steam Guard co...",not spam
82,View In Browser | Log in\n \n \n\nSkrill logo\...,not spam


In [8]:
df1.shape

(84, 2)

In [9]:
df1.describe()

Unnamed: 0,text,type
count,84,84
unique,82,2
top,Model Casting Call\nThank you for taking the t...,not spam
freq,2,58


In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    84 non-null     object
 1   type    84 non-null     object
dtypes: object(2)
memory usage: 1.4+ KB


In [11]:
# Empty Values
df1.isna().sum()

text    0
type    0
dtype: int64

In [12]:
# Null Values
df1.isnull().sum()

text    0
type    0
dtype: int64

In [13]:
df1.duplicated().sum()

1

In [14]:
df1 = df1.drop_duplicates(keep='first')
df1.duplicated().sum()

0

In [15]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df1['type'] = encoder.fit_transform(df1['type'])
df1.head()

Unnamed: 0,text,type
0,"Hi James,\n\nHave you claim your complimentary...",1
1,"\nalt_text\nCongratulations, you just earned\n...",0
2,"Here's your GitHub launch code, @Mortyj420!\n ...",0
3,"Hello,\n \nThank you for contacting the Virtua...",0
4,"Hey Prachanda Rawal,\n\nToday's newsletter is ...",1


In [16]:
df1['type'].value_counts()

type
0    57
1    26
Name: count, dtype: int64

In [17]:
def preprocess_text(text):
    words = word_tokenize(text.lower())  # Tokenization and lowercase
    words = [stemmer.stem(word) for word in words if word.isalnum()]  # Stemming and removing non-alphanumeric
    words = [word for word in words if word not in stop_words]  # Removing stopwords
    return ' '.join(words)

df1['processed_text'] = df1['text'].apply(preprocess_text)

In [18]:
df1

Unnamed: 0,text,type,processed_text
0,"Hi James,\n\nHave you claim your complimentary...",1,hi jame claim complimentari gift yet compil sp...
1,"\nalt_text\nCongratulations, you just earned\n...",0,congratul earn 500 complet follow offer view p...
2,"Here's your GitHub launch code, @Mortyj420!\n ...",0,github launch code mortyj420 octocat stand nex...
3,"Hello,\n \nThank you for contacting the Virtua...",0,hello thank contact virtual reward center virt...
4,"Hey Prachanda Rawal,\n\nToday's newsletter is ...",1,hey prachanda rawal today newslett everyth nee...
...,...,...,...
79,"Dear Maryam, \n\n \n\nI would like to thank yo...",0,dear maryam would like thank applic role child...
80,"Dear Customer,\n\nWelcome to Kilimall, Thanks ...",0,dear custom welcom kilimal thank much join us ...
81,"Dear vladis163rus,\nHere is the Steam Guard co...",0,dear vladis163ru steam guard code need login a...
82,View In Browser | Log in\n \n \n\nSkrill logo\...,0,view browser log skrill logo money mover maker...


In [19]:
# Load and preprocess dataset
X = df1['text']
y = df1['type']

In [20]:
#Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
#Feature Extraction
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [22]:
# Train a Multinomial Naive Bayes classifier
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)

# Predict on the test set
y_pred = mnb.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.6470588235294118
Classification Report:
              precision    recall  f1-score   support

           0       0.65      1.00      0.79        11
           1       0.00      0.00      0.00         6

    accuracy                           0.65        17
   macro avg       0.32      0.50      0.39        17
weighted avg       0.42      0.65      0.51        17



In [23]:
# Train a Random Forest classifier
rfc = RandomForestClassifier(n_estimators = 200)
rfc.fit(X_train_vectorized, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.7647058823529411
Classification Report:
              precision    recall  f1-score   support

           0       0.73      1.00      0.85        11
           1       1.00      0.33      0.50         6

    accuracy                           0.76        17
   macro avg       0.87      0.67      0.67        17
weighted avg       0.83      0.76      0.72        17



In [24]:
# Train a Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train_vectorized, y_train)

# Predict on the test set
y_pred = gbc.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.7058823529411765
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.57      0.67      0.62         6

    accuracy                           0.71        17
   macro avg       0.69      0.70      0.69        17
weighted avg       0.72      0.71      0.71        17



In [25]:
# Save the Trained Model and Vectorizer
with open('gbc.pkl', 'wb') as model_file:
    pickle.dump(gbc, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)