In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px
import re
from sklearn.metrics import classification_report, accuracy_score
import scipy.sparse as sp

In [None]:
mail_data = pd.read_csv('/content/emails.csv')

In [None]:
# Rename 'spam' column to 'Label'
mail_data.rename(columns={'spam': 'Label'}, inplace=True)
mail_data.rename(columns={'text': 'Body'}, inplace=True)

In [None]:
mail_data.head()

Unnamed: 0,Body,Label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    5728 non-null   object
 1   Label   5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [None]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5728, 2)

In [None]:
mail_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Label,5728.0,0.238827,0.426404,0.0,0.0,0.0,0.0,1.0


In [None]:
len(mail_data['Body'].unique())

5695

In [None]:
mail_data.isnull().sum()

Unnamed: 0,0
Body,0
Label,0


In [None]:
mail_data.duplicated().sum()

33

In [None]:
duplicate=mail_data[mail_data.duplicated(keep='last')]
duplicate

Unnamed: 0,Body,Label
1417,"Subject: day off tuesday stinson , i would l...",0
1508,Subject: re : enron weather research good aft...,0
1532,Subject: schedule and more . . dr . kaminski ...,0
1667,"Subject: re : summer work . . jinbaek , this...",0
1749,"Subject: term paper dr . kaminski , attached...",0
1791,"Subject: re : contact info glenn , please , ...",0
1801,Subject: departure of grant masson the resear...,0
1828,"Subject: re : term project : brian , no prob...",0
1881,Subject: research allocations to egm hi becky...,0
1963,"Subject: re : schedule and more . . jinbaek ,...",0


In [None]:
counts = mail_data['Label'].value_counts().reset_index()
counts.columns = ['Label', 'Count']
# Create a bar plot using Plotly Express
fig = px.bar(counts, x='Label', y='Count', color='Label')
fig.update_layout(title='Number of Spam and Ham Emails', xaxis_title='Label', yaxis_title='Count')
fig.update_xaxes(tickvals=[0, 1], ticktext=['Ham', 'Spam'])
fig.show()


In [None]:
# separating the data as texts and label

X = mail_data['Body']

Y = mail_data['Label']

In [None]:
print(X)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: Body, Length: 5728, dtype: object


In [None]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: Label, Length: 5728, dtype: int64


In [None]:
# Check class distribution
print(mail_data['Label'].value_counts())

Label
0    4360
1    1368
Name: count, dtype: int64


In [None]:
# Shuffle the dataset
mail_data = mail_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows
print(mail_data.head())

                                                Body  Label  \
0  Subject: wti - new eol product  ted ,  enclose...      0   
1  Subject: claim your free $ 1000 home depot gif...      1   
2  Subject: you don _ t know how to attract custo...      1   
3  Subject: you want to submit your website to se...      1   
4  Subject: impress your girl with a huge cumshot...      1   

                                          Clean_Body  Num_URLs  Has_Attachment  
0  subject  wti   new eol product  ted    enclose...         0           False  
1  subject  claim your free   1000 home depot gif...         0           False  
2  subject  you don _ t know how to attract custo...         0           False  
3  subject  you want to submit your website to se...         0           False  
4  subject  impress your girl with a huge cumshot...         0           False  


In [None]:
print(mail_data['Label'].head(10))

0    0
1    0
2    0
3    0
4    0
5    1
6    0
7    0
8    0
9    1
Name: Label, dtype: int64


In [None]:
# Check class distribution
print(mail_data['Label'].value_counts())

Label
0    4360
1    1368
Name: count, dtype: int64


In [None]:
# Preprocessing Function
def preprocess_email(text):
    # Handle missing values
    if pd.isnull(text):
        text = ""

    # Count URLs
    num_urls = len(re.findall(r'(https?://\S+)', text))

    # Check for attachments (simple keyword-based detection)
    has_attachment = bool(re.search(r'attachment|attached|file', text, re.IGNORECASE))

    # Remove URLs
    text = re.sub(r'(https?://\S+)', '', text)

    # Remove non-alphanumeric characters
    text = re.sub(r'\W', ' ', text)

    # Lowercase all text
    text = text.lower()

    return text, num_urls, has_attachment

# Apply Preprocessing to 'body'
mail_data['Clean_Body'], mail_data['Num_URLs'], mail_data['Has_Attachment'] = zip(*mail_data['Body'].map(preprocess_email))

# Display Processed Data
print(mail_data.head())



                                                Body  Label  \
0  Subject: re : energy derivatives conference - ...      0   
1  Subject: financial maths course , part 2  vinc...      0   
2  Subject: re : bullet points  please respond to...      0   
3  Subject: re : enron default swaps  darrell ,  ...      0   
4  Subject: re : power question  steve ,  elena c...      0   

                                          Clean_Body  Num_URLs  Has_Attachment  
0  subject  re   energy derivatives conference   ...         0            True  
1  subject  financial maths course   part 2  vinc...         0           False  
2  subject  re   bullet points  please respond to...         0            True  
3  subject  re   enron default swaps  darrell    ...         0           False  
4  subject  re   power question  steve    elena c...         0           False  


In [None]:
#Feature Extraction
# TF-IDF Vectorization for Text Body
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=5,
    max_features=3000,
    lowercase=True
)

X_text = vectorizer.fit_transform(mail_data['Clean_Body'])

# Combine Text and Numeric Features
X_combined = sp.hstack([
    X_text,
    np.array(mail_data['Num_URLs']).reshape(-1,1),
    np.array(mail_data['Has_Attachment']).reshape(-1,1)
])

# Define Labels
y = mail_data['Label']

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Train Logistic Regression Model
log_reg_model = LogisticRegression(class_weight='balanced', max_iter=1000)
log_reg_model.fit(X_train, y_train)

In [None]:
# prediction on training data

prediction_on_training_data = log_reg_model.predict(X_train)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9914884329986905


In [None]:
# Evaluate Model
y_pred = log_reg_model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.9834205933682374

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       872
           1       0.95      0.99      0.97       274

    accuracy                           0.98      1146
   macro avg       0.97      0.98      0.98      1146
weighted avg       0.98      0.98      0.98      1146



Using Random forest Model

In [None]:
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)
# Random Forest Evaluation
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9834205933682374

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       872
           1       0.96      0.97      0.97       274

    accuracy                           0.98      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.98      0.98      0.98      1146



Using Naive Bayes Model

In [None]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
# Naive Bayes Evaluation
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.9790575916230366

Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       872
           1       0.97      0.94      0.96       274

    accuracy                           0.98      1146
   macro avg       0.98      0.97      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [None]:
#Predict on a Sample Email (for each model)
def predict_email(email_text, model):
    text, num_urls, has_attachment = preprocess_email(email_text)
    text_vector = vectorizer.transform([text])
    features = sp.hstack([
        text_vector,
        np.array([num_urls]).reshape(1, -1),
        np.array([has_attachment]).reshape(1, -1)
    ])
    prediction = model.predict(features)
    return "Spam" if prediction[0] == 1 else "Ham"


In [None]:
# Example Prediction for each model
sample_email = """
Subject: Free Gift Card Offer!
Click here: http://spamlink.com to claim your reward.
"""

In [None]:
# Test on Logistic Regression Model
print("\nLogistic Regression Sample Email Prediction:", predict_email(sample_email, log_reg_model))

# Test on Random Forest Model
print("\nRandom Forest Sample Email Prediction:", predict_email(sample_email, rf_model))

# Test on Naive Bayes Model
print("\nNaive Bayes Sample Email Prediction:", predict_email(sample_email, nb_model))


Logistic Regression Sample Email Prediction: Spam

Random Forest Sample Email Prediction: Spam

Naive Bayes Sample Email Prediction: Spam


In [None]:
# Example Prediction for each model
sample_email = """
Subject: re : new color printer  sorry ,  don ' t we need to know the cost , as well .  - - - - - - - - - - - - - - - - - - - - - - forwarded by kevin g moore / hou / ect on 12 / 14 / 99 08 : 15  am - - - - - - - - - - - - - - - - - - - - - - - - - - -  kevin g moore  12 / 14 / 99 08 : 09 am  to : shirley crenshaw / hou / ect @ ect , mike a roberts / hou / ect @ ect  cc :  subject : re : new color printer  this information was also sent to it purchasing .  i need to know what options we have and how soon it  can be delivered .  don ' t we need to know as well ? before purchase .  i also need a central location for this printer .  thanks  kevin moore  sam mentioned hp 4500 , i will check into it .  - - - - - - - - - - - - - - - - - - - - - - forwarded by kevin g moore / hou / ect on 12 / 14 / 99 08 : 05  am - - - - - - - - - - - - - - - - - - - - - - - - - - -  shirley crenshaw  12 / 14 / 99 07 : 55 am  to : kevin g moore / hou / ect @ ect  cc :  subject : re : new color printer  kevin :  what kind of information do you need ? i thought you were going to look  at some colored printer literature . sam seemed to be aware of a  colored printer that might work for us . ask him . i don ' t think we need  anything as big as " sapphire " .  it will be located in your area on the 19 th floor .  thanks !  kevin g moore  12 / 14 / 99 06 : 27 am  to : shirley crenshaw / hou / ect @ ect , vince j kaminski / hou / ect @ ect , mike a  roberts / hou / ect @ ect  cc :  subject : new color printer  we are in need of a new color printer .  we are also in the process of moving to the 19 th floor .  we need the color printer a . s . a . p .  if you would please , i need information concerning this  matter whereby , we can get the printer ordered and delivered  to our new location .  thanks  kevin moore
"""

In [None]:
# Test on Logistic Regression Model
print("\nLogistic Regression Sample Email Prediction:", predict_email(sample_email, log_reg_model))

# Test on Random Forest Model
print("\nRandom Forest Sample Email Prediction:", predict_email(sample_email, rf_model))

# Test on Naive Bayes Model
print("\nNaive Bayes Sample Email Prediction:", predict_email(sample_email, nb_model))


Logistic Regression Sample Email Prediction: Ham

Random Forest Sample Email Prediction: Ham

Naive Bayes Sample Email Prediction: Ham
