In [58]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict

In [59]:
df = pd.read_csv("C:/Users/abedi/OneDrive/Desktop/Naive Bayes Spam Classification/email classification.csv")

In [60]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [62]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [63]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [64]:
# Separate ham and spam
ham_df = df[df['Category'] == 'ham']
spam_df = df[df['Category'] == 'spam']

# Downsample ham to the same number as spam
ham_sample = ham_df.sample(n=len(spam_df), random_state=42)

# Combine and shuffle
df_balanced = pd.concat([ham_sample, spam_df]).sample(frac=1, random_state=42).reset_index(drop=True)

In [65]:
# Rebuild everything using df_balanced
df = df_balanced.copy()

In [66]:
df.head()

Unnamed: 0,Category,Message
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,spam,Do you want a new Video handset? 750 any time ...
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...


In [67]:
import string

#preprocessing the data

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('','', string.punctuation))
    return text

In [68]:
df['Message'] = df['Message'].apply(preprocess_text)

In [69]:
df.head()

Unnamed: 0,Category,Message
0,spam,urgent important information for o2 user today...
1,spam,panasonic bluetoothhdset free nokia free moto...
2,spam,do you want a new video handset 750 any time a...
3,spam,hi if ur lookin 4 saucy daytime fun wiv busty ...
4,spam,09066362231 urgent your mobile no 07xxxxxxxxx ...


In [70]:
 # Tokenize

def tokenize(text):
    return text.split()

df['tokens'] = df['Message'].apply(tokenize)

In [71]:
df.head()

Unnamed: 0,Category,Message,tokens
0,spam,urgent important information for o2 user today...,"[urgent, important, information, for, o2, user..."
1,spam,panasonic bluetoothhdset free nokia free moto...,"[panasonic, bluetoothhdset, free, nokia, free,..."
2,spam,do you want a new video handset 750 any time a...,"[do, you, want, a, new, video, handset, 750, a..."
3,spam,hi if ur lookin 4 saucy daytime fun wiv busty ...,"[hi, if, ur, lookin, 4, saucy, daytime, fun, w..."
4,spam,09066362231 urgent your mobile no 07xxxxxxxxx ...,"[09066362231, urgent, your, mobile, no, 07xxxx..."


In [72]:
vocab = set()
for tokens in df['tokens']:
    vocab.update(tokens)

In [73]:
# Initialize dictionary

word_counts = defaultdict(lambda: {'ham':0,'spam':0})
class_counts = {'ham':0, 'spam':0}

In [74]:
for i, row in df.iterrows():
    class_label = row['Category']
    class_counts[class_label] += 1
    for word in row['tokens']:
        word_counts[word][class_label] += 1

In [None]:
# Calculate probabilities

total_samples = len(df)
prior_ham = class_counts['ham'] / total_samples
prior_spam = class_counts['spam'] / total_samples

In [76]:
# Calculate likelihood probabilities

likelihood_prob = {}
for word in vocab:
    likelihood_prob[word] = {
        'ham': (word_counts[word]['ham']+1) / (class_counts['ham'] + len(vocab)),
        'spam': (word_counts[word]['spam']+1) / (class_counts['spam'] + len(vocab))
    }

In [77]:
# Prediction function

def predict(email):
    tokens = tokenize(preprocess_text(email))
    unseen_ham = 1 / (class_counts['ham'] + len(vocab))
    unseen_spam = 1 / (class_counts['spam'] + len(vocab))

    log_prob_ham = np.log(prior_ham)
    log_prob_spam = np.log(prior_spam)

    for word in tokens:
        prob_ham = likelihood_prob.get(word, {'ham': unseen_ham})['ham']
        prob_spam = likelihood_prob.get(word, {'spam': unseen_spam})['spam']
        log_prob_ham += np.log(prob_ham)
        log_prob_spam += np.log(prob_spam)

    print("Log P(Ham):", log_prob_ham)
    print("Log P(Spam):", log_prob_spam)

    return 'ham' if log_prob_ham > log_prob_spam else 'spam'


In [78]:
# Inference
new_email = "Get a free iPhone today, click here"
predicted_label = predict(new_email)
print("Predicted label: ", predicted_label)

Log P(Ham): -45.40406366418128
Log P(Spam): -38.771398106968206
Predicted label:  spam
