## Email Classifier

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("emails.csv")

# Preprocess
X = df['text'] #first label is the associated text
y = df['spam'] #Second label is the Spam or not if spam=1
print(X[1])



Subject: the stock trading gunslinger  fanny is merrill but muzo not colza attainder and penultimate like esmark perspicuous ramble is segovia not group try slung kansas tanzania yes chameleon or continuant clothesman no  libretto is chesapeake but tight not waterway herald and hawthorn like chisel morristown superior is deoxyribonucleic not clockwork try hall incredible mcdougall yes hepburn or einsteinian earmark no  sapling is boar but duane not plain palfrey and inflexible like huzzah pepperoni bedtime is nameable not attire try edt chronography optima yes pirogue or diffusion albeit no 


In [5]:
# Remove subject in text
def remove_subject(email):
    if email.lower().startswith("subject:"):
        return email.split(":", 1)[1].strip()  # keep text after first colon
    return email

N_examples = len(X)
print(N_examples)

N = []      # Array to hold word counts
R_X = []    # Array to hold cleaned emails

for i in range(N_examples):
    cleaned_email = remove_subject(X[i])  # remove subject
    R_X.append(cleaned_email)             # store cleaned email
    word_count = len(cleaned_email.split())  # count words
    N.append(word_count)

print(N) 
print(N.index(max(N)))  # index of email with most words


5728
[323, 88, 86, 97, 51, 83, 1702, 94, 120, 94, 145, 94, 166, 1556, 116, 195, 42, 137, 12, 484, 72, 135, 126, 58, 66, 209, 175, 85, 88, 94, 15, 616, 135, 365, 405, 331, 43, 100, 53, 183, 98, 80, 1702, 48, 73, 215, 48, 534, 46, 3360, 130, 1702, 100, 37, 74, 650, 137, 121, 62, 72, 979, 1076, 34, 94, 330, 125, 219, 121, 96, 94, 21, 162, 137, 106, 323, 63, 554, 404, 100, 156, 77, 80, 40, 823, 74, 74, 313, 110, 1818, 75, 253, 113, 196, 611, 102, 71, 55, 180, 124, 94, 105, 204, 562, 60, 137, 91, 154, 40, 230, 287, 260, 50, 53, 105, 192, 17, 488, 137, 132, 42, 206, 204, 14, 89, 239, 84, 3366, 98, 137, 360, 36, 105, 107, 93, 61, 182, 137, 227, 183, 30, 38, 37, 79, 208, 127, 202, 137, 290, 13, 80, 297, 4045, 418, 94, 57, 208, 118, 73, 90, 686, 327, 233, 331, 206, 94, 60, 324, 100, 143, 3039, 61, 348, 128, 70, 90, 282, 658, 232, 103, 923, 218, 365, 60, 72, 102, 1157, 124, 99, 220, 72, 55, 216, 1483, 100, 269, 327, 96, 387, 10, 487, 84, 52, 100, 272, 21, 92, 240, 327, 457, 1693, 48, 100, 207, 1

In [6]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)#limited to get maximum valued 5000 features
X_vec = vectorizer.fit_transform(R_X)

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42,stratify=y)#startify since data are ordered

# Train Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       872
           1       0.96      0.96      0.96       274

    accuracy                           0.98      1146
   macro avg       0.97      0.97      0.97      1146
weighted avg       0.98      0.98      0.98      1146

