In [79]:
from nltk.corpus import wordnet

# Sanity check that the WordNet corpus is available: look up the synsets
# (WordNet's groups of synonymous word senses) registered for "test" and
# print them. Note that `synonyms` therefore holds Synset objects, not words.
synonyms = wordnet.synsets("test")
print(synonyms)


[Synset('trial.n.02'), Synset('test.n.02'), Synset('examination.n.02'), Synset('test.n.04'), Synset('test.n.05'), Synset('test.n.06'), Synset('test.v.01'), Synset('screen.v.01'), Synset('quiz.v.01'), Synset('test.v.04'), Synset('test.v.05'), Synset('test.v.06'), Synset('test.v.07')]


In [None]:
import os
import numpy as np
import pandas as pd
from collections import Counter
import re
# from nltk.corpus import wordnet
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# import enchant
# from nltk.tokenize import word_tokenize
import string
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# spaCy English pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")

# scikit-learn's English stop words plus email pleasantries that carry no
# spam/ham signal.
custom_stop_words = ENGLISH_STOP_WORDS.union({
    "please", "regards", "thank", "thanks",
    "good", "morning", "afternoon", "email",
})

# Words whose accumulated weight is counted twice for the matching class.
# NOTE(review): multi-word or hyphenated entries ("buy now",
# "limited time offer", "follow-up") are compared against single feature
# names later on and presumably never match - confirm.
domain_specific_spam_words = [
    "urgent", "bonus", "unsubscribe", "winner", "claim", "discount",
    "buy now", "free", "limited time offer", "sold", "click",
]
domain_specific_ham_words = [
    "meeting", "schedule", "project", "client", "presentation", "follow-up",
    "update", "pfa", "attached", "agreement", "termsheet",
]

def lemmatize_and_filter_words(text):
    """Return the lowercased lemmas of the informative tokens in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only when it is not a spaCy stop word, is purely alphabetic and
    is longer than two characters; its lowercased lemma must additionally be
    absent from ``custom_stop_words``.

    Parameters
    ----------
    text : str
        Raw (or pre-cleaned) text to analyse.

    Returns
    -------
    list of str
        Lemmas in document order; duplicates are preserved.
    """
    candidate_lemmas = (
        tok.lemma_.lower()
        for tok in nlp(text)
        if not tok.is_stop and tok.is_alpha and len(tok) > 2
    )
    return [lemma for lemma in candidate_lemmas if lemma not in custom_stop_words]

def extract_words(text):
    """Normalise ``text`` into a space-separated string of lemmas.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and the remainder is passed through
    ``lemmatize_and_filter_words``.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" when none survive).
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return ' '.join(lemmatize_and_filter_words(without_punctuation))

def process_emails_text(folder_path):
    """Read every file in ``folder_path`` and return cleaned text plus a punctuation ratio.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are each opened as a UTF-8 text file
        (undecodable bytes are ignored).

    Returns
    -------
    emails_data : list of str
        One cleaned, lemmatised string per file, as produced by
        ``extract_words``.
    punc : list of float
        For each file, the number of ``string.punctuation`` characters in the
        raw content divided by the length (in characters) of the cleaned
        string. The ratio is 0.0 when the cleaned string is empty.
    """
    emails_data = []
    punc = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()

        punctuation_count = sum(1 for char in content if char in string.punctuation)
        words = extract_words(content)

        # `words` is a single joined string, so len(words) counts characters.
        # An email with no surviving lemmas yields "", which previously raised
        # ZeroDivisionError and aborted the whole folder.
        punc.append(punctuation_count / len(words) if words else 0.0)
        emails_data.append(words)
    return emails_data, punc

def _build_feature_frame(spam_folder, ham_folder, vectorizer, fit):
    """Vectorise the emails of both folders into one labelled float32 DataFrame.

    Shared implementation of ``process_train_emails`` and
    ``process_test_emails``; the two differ only in whether the vectorizer is
    fitted on the data (``fit=True``) or merely applied (``fit=False``).
    Spam rows come first, followed by ham rows.
    """
    spam_emails, spam_punc = process_emails_text(spam_folder)
    ham_emails, ham_punc = process_emails_text(ham_folder)
    combined_emails = spam_emails + ham_emails

    vectorize = vectorizer.fit_transform if fit else vectorizer.transform
    X_vec = vectorize(combined_emails)

    # Dense TF-IDF matrix with one column per vocabulary term.
    df = pd.DataFrame(
        X_vec.astype(np.float32).toarray(),
        columns=vectorizer.get_feature_names_out(),
    ).fillna(0)
    df['punctuation_percent'] = spam_punc + ham_punc
    # Labels: 1 = spam, 0 = ham, in the same order as the rows.
    df['Label'] = [1] * len(spam_emails) + [0] * len(ham_emails)
    return df.astype('float32')


def process_train_emails(spam_folder, ham_folder, vectorizer):
    """Fit ``vectorizer`` on the training emails and return their feature frame.

    Parameters
    ----------
    spam_folder, ham_folder : str
        Directories holding the spam and ham training emails.
    vectorizer : object
        Vectorizer exposing ``fit_transform`` and ``get_feature_names_out``;
        it is fitted in place.

    Returns
    -------
    pandas.DataFrame
        float32 frame with one column per vocabulary term plus
        ``punctuation_percent`` and ``Label`` (1 = spam, 0 = ham).
    """
    return _build_feature_frame(spam_folder, ham_folder, vectorizer, fit=True)


def process_test_emails(spam_folder, ham_folder, vectorizer):
    """Transform held-out emails with an already fitted ``vectorizer``.

    Parameters
    ----------
    spam_folder, ham_folder : str
        Directories holding the spam and ham evaluation emails.
    vectorizer : object
        Fitted vectorizer exposing ``transform`` and
        ``get_feature_names_out``; it is not refitted.

    Returns
    -------
    pandas.DataFrame
        float32 frame with the same column layout as the one returned by
        ``process_train_emails``.
    """
    return _build_feature_frame(spam_folder, ham_folder, vectorizer, fit=False)
    
# TF-IDF vectorizer: drop terms present in fewer than 0.09% or more than 90%
# of the training emails, and ignore the custom stop-word list.
vectorizer = TfidfVectorizer(
    min_df=0.0009,
    max_df=0.9,
    stop_words=list(custom_stop_words),
)

# Training corpus locations (relative to the notebook's working directory).
spam_folder = r'archive/enron1/spam'
ham_folder = r'archive/enron1/ham'

# Fits the vectorizer and builds the labelled training frame.
combined_df = process_train_emails(
    spam_folder=spam_folder,
    ham_folder=ham_folder,
    vectorizer=vectorizer,
)


In [81]:
import pickle
# Persist the fitted vectorizer so it can be reused without refitting.
# The confirmation message is built from the same path that is written to;
# it previously reported a file name that was never created.
vectorizer_path = '3.pkl'
with open(vectorizer_path, 'wb') as file:
    pickle.dump(vectorizer, file)
print("Vectorizer saved to", vectorizer_path)

Vectorizer saved to tfidf_vectorizer.pkl


In [82]:
# Shape of the training frame as (rows, columns): one row per email, one
# column per vocabulary term plus `punctuation_percent` and `Label`.
combined_df.shape

(10684, 8334)

In [83]:
# All column labels of the training frame (features and "Label") as a list.
column_names = list(combined_df.columns)

In [84]:
# Feature columns only. "Label" is the target, not a vocabulary entry, so it
# is removed *before* the dictionaries are built; previously it received its
# own 0.1 pseudo-count, which leaked into the class totals used for
# normalisation.
# NOTE(review): `punctuation_percent` is still treated like a word here -
# confirm that this is intended.
column_names = [col for col in combined_df.columns.tolist() if col != "Label"]

# Every feature starts at 0.1 (additive smoothing) so that no word ends up
# with a zero probability for either class.
spam_dict = {col: 0.1 for col in column_names}
ham_dict = {col: 0.1 for col in column_names}


In [85]:
# count={col: 0 for col in column_names}
# Add each feature's positive weight to its class total: rows with Label == 1
# count towards spam, all other rows towards ham. Domain-specific words are
# counted twice for their own class.
#
# The sums are taken column-wise with NumPy instead of walking every
# (row, word) cell through `iterrows`, which needed rows x vocabulary Python
# lookups. Totals are identical up to floating-point summation order.
spam_rows = (combined_df["Label"] == 1).to_numpy()
spam_boost_words = set(domain_specific_spam_words)
ham_boost_words = set(domain_specific_ham_words)

for word in column_names:
    weights = combined_df[word].to_numpy(dtype=np.float64)
    # Only strictly positive entries contribute (same as `row[word] > 0`).
    weights = np.where(weights > 0, weights, 0.0)

    spam_total = float(weights[spam_rows].sum())
    ham_total = float(weights[~spam_rows].sum())

    spam_dict[word] += spam_total * (2 if word in spam_boost_words else 1)
    ham_dict[word] += ham_total * (2 if word in ham_boost_words else 1)
        
# for i, row in combined_df.iterrows():
#     for word in column_names:
#         if row[word]>0:
#             word_count = row[word]
#             # count[word] += word_count
#             if row["Label"] == 1:  # Spam
#                 spam_dict[word] += word_count
#             else:  # Ham
#                 ham_dict[word] += word_count

In [86]:
# Bayes' rule: P(spam | word) = P(word | spam) * P(spam) / P(word)
#

In [87]:
# Prior probability of spam: the fraction of training rows labelled 1.
P_spam = float((combined_df["Label"] == 1).mean())
vocabulary_size = len(column_names)
total_words = len(column_names)

# Normalise over the feature columns only, so that each conditional
# distribution sums to 1 even if the dictionaries carry extra keys
# (e.g. a smoothed "Label" entry).
total_spam_count = float(sum(spam_dict[word] for word in column_names))
total_ham_count = float(sum(ham_dict[word] for word in column_names))

# P(word | class). Values are plain Python floats so the dictionaries stay
# JSON-serialisable regardless of the NumPy scalar type that was accumulated.
P_word_spam = {word: float(spam_dict[word]) / total_spam_count for word in column_names}
P_word_ham = {word: float(ham_dict[word]) / total_ham_count for word in column_names}


In [89]:
import json

# Write both conditional-probability tables to JSON. Each confirmation message
# reports the path that was actually written; previously the printed names
# did not match the files on disk.
ham_probab_path = 'ham_words_probab3.json'
spam_probab_path = 'spam_words_probab3.json'

with open(ham_probab_path, "w") as file:
    json.dump(P_word_ham, file)

print("Dictionary saved to", ham_probab_path)

with open(spam_probab_path, "w") as file:
    json.dump(P_word_spam, file)

print("Dictionary saved to", spam_probab_path)

Dictionary saved to ham_words_probab
Dictionary saved to spam_words_probab


In [90]:
# Held-out evaluation corpus (a different Enron split than the training data).
test_spam_path = r'archive/enron2/spam'
test_ham_path = r'archive/enron2/ham'

# Reuse the already fitted vectorizer so the columns match the training frame.
combined_test_df = process_test_emails(
    spam_folder=test_spam_path,
    ham_folder=test_ham_path,
    vectorizer=vectorizer,
)

In [92]:
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
def is_spam(df, P_word_spam, P_word_ham, threshold, p_spam=None):
    """Score every row of ``df`` with the naive-Bayes model and report metrics.

    For each row the spam score is ``log(p_spam)`` plus the sum over words of
    ``weight * log(P_word_spam[word])``, and likewise for ham with
    ``1 - p_spam`` and ``P_word_ham``. Only strictly positive weights
    contribute, and only words that are columns of ``df``. A row is
    classified as spam when ``spam_score > ham_score + threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per email, with per-word weights and a ``Label`` column
        (1 = spam, 0 = ham) used as ground truth for the metrics.
    P_word_spam, P_word_ham : dict
        Word -> conditional probability for each class. Probabilities are
        expected to be strictly positive (the smoothed training step
        guarantees this); a zero probability would produce non-finite scores.
    threshold : float
        Margin by which the spam score must exceed the ham score.
    p_spam : float, optional
        Prior probability of spam. Defaults to the module-level ``P_spam``.

    Returns
    -------
    spam_scores : list of float
    ham_scores : list of float
    classifications : list of int
        Per-row scores and predicted labels, in row order.

    Notes
    -----
    Fixes over the previous version: the second return value is the full
    list of ham scores (it used to be only the last row's scalar), and the
    metrics are computed against ``df["Label"]`` instead of a global frame.
    Scoring is done with one matrix-vector product per class rather than a
    Python loop over every (row, word) pair.
    """
    if p_spam is None:
        p_spam = P_spam  # prior estimated from the training labels

    # Words missing from `df` had weight 0 via `row.get(word, 0)`, so only
    # columns present in the frame can contribute.
    scored_words = [col for col in df.columns if col in P_word_spam or col in P_word_ham]

    # A word known to only one class contributes nothing (log-weight 0) to the other.
    spam_log_probs = np.array(
        [np.log(P_word_spam[w]) if w in P_word_spam else 0.0 for w in scored_words],
        dtype=np.float64,
    )
    ham_log_probs = np.array(
        [np.log(P_word_ham[w]) if w in P_word_ham else 0.0 for w in scored_words],
        dtype=np.float64,
    )

    # Explicit copy so zeroing non-positive entries never touches `df`.
    weights = df[scored_words].to_numpy(dtype=np.float64, copy=True)
    weights[~(weights > 0)] = 0.0

    spam_scores = np.log(p_spam) + weights @ spam_log_probs
    ham_scores = np.log(1 - p_spam) + weights @ ham_log_probs
    classifications = (spam_scores > ham_scores + threshold).astype(int).tolist()

    true_labels = df["Label"].tolist()
    accuracy = accuracy_score(true_labels, classifications)
    f1 = f1_score(true_labels, classifications, average='micro')
    bal = balanced_accuracy_score(true_labels, classifications)
    print('Accuracy is',accuracy,',f1 micro score is',f1,' and Balanced Accuracy is', bal)
    return spam_scores.tolist(), ham_scores.tolist(), classifications

In [93]:
# Evaluate on the held-out set with no extra margin in favour of ham.
spam_scores_1, ham_scores_1, classifications_1 = is_spam(
    combined_test_df,
    P_word_spam,
    P_word_ham,
    threshold=0,
)

Accuracy is 0.9682431278811678 ,f1 micro score is 0.9682431278811678  and Balanced Accuracy is 0.9635241481679495
