In [32]:
from nltk.corpus import wordnet

# Test to see if wordnet can find synonyms for a word
synonyms = wordnet.synsets("test")
print(synonyms)


[Synset('trial.n.02'), Synset('test.n.02'), Synset('examination.n.02'), Synset('test.n.04'), Synset('test.n.05'), Synset('test.n.06'), Synset('test.v.01'), Synset('screen.v.01'), Synset('quiz.v.01'), Synset('test.v.04'), Synset('test.v.05'), Synset('test.v.06'), Synset('test.v.07')]


In [33]:
import os
import numpy as np
import pandas as pd
from collections import Counter
import re
# from nltk.corpus import wordnet
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# import enchant
# from nltk.tokenize import word_tokenize
import string
import nltk
import spacy

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")
# english_dict = enchant.Dict("en_US")
# def lemmatize_words(text):
#     doc = nlp(text)
#     return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token) > 2]

def lemmatize_and_filter_words(text):
    doc = nlp(text)
    meaningful_words = []
    non_english_words_count = 0
    
    for token in doc:
        # Skip stop words, non-alphabetic tokens, and single-letter tokens
        if token.is_stop or not token.is_alpha or len(token) <= 2:
            continue
        lemma = token.lemma_.lower()
        # Check if word has meaning in English (using WordNet)
        if lemma in ENGLISH_STOP_WORDS or wordnet.synsets(lemma):
#         if lemma in ENGLISH_STOP_WORDS or english_dict.check(lemma):
            meaningful_words.append(lemma)
        else:
            non_english_words_count += 1  # Increment count for non-meaningful words
    
    return meaningful_words, non_english_words_count

def extract_words(text):
#     stop_words = set(stopwords.words('english'))
    # Remove punctuation

    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize and convert to lowercase
#     text = re.findall(r'\b\w+\b', text.lower())
#     words= word_tokenize(text)
#     # Remove stop words
#     words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
#     words = [word for word in words if word not in stop_words]
#     words = lemmatize_words(text)
#     words = [word for word in words if word not in ENGLISH_STOP_WORDS]

    words, non_english_word_count= lemmatize_and_filter_words(text)

    word_counts = Counter(words)
    return word_counts, non_english_word_count

def process_emails(folder_path):
    emails_data = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            words_counts, non_english_words_count = extract_words(content)
            words_counts["non_english_words_count"] = non_english_words_count
            emails_data.append(words_counts)
    df = pd.DataFrame(emails_data).fillna(0)
#     df = df.fillna(0)
    return df

spam_folder = r'archive/enron1/spam'
ham_folder = r'archive/enron1/ham'
spam_df = process_emails(spam_folder)
spam_df['Label'] = 1
ham_df = process_emails(ham_folder)
ham_df['Label'] = 0
combined_df = pd.concat([spam_df, ham_df], ignore_index=True)

In [34]:
combined_df

Unnamed: 0,subject,energy,level,go,introduce,doctor,formulate,human,growth,hormone,...,amortize,ordinate,dismiss,distraction,bidder,bidding,subcommittee,stengel,representatives,owes
0,1,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,...,,,,,,,,,,
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5168,1,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5169,2,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5170,2,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
column_names = combined_df.columns.tolist()

spam_dict = {col: 0.05 for col in column_names}
ham_dict = {col: 0.05 for col in column_names}
column_names= [col for col in column_names if col!="Label"]


In [36]:
column_names

['subject',
 'energy',
 'level',
 'go',
 'introduce',
 'doctor',
 'formulate',
 'human',
 'growth',
 'hormone',
 'call',
 'refer',
 'medical',
 'science',
 'master',
 'plentiful',
 'young',
 'near',
 'age',
 'body',
 'begin',
 'produce',
 'time',
 'nearly',
 'deficient',
 'eighty',
 'production',
 'normally',
 'diminish',
 'advantage',
 'increase',
 'muscle',
 'strength',
 'loss',
 'fat',
 'bone',
 'density',
 'low',
 'blood',
 'pressure',
 'quicken',
 'wound',
 'healing',
 'reduce',
 'cellulite',
 'improved',
 'vision',
 'wrinkle',
 'disappearance',
 'skin',
 'thickness',
 'texture',
 'improve',
 'sleep',
 'emotional',
 'stability',
 'memory',
 'mental',
 'alertness',
 'sexual',
 'potency',
 'resistance',
 'common',
 'illness',
 'strengthen',
 'heart',
 'control',
 'cholesterol',
 'mood',
 'swing',
 'new',
 'hair',
 'color',
 'restore',
 'read',
 'website',
 'non_english_words_count',
 'prescription',
 'ready',
 'cost',
 'medication',
 'soma',
 'prescribe',
 'online',
 'ship',
 'overn

In [37]:
len(column_names)

19190

In [38]:
count={col: 0 for col in column_names}
for i, row in combined_df.iterrows():
    if row["Label"]==1:
    
        for word in column_names:
            if row[word] >0:
                spam_dict[word] = spam_dict[word]+ row[word]
                count[word] = count.get(word, 0) + row[word]
#                 print(spam_dict[word], word)
    else:
        for word in column_names:
            if row[word] >0:
                ham_dict[word] = ham_dict[word]+ row[word]
                count[word] = count.get(word, 0) + row[word]
#                 print(ham_dict[word], word)
        

In [39]:
# P(spam/ word) = P(word/ spam)*P(spam) / P(word)
#

In [56]:
P_spam= len(spam_df)/(len(spam_df)+ len(ham_df))

total_words= sum(count.values())
total_spam_count = sum(spam_dict.values())
total_ham_count = sum(ham_dict.values())

P_word_spam = {word: (spam_dict[word]/total_spam_count) * P_spam/(count[word]/total_words) for word in column_names}
P_word_ham = {word: (ham_dict[word]/total_ham_count) * (1- P_spam)/(count[word]/total_words) for word in column_names}


In [60]:
print(P_word_spam)



In [58]:
import json

with open('ham_words_probab.json', "w") as file:
    json.dump(P_word_ham, file)

print("Dictionary saved to", 'ham_words_probab')

with open('spam_words_probab.json', "w") as file:
    json.dump(P_word_spam, file)

print("Dictionary saved to", 'spam_words_probab')

Dictionary saved to ham_words_probab
Dictionary saved to spam_words_probab


In [42]:
test_spam_path= r'archive/enron2/spam'
test_ham_path= r'archive/enron2/ham'
test_spam_df = process_emails(test_spam_path)
test_ham_df = process_emails(test_ham_path)

test_spam_df['Label'] = 1
test_ham_df['Label'] = 0
combined_test_df = pd.concat([test_spam_df, test_ham_df], ignore_index=True)

In [61]:
from sklearn.metrics import accuracy_score, f1_score
def is_spam(df,P_word_spam, P_word_ham, threshold):
    column_names_test = df.columns.tolist()
    training_words = set(P_word_spam.keys()).union(set(P_word_ham.keys()))
    spam_scores, ham_scores=[],[]
    for i, row in df.iterrows():
    
        spam_score, ham_score=0,0
        for word in training_words:
            # if word in P_word_spam:
            #     spam_score += np.log(P_word_spam[word]) * row[word]
            # else:
            #     print(f"Current count value: {count}, type: {type(count)}")
            #     spam_score += np.log(1e-6) *row[count]
            # if word in P_word_ham:
            #     ham_score+= np.log(P_word_ham[word]) * row[word]
            # else:
            #     ham_score += np.log(1e-6) *row[count[word]]
            word_count= row.get(word,0)
            if word_count>0:
                if word in P_word_spam:
                    spam_score += word_count * np.log(P_word_spam[word])
                if word in P_word_ham:
                    ham_score += word_count * np.log(P_word_ham[word])
            else:
                if word in P_word_spam:
                    spam_score += word_count*np.log(1- P_word_spam[word])
                if word in P_word_ham:
                    ham_score += word_count*np.log(1 - P_word_ham[word])
    
        spam_scores.append(spam_score)
        ham_scores.append(ham_score)
    classifications = [1 if spam_score > ham_score + threshold else 0 for spam_score, ham_score in zip(spam_scores, ham_scores)]
    accuracy= accuracy_score(combined_test_df["Label"].tolist(), classifications)
    f1= f1_score(combined_test_df["Label"].tolist(), classifications, average= 'micro')
    print(accuracy, f1)
    return spam_scores, ham_score, classifications

In [62]:
spam_scores, ham_scores, classifications =is_spam(combined_test_df,P_word_spam, P_word_ham, 0 )

  ham_score += word_count*np.log(1 - P_word_ham[word])


0.7445791360764896 0.7445791360764896


In [None]:
# for i in range(len(spam_score)):
#     true=0
#     equal=0
#     if spam_scores[i]>ham_scores[i]:
#         true+=1
#     elif spam_scores[i]=ham_scores[i]:
#         equal+=1

In [None]:
# print(true, equal)