In [1]:
import json
import os
import pickle
import numpy as np
import pandas as pd
from collections import Counter
import re
# from nltk.corpus import wordnet
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# import enchant
# from nltk.tokenize import word_tokenize
import string
import nltk
import spacy
# from nltk.corpus import wordnet
nlp = spacy.load("en_core_web_sm")

# Step 1: Load the per-word probability dictionaries and the pickled vectorizer.
# The encoding is stated explicitly so that reading the JSON files does not depend
# on the platform's locale default.
with open('ham_words_probab3.json', "r", encoding="utf-8") as file:
    P_word_ham = json.load(file)

with open('spam_words_probab3.json', "r", encoding="utf-8") as file:
    P_word_spam = json.load(file)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a 'tfidf3.pkl' that comes from a trusted source.
with open('tfidf3.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
print("Vectorizer loaded successfully.")

print("Dictionaries loaded successfully.")

Vectorizer loaded successfully.
Dictionaries loaded successfully.


In [24]:

def lemmatize_and_filter_words(text):
    """Return the lowercased lemmas of the tokens of ``text`` that pass the filters.

    ``text`` is run through the module-level spaCy pipeline ``nlp``. A token is
    kept only when it is not a stop word, is purely alphabetic, and is longer
    than two characters; each kept token contributes its lemma in lowercase.

    Args:
        text: String to analyse.

    Returns:
        list[str]: Lowercased lemmas, in the order the tokens occur in ``text``.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        # Drop stop words, non-alphabetic tokens and tokens of one or two characters.
        if not token.is_stop and token.is_alpha and len(token) > 2
    ]

def extract_words(text):
    """Normalise ``text`` into one space-separated string of lemmas.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted (not replaced by a space), and the remainder is
    passed to ``lemmatize_and_filter_words``.

    Args:
        text: Raw text, e.g. the full content of an e-mail.

    Returns:
        str: The kept lemmas joined by single spaces; an empty string when no
        token is kept.
    """
    lowered = text.lower()
    # Punctuation is removed outright, so "a,b" becomes "ab".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    lemmas = lemmatize_and_filter_words(without_punctuation)
    return ' '.join(lemmas)


# def process_emails_text(folder_path):
#     emails_data = []
#     for filename in os.listdir(folder_path):
#         with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
#             content = file.read()
#             words = extract_words(content)
            
#             emails_data.append(words)
#     return emails_data




def process_emails_text(folder_path):
    """Read every file in ``folder_path`` and turn it into cleaned text plus a punctuation ratio.

    Files are visited in sorted filename order (plain lexicographic sort), so
    position ``i`` in both returned lists always refers to the same file from
    one run to the next. Entries that are not regular files are skipped.

    Args:
        folder_path: Directory containing one e-mail per file. Files are decoded
            as UTF-8 and undecodable bytes are ignored.

    Returns:
        tuple[list[str], list[float]]:
            * the ``extract_words`` output for each file (a space-joined string);
            * for each file, the number of ``string.punctuation`` characters in
              the raw content divided by ``len`` of that cleaned string, or
              ``0.0`` when the cleaned string is empty.

    NOTE(review): the denominator is the character length of the joined string,
    not a word count, because ``extract_words`` returns a string. It is kept
    as-is so the feature keeps the same definition; confirm this is intended.
    """
    punctuation_chars = set(string.punctuation)
    emails_data = []
    punc = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # row-to-file correspondence reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # A sub-directory cannot be opened as an e-mail.
            continue
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()

        punctuation_count = sum(1 for char in content if char in punctuation_chars)
        words = extract_words(content)
        # An e-mail with no surviving tokens yields '', which used to raise
        # ZeroDivisionError here.
        punc.append(punctuation_count / len(words) if words else 0.0)
        emails_data.append(words)
    return emails_data, punc

def process_test_emails(spam_folder, vectorizer):
    """Build the float32 feature frame for every e-mail found in ``spam_folder``.

    Args:
        spam_folder: Directory handed to ``process_emails_text``.
        vectorizer: Object exposing ``transform`` and ``get_feature_names_out``
            (presumably the fitted TF-IDF vectorizer loaded from 'tfidf3.pkl').

    Returns:
        pandas.DataFrame: One row per e-mail and one column per vectorizer
        feature, plus a ``punctuation_percent`` column, all cast to float32.
    """
    cleaned_texts, punctuation_ratios = process_emails_text(spam_folder)

    # Densify the vectorizer output so every feature becomes a named column.
    feature_matrix = vectorizer.transform(cleaned_texts)
    feature_names = vectorizer.get_feature_names_out()
    features = pd.DataFrame(feature_matrix.toarray(), columns=feature_names)
    features = features.fillna(0)

    features['punctuation_percent'] = punctuation_ratios
    return features.astype('float32')

   

In [25]:
def is_spam(df, P_word_spam, P_word_ham, threshold, P_spam=0.39):
    """Score every row of ``df`` as spam or ham with a log-space Naive Bayes rule.

    Each score starts at the log prior of its class. For every column of ``df``
    that is a key of at least one dictionary and whose value in the row is
    positive, ``value * log(P(word | class))`` is added to the score of each
    class whose dictionary contains the word. Words with a non-positive value,
    and columns unknown to both dictionaries, contribute nothing.

    Args:
        df: DataFrame with one row per document; columns are named by word and
            hold that word's weight in the document.
        P_word_spam: Mapping ``word -> probability`` for the spam class.
        P_word_ham: Mapping ``word -> probability`` for the ham class.
        threshold: Margin by which the spam score must exceed the ham score
            for a row to be labelled spam.
        P_spam: Prior probability of spam; should lie strictly between 0 and 1.
            Defaults to 0.39, the value previously hard-coded here.

    Returns:
        tuple[list, list, list[int]]: ``(spam_scores, ham_scores,
        classifications)``, each with one entry per row of ``df``;
        a classification is 1 for spam and 0 for ham.
    """
    # Resolve once which columns can contribute at all, instead of walking the
    # whole training vocabulary for every row. Column order also makes the
    # summation order deterministic.
    scored_words = [
        word for word in df.columns
        if word in P_word_spam or word in P_word_ham
    ]
    log_prior_spam = np.log(P_spam)
    log_prior_ham = np.log(1 - P_spam)

    spam_scores, ham_scores = [], []
    for _, row in df.iterrows():
        spam_score, ham_score = log_prior_spam, log_prior_ham
        for word in scored_words:
            word_count = row[word]
            if word_count > 0:
                if word in P_word_spam:
                    spam_score += word_count * np.log(P_word_spam[word])
                if word in P_word_ham:
                    ham_score += word_count * np.log(P_word_ham[word])
        spam_scores.append(spam_score)
        ham_scores.append(ham_score)

    classifications = [
        1 if spam_score > ham_score + threshold else 0
        for spam_score, ham_score in zip(spam_scores, ham_scores)
    ]

    # Return the full list of ham scores (previously only the last row's scalar
    # was returned, and an empty frame raised NameError).
    return spam_scores, ham_scores, classifications

In [None]:
# Folder whose files are classified below.
test_spam_path = 'test'

# NOTE(review): neither of these two names is read by any cell visible here;
# they are kept in case another cell relies on them.
err = 0
emails_data = []

# One row per e-mail file: vectorizer features plus 'punctuation_percent'.
test_df = process_test_emails(test_spam_path, vectorizer)


In [30]:
# Threshold 0: a row is labelled spam as soon as its spam log-score exceeds its
# ham log-score. Only the labels (third element of the result) are needed here;
# indexing instead of unpacking into `_` leaves IPython's `_` last-output
# variable untouched.
classifications = is_spam(test_df, P_word_spam, P_word_ham, 0)[2]

In [31]:
# Show the predicted label of every row of test_df (1 = spam, 0 = ham).
print(classifications)

[0, 1, 0, 1]


In [32]:
# Wrap the list of 0/1 labels in a single-column frame ...
df_out = pd.DataFrame(data=classifications)

# ... and write it, with the default index and header, to the working directory.
df_out.to_csv(path_or_buf='predictions.csv')