In [None]:
import os
import re
import math
import random
import numpy as np
import pandas as pd
from math import log
from math import sqrt
import cv2
import numpy as np
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow


In [None]:
# Collect the subject line of every numbered email file ("1" .. "99") that can
# be opened relative to the current working directory.
subjects = []
for i in range(1, 100):
    file_name = str(i)
    try:
        with open(file_name) as file:
            lines = file.readlines()
        sub = None
        for line in lines:
            # Blank lines cannot carry a header; move on.
            if not line.strip():
                continue
            words = line.split(' ', 1)
            # Only a line whose first space-delimited token is exactly
            # 'Subject:' counts; the first such line wins.
            if words[0] == 'Subject:':
                sub = words[1].strip()
                break
        # Files with no subject (or an empty one) contribute nothing.
        if sub:
            subjects.append(sub)
    except FileNotFoundError:
        print(f"File {i} not found.")
    except Exception as e:
        print(f"Error processing file {i}: {str(e)}")

# The number printed here counts collected subjects; it is not the file name.
for i, subject in enumerate(subjects, 1):
    print(f"Subject of Email {i}: {subject}")

In [None]:
def readAllEmails(path):
    """Load every email file in ``path`` into a ``{subject: body}`` dict.

    Args:
        path: Directory whose regular files are read as UTF-8 text.

    Returns:
        dict mapping the text that follows the first ``"Subject: "`` in a file
        to that file's content with that one header occurrence removed.
        Files without a subject are skipped. Keys are inserted in sorted
        filename order. Two files with the same subject share a key, so the
        later one (in sorted filename order) overwrites the earlier one.

    Raises:
        OSError: if ``path`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    email_dic = {}
    # os.listdir gives no ordering guarantee; sorting makes the key order
    # reproducible for code that indexes the keys by position.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        # Sub-directories and other non-files cannot be opened as emails.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        header = re.search(r'Subject: (.+)', content)
        if header is None:
            continue
        subject = header.group(1)
        # Cut out only the matched header. str.replace would also delete any
        # identical "Subject: ..." text quoted later in the body.
        email_dic[subject] = content[:header.start()] + content[header.end():]
    return email_dic

In [None]:
def getRandomSubject(email_dic):
    """Pick one key of ``email_dic`` uniformly at random.

    Args:
        email_dic: Mapping whose keys are the candidate subjects.

    Returns:
        One of the keys, chosen with ``random.choice``.

    Raises:
        IndexError: if ``email_dic`` is empty.
    """
    subject_pool = list(email_dic.keys())
    chosen = random.choice(subject_pool)
    return chosen

In [None]:
def to_lower(sample):
    """Return ``sample`` converted to lower case."""
    return sample.lower()


def remove_stop_words(sample, stop_words):
    """Drop every whitespace-separated token of ``sample`` found in ``stop_words``.

    Args:
        sample: Text to filter.
        stop_words: Container tested with ``in`` for each token.

    Returns:
        The surviving tokens joined by single spaces.
    """
    words = sample.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)


def normalize_data(sample, stop_words):
    """Lower-case ``sample``, strip punctuation, then remove stop words.

    Args:
        sample: Raw text.
        stop_words: Container of lower-case stop words.

    Returns:
        The normalized text, tokens separated by single spaces.
    """
    sample = to_lower(sample)
    # Punctuation has to go before the stop-word filter: otherwise a token
    # such as "the." or "the," is not recognised as the stop word "the" and
    # survives as "the" once the punctuation is stripped afterwards.
    sample = re.sub(r'[\[\]\(\)\.,;!?]', '', sample)
    sample = remove_stop_words(sample, stop_words)
    return sample
def unique_words(sample):
    """Return the distinct whitespace-separated tokens of ``sample``.

    Args:
        sample: Text to tokenize with ``str.split()``.

    Returns:
        list of the distinct tokens; the order is that of the intermediate
        set and therefore unspecified.
    """
    distinct_tokens = set(sample.split())
    return list(distinct_tokens)

In [None]:
def bag_of_words(sample_list):
    all_unique_words = set()
    for sample in sample_list:
        words = unique_words(sample)
        all_unique_words.update(words)
    return list(all_unique_words)
def tf_idf(sample_list, bagOfWords):
    """Compute TF-IDF weights of every vocabulary word for every text.

    For text ``i`` and word ``w``:
    ``weight = (count of w in text i / tokens in text i) * log(D / df(w))``
    where ``D`` is the number of texts and ``df(w)`` the number of texts that
    contain ``w`` as a whole whitespace-separated token.

    Args:
        sample_list: Sequence of texts (strings), tokenized with ``str.split()``.
        bagOfWords: Iterable of vocabulary words.

    Returns:
        list with one ``{word: weight}`` dict per text, keyed in
        ``bagOfWords`` order. A text with no tokens, or a word that occurs in
        no text, gets weight ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    D = len(sample_list)
    tokenized = [sample.split() for sample in sample_list]

    # Document frequency does not depend on the text being weighted, so it is
    # computed once. Membership is tested on token sets: testing against the
    # raw string would be a substring match ("he" would count in "the").
    token_sets = [set(tokens) for tokens in tokenized]
    df = {
        word: sum(1 for tokens in token_sets if word in tokens)
        for word in bagOfWords
    }
    idf = {word: math.log(D / df[word]) if df[word] else 0.0 for word in bagOfWords}

    w_ij = []
    for tokens in tokenized:
        # Single pass over the text instead of one list.count per vocabulary word.
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        n_tokens = len(tokens)
        w_i = {
            word: (counts.get(word, 0) / n_tokens if n_tokens else 0.0) * idf[word]
            for word in bagOfWords
        }
        w_ij.append(w_i)
    return w_ij

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two weight vectors.

    Args:
        v1, v2: Either two sequences of numbers, compared position by
            position, or two dicts mapping term -> number, compared term by
            term (a term missing from one dict counts as 0 there).

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero magnitude.
    """
    if isinstance(v1, dict) and isinstance(v2, dict):
        # Iterating a dict yields its keys, so zipping two weight dicts would
        # pair up term strings rather than weights. Turn both into numeric
        # vectors aligned on the same terms first.
        terms = list(dict.fromkeys([*v1, *v2]))
        v1 = [v1.get(term, 0.0) for term in terms]
        v2 = [v2.get(term, 0.0) for term in terms]

    dot_product = sum(x * y for x, y in zip(v1, v2))
    magnitude_v1 = math.sqrt(sum(x ** 2 for x in v1))
    magnitude_v2 = math.sqrt(sum(x ** 2 for x in v2))
    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero.
    if magnitude_v1 == 0 or magnitude_v2 == 0:
        return 0.0
    return dot_product / (magnitude_v1 * magnitude_v2)
def rank(w_ij):
    """Rank every entry of ``w_ij`` after the first by similarity to the first.

    Args:
        w_ij: Sequence of weight vectors; entry 0 is the query. Entries may
            be ``{term: weight}`` dicts or plain numeric sequences.

    Returns:
        dict mapping each index ``1 .. len(w_ij) - 1`` to its cosine
        similarity with entry 0, ordered from most to least similar. Empty
        when there are fewer than two entries.
    """
    if len(w_ij) < 2:
        return {}

    query = w_ij[0]
    rank_dic = {}
    for i in range(1, len(w_ij)):
        candidate = w_ij[i]
        if isinstance(query, dict) and isinstance(candidate, dict):
            # Hand cosine_similarity plain lists of numbers aligned on the
            # same terms, so it never has to iterate a dict (which would
            # yield the string keys, not the weights).
            terms = list(dict.fromkeys([*query, *candidate]))
            v1 = [query.get(term, 0.0) for term in terms]
            v2 = [candidate.get(term, 0.0) for term in terms]
        else:
            v1, v2 = query, candidate
        rank_dic[i] = cosine_similarity(v1, v2)
    return dict(sorted(rank_dic.items(), key=lambda item: item[1], reverse=True))
def show_result(rank_dic, n_mails, email_dic):
    """Print the first subject in ``email_dic`` and the subjects of the top matches.

    Args:
        rank_dic: Mapping whose keys are integer positions, iterated in the
            order they should be reported (best first).
        n_mails: Maximum number of matches to print.
        email_dic: Mapping whose keys are subjects. Position ``i`` in its key
            order is printed for rank key ``i`` and position 0 is printed as
            the reference email.
            NOTE(review): this presumes the key order matches the order of
            the texts that were ranked - confirm against the caller.

    Raises:
        IndexError: if ``email_dic`` is empty or a rank key is out of range.
    """
    # Build the positional list of subjects once rather than once per line.
    subjects = list(email_dic.keys())
    top_n = list(rank_dic.keys())[:n_mails]
    subject = subjects[0]
    print(f"Subject of email as: {subject} ...")
    for i in top_n:
        print(f"Email {i}: {subjects[i]}")

In [None]:
# Normalize the four input texts in place.
# NOTE(review): sample1, sample2, line1, line2 and stop_words are not defined
# in any cell visible here; presumably an earlier cell provides them - confirm,
# otherwise this cell fails on a fresh kernel.
# Each name is overwritten with its normalized form, so re-running this cell
# feeds already-normalized text back through normalize_data.
sample1 = normalize_data(sample1, stop_words)
sample2 = normalize_data(sample2, stop_words)
line1 = normalize_data(line1, stop_words)
line2 = normalize_data(line2, stop_words)

# Entry 0 is the reference text: rank() compares index 0 against the rest.
# NOTE(review): line2 is normalized above but not included here - confirm intent.
sample_list = [sample1, sample2, line1]

# Vocabulary over all texts in sample_list.
bow = bag_of_words(sample_list)

# One {word: weight} dict per text, in sample_list order.
w_ij = tf_idf(sample_list, bow)

# {index in sample_list: similarity to entry 0}, most similar first.
results = rank(w_ij)
print(results)