In [1]:
import pandas as pd
from dao.email import DAOEmail, DAOEmailGmail
from models.email import Email, EmailGithubDataset
from dateutil import parser as date_parser
import os
import re

In [2]:
file_path = '../data/Email/English/raw/spam_email_dataset/emails.csv'
df = pd.read_csv(file_path)
print(df.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [3]:
# get first entry from df
print(df.iloc[0].text)

Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  mar

In [4]:
dao = DAOEmail("email_spam_dataset")

# iterate row by row
for index, row in df.iterrows():
    splitted = row.text.split("  ")
    subject = splitted[0].replace("Subject: ", "")
    body = " ".join(splitted[1:])
    is_spam = (int(row.spam) == 1)
    email = Email(subject=subject, body=body, is_html=False, is_spam=is_spam)
    dao.insert_one(email)

In [46]:
from models.email import EmailGithubDataset, GithubClassEnums
from email import message_from_file


def parse_email(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        msg = message_from_file(file)

        from_address = msg.get('From')
        to_address = msg.get('To')
        date_str = msg.get('Date')
        date = None
        if date_str:
            date = date_parser.parse(date_str)


        subject = msg.get('Subject')


        body = ""
        is_html = False
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                body += part.get_payload()
            elif part.get_content_type() == 'text/html':
                body += part.get_payload()
                is_html = True

        return EmailGithubDataset(
            from_address=from_address,
            to_address=to_address,
            date=date,
            subject=subject,
            body=body,
            is_html=is_html,
            is_spam=None,  # Set this based on your spam detection logic
            is_ai_generated=None,  # Set this based on your AI generation detection logic
            inner_classification=GithubClassEnums.CALENDAR.value
        )

In [8]:
folder_path = "../data/Email/English/raw/Email-Classification-github/dataset/calendar"
dao = DAOEmail("email_classification_github")
for filename in os.listdir(folder_path):
    if filename.isnumeric():
        file_path = os.path.join(folder_path, filename)
        email_data = parse_email(file_path)
        dao.insert_one(email_data)


In [13]:
file_path = '../data/Email/English/raw/email_classification_dataset/spam_assassin.csv'
df = pd.read_csv(file_path)
print(df.head())

                                                text  target
0  From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1  From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2  From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3  From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4  From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0


In [10]:
print(df.iloc[0].text)



In [71]:

from datetime import datetime

def parse_email2(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        email_data = file.read()
        # Regular expressions to extract information
    from_pattern = re.compile(r'From [\w\-\.]+@([\w\-]+\.)+[\w\-]{2,4}')
    to_pattern = re.compile(r'Delivered-To: [\w\-\.]+@([\w\-]+\.)+[\w\-]{2,4}')
    date_pattern = re.compile(r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2}\s\d{2}:\d{2}:\d{2}\s\d{4}\b')
    subject_pattern = re.compile(r'Subject: (.+)')
    content_type = re.compile(r'Content-Type: .+?(?=;)')

    # Extracting information using regular expressions
    from_match = from_pattern.search(email_data)
    to_match = to_pattern.search(email_data)
    date_match = date_pattern.search(email_data)
    subject_match = subject_pattern.search(email_data)

    subject = subject_match.group(0).replace("Subject: ", "") if subject_match else None
    if subject:
        digit_deleted = False
        while subject[-1].isnumeric() or (subject[-1] == "." and digit_deleted): # Remove trailing numbers and dots
            subject = subject[:-1]
            digit_deleted = True

        subject = subject.strip()


    content_type_match = content_type.search(email_data)
    if content_type_match:
        is_html = ("text/html" in content_type_match.group(0))
    else:
        is_html = False

    lines = email_data.split("\n")
    body_flag = False
    body_content = ""
    for i, line in enumerate(lines):
        if line == "" and not body_flag:
            body_flag = True
        elif body_flag:
            body_content += (line + "\n")

    # Creating the EmailBase instance
    email_instance = Email(
        from_address=from_match.group(0).replace("From ", "") if from_match else None,
        to_address=to_match.group(0).replace("Delivered-To: ", "") if to_match else None,
        date=datetime.strptime(date_match.group(0), "%b %d %H:%M:%S %Y") if date_match else None,
        subject=subject,
        body=body_content,
        is_html=is_html,  # You may need to implement HTML detection logic
        is_spam=None,   # You may need to implement spam detection logic
        is_ai_generated=None  # You may need to implement AI-generated detection logic
    )

    return email_instance

In [80]:
folder_path = "../data/Email/English/raw/email_classification_dataset/2005spam_2"
dao = DAOEmail("email_spam_assassin_dataset")
for filename in os.listdir(folder_path):
    if filename.split(".")[0].isnumeric():
        file_path = os.path.join(folder_path, filename)
        email_data = parse_email2(file_path)
        email_data.is_spam = True
        dao.insert_one(email_data)
    # dao.insert_one(email_data)

In [None]:
from pymongo.errors import DocumentTooLarge
from analysis.mbox_reader import GmailMboxMessage
import mailbox

mbox_obj_gmail1 = mailbox.mbox('../data/Email/Personal/gmail1.mbox')
# mbox_obj_gmail2 = mailbox.mbox('../data/Email/Personal/gmail2.mbox')
# mbox_obj_gmail3 = mailbox.mbox('../data/Email/Personal/gmail3.mbox')


dao_gmail1 = DAOEmail("gmail1")
# dao_gmail2 = DAOEmail("gmail2")
# dao_gmail3 = DAOEmail("gmail3")
counter1= 0
# counter2= 0
# counter3= 0
# for idx, email_obj in enumerate(mbox_obj_gmail2):
#     email_data = GmailMboxMessage(email_obj)
#     gmail_model = email_data.parse_to_email_model()
#     try:
#         dao_gmail2.insert_one(gmail_model)
#     except UnicodeEncodeError:
#         counter2 += 1
#
# print(counter2)
#
# for idx, email_obj in enumerate(mbox_obj_gmail3):
#     email_data = GmailMboxMessage(email_obj)
#     gmail_model = email_data.parse_to_email_model()
#     try:
#         dao_gmail3.insert_one(gmail_model)
#     except UnicodeEncodeError:
#         counter3 += 1
#
# print(counter3)
for idx, email_obj in enumerate(mbox_obj_gmail1):
    email_data = GmailMboxMessage(email_obj)
    try:
        gmail_model = email_data.parse_to_email_model()
    except:
        counter1 += 1
        continue
    try:
        dao_gmail1.insert_one(gmail_model)
    except UnicodeEncodeError:
        counter1 += 1
    except DocumentTooLarge:
        counter1 += 1
        print("Document too large")

print(counter1)

Document too large
Document too large
Document too large
Document too large
Document too large
Document too large
Document too large
Document too large
Document too large
Document too large
Document too large


In [5]:
from analysis.attribute_retriving import extract_strings_from_html, detect_language

daos = [DAOEmailGmail("gmail1"), DAOEmailGmail("gmail2"), DAOEmailGmail("gmail3")]
for dao in daos:
    emails = dao.find_all()
    for email in emails:
        ext = extract_strings_from_html(email.body)
        detected_lang = detect_language(ext)
        dao.update_one({"_id":email.id}, {"$set": {"detected_lang": detected_lang}})

AttributeError: 'list' object has no attribute 'replace'