In [133]:
import pandas as pd
import matplotlib.pyplot as plt  
import seaborn as sns     
import numpy as np
import re
from collections import Counter
from nltk.corpus import stopwords
import ast
from dateutil.parser import parse
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from IPython.display import display
import warnings
import string
warnings.filterwarnings("ignore")

In [134]:
# Load the input CSV into `df`; every later cell in this notebook works on this frame.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like an
# index written out by an earlier to_csv call — confirm whether it should be kept.
df = pd.read_csv("ready_for_analysis.csv")
df.head()

Unnamed: 0,label,Subject,is_forwarded,email_body,subject
0,0,christmas tree farm pictures,0,<NO_BODY>,christmas tree farm pictures
1,0,re : rankings,0,thank you .,re : rankings
2,0,leadership development pilot,1,"sally : what timing , ask and you shall receiv...",leadership development pilot
3,0,key dates and impact of upcoming sap implement...,0,"over the next few weeks , project apollo and b...",key dates and impact of upcoming sap implement...
4,0,key hr issues going forward,0,a ) year end reviews - report needs generating...,key hr issues going forward


In [135]:
def normalize_case(text):
    """Lower-case an e-mail body.

    Missing values and the literal ``"<NO_BODY>"`` placeholder are mapped to
    the empty string; any other value is returned via ``text.lower()``.
    """
    has_no_body = pd.isna(text) or text == "<NO_BODY>"
    return "" if has_no_body else text.lower()

In [136]:
def replace_links(text):
    """Replace URL-like fragments of ``text`` with the ``<LINK>`` placeholder.

    Missing values and the empty string yield ``""``. Otherwise three
    case-insensitive substitutions run in order and the result is stripped of
    leading/trailing whitespace:

    1. full ``http://`` / ``https://`` URLs whose host ends in a listed TLD;
    2. any whitespace-free token containing ``.<tld>`` for the shorter TLD list
       (this also covers tokens such as ``user@host.com``);
    3. the stand-alone words ``http``, ``https``, ``www`` and ``com``.
    """
    if pd.isna(text) or text == "":
        return ""

    link_patterns = (
        r"https?:\/\/\S+\.(com|fr|net|org|info|biz|co|uk|de|ru|cn|in|us|eu|tv|me|io|xyz|top|site|online|tech|store|app|live|news|email|click|link|space|website|club|today|world|digital|center|company|services|solutions|network|media|group|systems|agency|capital|finance|marketing|support|cloud|shop|blog|web|page|works|tools|zone|plus|press|fun|cool|expert|global|life|love|money|name|social|team|tips|trade|video|vip|wiki|win|work|pro|rocks|review|sale|school|show|studio|style|watch|webcam)\b\S*",
        r"\b\S+\.(com|fr|net|org|info|biz|co|uk|de|ru|cn|in|us|eu|tv|me|io|xyz)\b",
        r"\b(http|https|www|com)\b",
    )

    cleaned = text
    # Order matters: each pattern sees the output of the previous one.
    for pattern in link_patterns:
        cleaned = re.sub(pattern, "<LINK>", cleaned, flags=re.IGNORECASE)

    return cleaned.strip()

In [137]:
# Build `email_clean` from the raw body: lower-case first, then swap links for <LINK>.
df['email_clean'] = (
    df['email_body']
    .apply(normalize_case)
    .apply(replace_links)
)

In [138]:
# Count the cleaned bodies that contain at least one <LINK> placeholder.
contains_link = df['email_clean'].str.contains(r'<LINK>', regex=True)
link_count = contains_link.sum()
print(f"Emails containing <LINK>: {link_count}")

Emails containing <LINK>: 8929


In [139]:
# Preview the frame now that the `email_clean` column has been added.
df.head()

Unnamed: 0,label,Subject,is_forwarded,email_body,subject,email_clean
0,0,christmas tree farm pictures,0,<NO_BODY>,christmas tree farm pictures,
1,0,re : rankings,0,thank you .,re : rankings,thank you .
2,0,leadership development pilot,1,"sally : what timing , ask and you shall receiv...",leadership development pilot,"sally : what timing , ask and you shall receiv..."
3,0,key dates and impact of upcoming sap implement...,0,"over the next few weeks , project apollo and b...",key dates and impact of upcoming sap implement...,"over the next few weeks , project apollo and b..."
4,0,key hr issues going forward,0,a ) year end reviews - report needs generating...,key hr issues going forward,a ) year end reviews - report needs generating...


In [140]:
def find_valid_dates(text):
    """Return the date-like substrings of ``text`` that ``dateutil`` can parse.

    Candidates are found with a regex covering four shapes:
    ``d/m/y`` or ``d-m-y`` (1-2, 1-2 and 2-4 digits), ``yyyy-mm-dd``,
    ``<1-2 digits> <word> <2-4 digits>`` and ``<word> <1-2 digits>, <4 digits>``.
    Each candidate is kept only if ``parse(candidate, fuzzy=False)`` succeeds.

    Parameters
    ----------
    text : str or missing value
        Text to scan. Missing values (``pd.isna``) yield an empty list.

    Returns
    -------
    list of str
        The accepted candidates, in the order they appear in ``text``.
    """
    if pd.isna(text):
        return []

    candidates = re.findall(
        r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}-\d{2}-\d{2}|\d{1,2}\s+\w+\s+\d{2,4}|\w+\s+\d{1,2},\s+\d{4})\b',
        text
    )

    valid_dates = []
    for candidate in candidates:
        try:
            parse(candidate, fuzzy=False)
        except Exception:
            # Best-effort validation: anything dateutil rejects is simply not a date.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate, so a long `.apply` can still be interrupted.
            continue
        valid_dates.append(candidate)

    return valid_dates

In [141]:
# Store, per row, the list of validated date strings found in each text column.
df['dates_in_subject'] = df['Subject'].map(find_valid_dates)
df['dates_in_email'] = df['email_clean'].map(find_valid_dates)

In [142]:
# Total number of validated date strings across subjects and bodies.
subject_date_total = df['dates_in_subject'].apply(len).sum()
email_date_total = df['dates_in_email'].apply(len).sum()
total_dates = subject_date_total + email_date_total
print(f"Total valid date mentions: {total_dates}")

Total valid date mentions: 1676


In [143]:
def normalize_dates(text):
    """Replace every parseable date in ``text`` with the ``<DATE>`` placeholder.

    Candidates are found with a regex covering four shapes:
    ``d/m/y`` or ``d-m-y`` (1-2, 1-2 and 2-4 digits), ``yyyy-mm-dd``,
    ``<1-2 digits> <word> <2-4 digits>`` and ``<word> <1-2 digits>, <4 digits>``.
    A candidate is replaced only if ``parse(candidate, fuzzy=False)`` succeeds;
    rejected candidates are left exactly as they were.

    Parameters
    ----------
    text : str or missing value
        Text to process. Missing values (``pd.isna``) are returned unchanged.

    Returns
    -------
    str or missing value
        ``text`` with each validated date span replaced by ``'<DATE>'``.
    """
    if pd.isna(text):
        return text

    def _placeholder_if_valid(match):
        """Return '<DATE>' for a parseable candidate, else the original span."""
        candidate = match.group(0)
        try:
            parse(candidate, fuzzy=False)
        except Exception:
            # Best-effort validation; `Exception` (not a bare `except`) keeps
            # KeyboardInterrupt / SystemExit propagating.
            return candidate
        return '<DATE>'

    # Substitute at the matched positions only. A plain `text.replace(match, ...)`
    # would also rewrite the same characters wherever they occur inside other
    # tokens that the regex never matched (e.g. "a11/2/2003" -> "a1<DATE>").
    return re.sub(
        r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}-\d{2}-\d{2}|\d{1,2}\s+\w+\s+\d{2,4}|\w+\s+\d{1,2},\s+\d{4})\b',
        _placeholder_if_valid,
        text
    )
# Swap validated dates for <DATE> in both text columns.
df['email_clean'] = df['email_clean'].map(normalize_dates)
df['Subject'] = df['Subject'].map(normalize_dates)

In [144]:
def normalize_numbers(text):
    """Replace stand-alone integers and decimals with ``<NUMBER>``.

    A number is a run of digits, optionally followed by ``.`` and more digits,
    delimited by word boundaries on both sides. Missing values are returned
    unchanged.
    """
    if not pd.isna(text):
        number_re = re.compile(r'\b\d+(\.\d+)?\b')
        return number_re.sub('<NUMBER>', text)
    return text

In [145]:
# Numbers are normalised in the body only; `Subject` is not passed through this step.
df['email_clean'] = df['email_clean'].map(normalize_numbers)

In [146]:
# Remove the per-row date-list helper columns.
df = df.drop(["dates_in_email", "dates_in_subject"], axis="columns")

In [147]:
# NOTE(review): `money_pattern` was not defined in any cell of this notebook, so this
# cell raised NameError on a fresh kernel (Restart & Run All). The pattern below is a
# reconstruction — confirm it against the pattern that was originally intended.
#
# It matches a currency symbol ($, euro sign, pound sign) followed by an amount, or an
# amount followed by a currency word. Numbers in `email_clean` have already been turned
# into <NUMBER>, while `Subject` still holds raw digits, so an amount may be either form.
money_pattern = (
    r'[$\u20ac\u00a3]\s*(?:<NUMBER>|\d+(?:[.,]\d+)*)'
    r'|(?:<NUMBER>|\b\d+(?:[.,]\d+)*)\s*(?:dollars?|usd|euros?|eur|pounds?|gbp)\b'
)

# Rows with missing text are skipped by dropna() and stay missing after the assignment.
df['email_clean'] = df['email_clean'].dropna().apply(lambda x: re.sub(money_pattern, '<MONEY>', x, flags=re.IGNORECASE))
df['Subject'] = df['Subject'].dropna().apply(lambda x: re.sub(money_pattern, '<MONEY>', x, flags=re.IGNORECASE))

In [148]:
# Count e-mail-address-shaped tokens that are still present in the text columns.
# NOTE(review): replace_links has already run on `email_clean`, and its second pattern
# turns tokens such as "user@host.com" into <LINK> — that may be why the body count is 0.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'


def _count_email_mentions(column):
    """Sum the number of `email_pattern` matches over the non-null values of a column."""
    return column.dropna().apply(lambda value: len(re.findall(email_pattern, value))).sum()


email_matches_count = _count_email_mentions(df['email_clean'])
subject_matches_count = _count_email_mentions(df['Subject'])

total_email_mentions = email_matches_count + subject_matches_count
print(f"Total email mentions: {total_email_mentions}")

Total email mentions: 0


In [149]:
def _strip_punctuation(text):
    """Delete every character listed in `string.punctuation` from `text`."""
    return text.translate(str.maketrans('', '', string.punctuation))


# `string.punctuation` includes '<' and '>', so the placeholders inserted earlier
# (<LINK>, <DATE>, <NUMBER>, <MONEY>) lose their angle brackets here.
df['email_clean'] = df['email_clean'].dropna().apply(_strip_punctuation)
df['Subject'] = df['Subject'].dropna().apply(_strip_punctuation)


In [150]:
def _collapse_repeated_marks(text):
    """Reduce runs of the same mark among ! ? . , to a single occurrence."""
    return re.sub(r'([!?.,])\1+', r'\1', text)


# NOTE(review): the preceding cell removes all of `string.punctuation`, which contains
# ! ? . and , — so as the notebook is ordered this step has nothing left to collapse.
df['email_clean'] = df['email_clean'].dropna().apply(_collapse_repeated_marks)
df['Subject'] = df['Subject'].dropna().apply(_collapse_repeated_marks)

In [151]:
def _remove_brackets_and_separators(text):
    """Delete runs of , " ; : ( ) { } [ ] from `text`."""
    return re.sub(r'[,";:(){}[\]]+', '', text)


# NOTE(review): every character in this class is part of `string.punctuation`, which an
# earlier cell already strips — so as the notebook is ordered this step changes nothing.
df['email_clean'] = df['email_clean'].dropna().apply(_remove_brackets_and_separators)
df['Subject'] = df['Subject'].dropna().apply(_remove_brackets_and_separators)

In [152]:
# Preview the frame after the punctuation cleanup of `Subject` and `email_clean`.
df.head()

Unnamed: 0,label,Subject,is_forwarded,email_body,subject,email_clean
0,0,christmas tree farm pictures,0,<NO_BODY>,christmas tree farm pictures,
1,0,re rankings,0,thank you .,re : rankings,thank you
2,0,leadership development pilot,1,"sally : what timing , ask and you shall receiv...",leadership development pilot,sally what timing ask and you shall receive ...
3,0,key dates and impact of upcoming sap implement...,0,"over the next few weeks , project apollo and b...",key dates and impact of upcoming sap implement...,over the next few weeks project apollo and be...
4,0,key hr issues going forward,0,a ) year end reviews - report needs generating...,key hr issues going forward,a year end reviews report needs generating l...


In [153]:
# Drop the raw `email_body` and the lowercase-named `subject` column; the processed
# text stays in `email_clean` and `Subject`.
raw_columns = ["subject", "email_body"]
df = df.drop(raw_columns, axis="columns")

In [158]:
# Replace remaining missing values with empty strings (applies to every column).
df = df.fillna('')

# Trim leading/trailing whitespace in the text columns, one column at a time.
text_cols = ['Subject', 'email_clean']  # add other text columns if needed
for text_col in text_cols:
    df[text_col] = df[text_col].str.strip()

# Write the cleaned frame without the pandas index.
df.to_csv('ready_for_training.csv', index=False)