#### Importing necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

## Email Data

In [2]:
raw_df = pd.read_json("../data/emails.json")
raw_df.head(10)

Unnamed: 0,messageID,name,senderEmail,subject,date,summary
0,197eb4e791b9e5ba,Dell Technologies,Dell_Technologies@americas.comm.dell.com,We wanted you to know first.,"Tue, 08 Jul 2025 12:31:17 -0600",Save $800 off inside! ﻿͏ ﻿͏ ﻿͏ ﻿͏ ﻿͏ ﻿͏ ﻿͏ ﻿͏ ...
1,197eb2fadd960b30,Raj on Facebook,close_friend_updates@facebookmail.com,💬 Raj Puri commented on a post,"Tue, 8 Jul 2025 10:57:23 -0700",💬 Raj Puri commented on a post . 7 July at 14:...
2,197ea9ce074735bd,,ibmtrain@us.ibm.com,Badge survey for Badge Getting Started with Ar...,"Tue, 08 Jul 2025 15:13:48 +0000 (UTC)","This is a system generated email, please do no..."
3,197ea991ae6d866d,Quora Suggested Spaces,sonalisspacemozmasti-space@quora.com,औरत की योनि और स्तन उनके शरीर का ऐसा भाग है जि...,"Tue, 08 Jul 2025 15:13:10 +0000",औरत की योनि और स्तन उनके शरीर का ऐसा भाग है जि...
4,197ea8235754f069,SCALIS EarlyCareers,internships@apexearlycareers.com,Jobs Hiring Now,"Tue, 08 Jul 2025 14:48:10 +0000 (UTC)",Over 500+ open positions ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌...
5,197ea6564563aa53,Render,no-reply@render.com,deploy failed for MailMaid_Server,"Tue, 08 Jul 2025 14:16:42 +0000",Render We encountered an error during the depl...
6,197ea61c5199df28,Render,no-reply@render.com,deploy failed for MailMaid_Server,"Tue, 08 Jul 2025 14:12:44 +0000",Render We encountered an error during the depl...
7,197ea3b5ac1fe16a,LinkedIn,messages-noreply@linkedin.com,"Tushar, add Navya Goel - Apprentice","Tue, 8 Jul 2025 13:30:46 +0000 (UTC)",Data Science | Front-End | Infosys Pragati: Co...
8,197ea2068963e734,Ayush Tripathi,ayushtripathi01.work@gmail.com,Fwd: OneBanc is Hiring | 2026 Batch,"Tue, 8 Jul 2025 18:31:05 +0530",---------- Forwarded message --------- From: A...
9,197e9d589b7f1c82,HWI,hackwithinfy@infosys.com,HackWithInfy 2025 Qualifier - Welcome to the Test,"Tue, 8 Jul 2025 11:38:57 +0000","Dear Candidate, Greetings from Infosys! We are..."


### Pre-Processing / Cleaning the data

In [3]:
raw_df["text"] = raw_df["subject"].fillna("") + " " + raw_df["summary"]
# Dropping uneccessary columns
df = raw_df.drop(columns=["name", "senderEmail", "subject", "date", "summary"])
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,messageID,text
0,197eb4e791b9e5ba,We wanted you to know first. Save $800 off ins...
1,197eb2fadd960b30,💬 Raj Puri commented on a post 💬 Raj Puri comm...
2,197ea9ce074735bd,Badge survey for Badge Getting Started with Ar...
3,197ea991ae6d866d,औरत की योनि और स्तन उनके शरीर का ऐसा भाग है जि...
4,197ea8235754f069,Jobs Hiring Now Over 500+ open positions ‌ ‌ ‌...
5,197ea6564563aa53,deploy failed for MailMaid_Server Render We en...
6,197ea61c5199df28,deploy failed for MailMaid_Server Render We en...
7,197ea3b5ac1fe16a,"Tushar, add Navya Goel - Apprentice Data Scien..."
8,197ea2068963e734,Fwd: OneBanc is Hiring | 2026 Batch ----------...
9,197e9d589b7f1c82,HackWithInfy 2025 Qualifier - Welcome to the T...


### Using 10 predefined labels to weakly classify email data


#### Predefined Classes : 
##### 1. Food Orders
##### 2. Google Forms
##### 3. Login Attempts
##### 4. Invoice Receipts
##### 5. News Letters
##### 6. Travel Bookings
##### 7. LinkedIn
##### 8. Promotions
##### 9. Social Media
##### 10. Others

<br>

### Initially classifing with weak labels with basic if else statements


In [4]:
from rapidfuzz import fuzz
import re
import json

with open("../data/label_keywords.json", "r", encoding="utf-8") as f:
    LABEL_KEYWORDS = json.load(f)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def weak_classify(text):
    text_clean = clean_text(text)
    for label, keywords in LABEL_KEYWORDS.items():
        for keyword in keywords:
            keyword_clean = clean_text(keyword)
            score = fuzz.partial_ratio(keyword_clean, text_clean)
            if score > 75:  # You can tune this threshold
                return label
    return "Others"

#### Using weak_classify to assign labels to unlabeled raw data

In [5]:
df["label"] = df["text"].apply(weak_classify)
df.head(10)

Unnamed: 0,messageID,text,label
0,197eb4e791b9e5ba,We wanted you to know first. Save $800 off ins...,Others
1,197eb2fadd960b30,💬 Raj Puri commented on a post 💬 Raj Puri comm...,SocialMedia
2,197ea9ce074735bd,Badge survey for Badge Getting Started with Ar...,SocialMedia
3,197ea991ae6d866d,औरत की योनि और स्तन उनके शरीर का ऐसा भाग है जि...,Others
4,197ea8235754f069,Jobs Hiring Now Over 500+ open positions ‌ ‌ ‌...,LinkedIn
5,197ea6564563aa53,deploy failed for MailMaid_Server Render We en...,LoginAttempt
6,197ea61c5199df28,deploy failed for MailMaid_Server Render We en...,LoginAttempt
7,197ea3b5ac1fe16a,"Tushar, add Navya Goel - Apprentice Data Scien...",Others
8,197ea2068963e734,Fwd: OneBanc is Hiring | 2026 Batch ----------...,SocialMedia
9,197e9d589b7f1c82,HackWithInfy 2025 Qualifier - Welcome to the T...,Promotions


### Emails form each weak label

In [6]:
print(df["label"].value_counts())

label
Others            672
SocialMedia       446
FoodOrders        361
Promotions        227
LoginAttempt      191
LinkedIn          162
TravelBooking     149
InvoiceReceipt    106
NewsLetter         94
GoogleForm         92
Name: count, dtype: int64


### Removing Weak Labels with very small count beacause they don't really help with classification

In [7]:
counts = df["label"].value_counts();
small_classes = []
for label, count in counts.items():
    if count < 40:
        small_classes.append(label)

df["label"] = df["label"].apply(lambda x: "Others" if x in small_classes else x)

### Downsample "Others" class as it count is much greater than any other class

In [8]:
from sklearn.utils import resample
# df_majority = df[df.label == "Others"]
# df_minority = df[df.label != "Others"]

majority_class = df['label'].value_counts().idxmax()
df_majority = df[df.label == majority_class]
df_minority = df[df.label != majority_class]

n_samples = 200
if len(df_majority) < n_samples:
    n_samples = len(df_majority)

df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=n_samples, 
    random_state=10
)

df_balanced = pd.concat([df_minority, df_majority_downsampled])
print(df_balanced["label"].value_counts())

label
SocialMedia       446
FoodOrders        361
Promotions        227
Others            200
LoginAttempt      191
LinkedIn          162
TravelBooking     149
InvoiceReceipt    106
NewsLetter         94
GoogleForm         92
Name: count, dtype: int64


### Splitting the data into train and test

In [9]:
from sklearn.model_selection import train_test_split
X = df_balanced["text"]
y = df_balanced["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

### Training a logistic regression model to predict labels for future emails instead of relying on weak labels

<br/>

##### -- Since logistic regression can't directly understand words we need to use something called word embedding that converts words in vectors, more specifically using TF-IDF (Term Frequency Inverse Document Frequency) word embedding which is a frequency based word embedding. TF-IDF calculates how rare a frequent word in a document is in a corpus of documents, can be used to fugure out the keywords in a document.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(stop_words="english", max_features=2500)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=10)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

### Performance Report

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

    FoodOrders       0.82      0.72      0.77        65
    GoogleForm       0.48      0.67      0.56        15
InvoiceReceipt       0.48      0.65      0.56        23
      LinkedIn       0.76      0.67      0.71        33
  LoginAttempt       0.80      0.89      0.84        45
    NewsLetter       0.88      0.79      0.83        19
        Others       0.39      0.68      0.49        34
    Promotions       0.68      0.55      0.61        47
   SocialMedia       0.86      0.64      0.73        97
 TravelBooking       0.53      0.61      0.57        28

      accuracy                           0.68       406
     macro avg       0.67      0.69      0.67       406
  weighted avg       0.72      0.68      0.69       406



### Example Data

In [12]:
examples = [
    "Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!",
    "Your response has been recorded for Google Form: Feedback Survey 2025.",
    "New sign-in attempt detected on your account from a new device. Was this you?",
    "Quora Digest: Top stories for you this week — Discover trending questions and answers.",
]

example_vecs = vectorizer.transform(examples)
predicted_labels = clf.predict(example_vecs)
for text, label in zip(examples, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted label: {label}")


Text: Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!
Predicted label: FoodOrders
Text: Your response has been recorded for Google Form: Feedback Survey 2025.
Predicted label: GoogleForm
Text: New sign-in attempt detected on your account from a new device. Was this you?
Predicted label: LoginAttempt
Text: Quora Digest: Top stories for you this week — Discover trending questions and answers.
Predicted label: NewsLetter
