#### Importing necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

## Email Data

In [2]:
raw_df = pd.read_json("../data/base_data.json")
raw_df.head(10)

Unnamed: 0,messageID,name,senderEmail,subject,date,summary
0,197d54c584e19ddf,Updates from GirlScript Foundation,gssoc@substack.com,Postman Challenge is Back!!,"Fri, 4 Jul 2025 11:55:59 +0000",GirlScript Foundation collaborating with Postm...
1,197d53183de6fe84,Zomato Order,noreply@zomato.com,Your Zomato order from Bharat Chicken Inn Foods,"Fri, 04 Jul 2025 11:28:00 +0000 (UTC)","Hi Tushar Malhan, Thank you for ordering from ..."
2,197d524e7fcef449,Internshala,student@internshala.com,Top internships of the week matching your profile,"Fri, 4 Jul 2025 11:14:13 +0000","Tushar, earn high stipend in your preferred fi..."
3,197d4d0d614e6f9d,Internshala Trainings,trainings@updates.internshala.com,We have an update on your upcoming journey,"Fri, 04 Jul 2025 09:42:24 +0000 (UTC)",Internshala Trainings Internshala Trainings In...
4,197d4cf0c5b6ecdc,faheemahmed ahmed,faheemahmed@adgips.ac.in,Fwd: Spreadsheet shared with you: ‘Company Pre...,"Fri, 4 Jul 2025 15:10:15 +0530",---------- Forwarded message --------- From: S...
5,197d4b956b9744f9,Coding Ninjas,mailer@certifications.codingninjas.com,Update: Confirm your application for IIT-certi...,"Fri, 04 Jul 2025 09:16:44 +0000",Designed with IIT faculty - Click here to know...
6,197d4b03262134d2,Quora Digest,english-quora-digest@quora.com,I went for a TCS drive on 3rd May 2025. I clea...,"Fri, 04 Jul 2025 09:06:45 +0000",I went for a TCS drive on 3rd May 2025. I clea...
7,197d3cb9645eb64f,Adobe Acrobat,mail@mail.adobe.com,Make filling out forms feel better​ 📝,"Thu, 03 Jul 2025 21:56:22 -0700",Take the pain out of paperwork using Acrobat P...
8,197d3b6ef45dcb03,Unstop Competitions,noreply@dare2compete.news,Bag a PPI with a ₹ 1 Lakh stipend per month! |...,"Fri, 04 Jul 2025 10:04:30 +0530",Register now ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿...
9,197d3aef663673e7,Unstop Hiring Hub,noreply@dare2compete.news,"18,000+ applicants are already in the race","Fri, 04 Jul 2025 09:55:48 +0530",Apply Now! ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏ ﻿ ͏...


### Pre-Processing / Cleaning the data

In [3]:
raw_df["text"] = raw_df["subject"].fillna("") + " " + raw_df["summary"]
# Dropping uneccessary columns
df = raw_df.drop(columns=["name", "senderEmail", "subject", "date", "summary"])
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,messageID,text
0,197d54c584e19ddf,Postman Challenge is Back!! GirlScript Foundat...
1,197d53183de6fe84,Your Zomato order from Bharat Chicken Inn Food...
2,197d524e7fcef449,Top internships of the week matching your prof...
3,197d4d0d614e6f9d,We have an update on your upcoming journey Int...
4,197d4cf0c5b6ecdc,Fwd: Spreadsheet shared with you: ‘Company Pre...
5,197d4b956b9744f9,Update: Confirm your application for IIT-certi...
6,197d4b03262134d2,I went for a TCS drive on 3rd May 2025. I clea...
7,197d3cb9645eb64f,Make filling out forms feel better​ 📝 Take the...
8,197d3b6ef45dcb03,Bag a PPI with a ₹ 1 Lakh stipend per month! |...
9,197d3aef663673e7,"18,000+ applicants are already in the race App..."


### Using 10 predefined labels to weakly classify email data


#### Predefined Classes : 
##### 1. Food Orders
##### 2. Google Forms
##### 3. Login Attempts
##### 4. Invoice Receipts
##### 5. News Letters
##### 6. Travel Bookings
##### 7. LinkedIn
##### 8. Promotions
##### 9. Social Media
##### 10. Others

<br>

### Initially classifing with weak labels with basic if else statements


In [4]:
def weak_classify(email_text):
    text = email_text.lower()
    if "swiggy" in text or "zomato" in text or "blinkit" in text or "zepto" in text or "uber eats" in text or "delivered" in text or "order" in text:
        return "FoodOrders"
    elif "form submission" in text or "response recorded" in text or "google form" in text:
        return "GoogleForm"
    elif "login attempt" in text or "new sign-in" in text or "security alert" in text:
        return "LoginAttempt"
    elif "invoice" in text or "receipt" in text or "payment successful" in text or "transaction" in text:
        return "InvoiceReceipt"
    elif "digest" in text or "quora" in text or "newsletter" in text or "weekly update" in text or "top stories" in text:
        return "NewsLetter"
    elif "flight" in text or "booking" in text or "pnr" in text or "hotel" in text or "reservation" in text:
        return "TravelBooking"
    elif "linkedin" in text or "profile views" in text or "connection request" in text or "job alert" in text:
        return "LinkedIn"
    elif "sale" in text or "offer" in text or "discount" in text or "deal" in text:
        return "Promotions"
    elif "facebook" in text or "instagram" in text or "twitter" in text or "like" in text or "comment" in text or "threads" in text or "reddit" in text:
        return "SocialMedia"
    else:
        return "Others"

#### Using weak_classify to assign labels to unlabeled raw data

In [5]:
df["label"] = df["text"].apply(weak_classify)
df.head(10)

Unnamed: 0,messageID,text,label
0,197d54c584e19ddf,Postman Challenge is Back!! GirlScript Foundat...,Others
1,197d53183de6fe84,Your Zomato order from Bharat Chicken Inn Food...,FoodOrders
2,197d524e7fcef449,Top internships of the week matching your prof...,Others
3,197d4d0d614e6f9d,We have an update on your upcoming journey Int...,FoodOrders
4,197d4cf0c5b6ecdc,Fwd: Spreadsheet shared with you: ‘Company Pre...,Others
5,197d4b956b9744f9,Update: Confirm your application for IIT-certi...,LinkedIn
6,197d4b03262134d2,I went for a TCS drive on 3rd May 2025. I clea...,Others
7,197d3cb9645eb64f,Make filling out forms feel better​ 📝 Take the...,Others
8,197d3b6ef45dcb03,Bag a PPI with a ₹ 1 Lakh stipend per month! |...,Others
9,197d3aef663673e7,"18,000+ applicants are already in the race App...",Others


### Emails form each weak label

In [6]:
print(df["label"].value_counts())

label
Others            368
Promotions         35
NewsLetter         28
FoodOrders         23
SocialMedia        21
LinkedIn           14
GoogleForm          6
InvoiceReceipt      3
TravelBooking       1
LoginAttempt        1
Name: count, dtype: int64


### Removing Weak Labels with very small count beacause they don't really help with classification

In [6]:
counts = df["label"].value_counts();
small_classes = []
for label, count in counts.items():
    if count < 20:
        small_classes.append(label)

df["label"] = df["label"].apply(lambda x: "Others" if x in small_classes else x)

### Downsample "Others" class as it count is much greater than any other class

In [7]:
from sklearn.utils import resample
df_majority = df[df.label == "Others"]
df_minority = df[df.label != "Others"]

df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=40, 
    random_state=10
)

df_balanced = pd.concat([df_minority, df_majority_downsampled])
print(df_balanced["label"].value_counts())

label
Others         40
Promotions     33
NewsLetter     28
FoodOrders     22
SocialMedia    20
Name: count, dtype: int64


### Splitting the data into train and test

In [8]:
from sklearn.model_selection import train_test_split
X = df_balanced["text"]
y = df_balanced["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

### Training a logistic regression model to predict labels for future emails instead of relying on weak labels

<br/>

##### -- Since logistic regression can't directly understand words we need to use something called word embedding that converts words in vectors, more specifically using TF-IDF (Term Frequency Inverse Document Frequency) word embedding which is a frequency based word embedding. TF-IDF calculates how rare a frequent word in a document is in a corpus of documents, can be used to fugure out the keywords in a document.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=10)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

### Performance Report

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  FoodOrders       1.00      0.75      0.86         4
  NewsLetter       0.75      0.75      0.75         4
      Others       0.45      0.83      0.59         6
  Promotions       1.00      0.67      0.80         9
 SocialMedia       1.00      0.83      0.91         6

    accuracy                           0.76        29
   macro avg       0.84      0.77      0.78        29
weighted avg       0.85      0.76      0.78        29



### Example Data

In [13]:
examples = [
    "Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!",
    "Your response has been recorded for Google Form: Feedback Survey 2025.",
    "New sign-in attempt detected on your account from a new device. Was this you?",
    "Quora Digest: Top stories for you this week — Discover trending questions and answers.",
]

example_vecs = vectorizer.transform(examples)
predicted_labels = clf.predict(example_vecs)
for text, label in zip(examples, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted label: {label}")


Text: Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!
Predicted label: FoodOrders
Text: Your response has been recorded for Google Form: Feedback Survey 2025.
Predicted label: Others
Text: New sign-in attempt detected on your account from a new device. Was this you?
Predicted label: Others
Text: Quora Digest: Top stories for you this week — Discover trending questions and answers.
Predicted label: NewsLetter
