#### Importing necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

## Email Data

In [2]:
raw_df = pd.read_json("../data.json")
raw_df.head(10)

Unnamed: 0,messageID,name,senderEmail,subject,date,summary
0,197cfc8937b8783a,MongoDB Cloud,cloud-manager-support@mongodb.com,Successful sign-in for tusharmalhan2564@gmail....,"Thu, 3 Jul 2025 10:15:16 +0000",We&#39;re verifying a recent sign-in for tusha...
1,197cfa018674ba34,LinkedIn,notifications-noreply@linkedin.com,You appeared in 3 searches,"Thu, 3 Jul 2025 09:31:04 +0000 (UTC)",See who searched for your profile ͏ ͏ ͏ ͏ ͏ ͏ ...
2,197cf7635329e0bf,Tushar Malhan,tusharmalhan2564@gmail.com,Re: JD | Intern - Full Stack Developer | Vappco,"Thu, 3 Jul 2025 14:15:19 +0530","Dear Pushti, Thank you for sharing the Job Des..."
3,197cf5509cf808fb,pushti.pathania@vappco.com,pushti.pathania@vappco.com,JD | Intern - Full Stack Developer | Vappco,"Thu, 3 Jul 2025 08:08:53 +0000","Dear Tushar, It was a pleasure speaking with y..."
4,197cf52d3cb22f49,Adobe India Hackathon,noreply@dare2compete.news,"Tushar, Adobe is looking for you! Internship O...","Thu, 03 Jul 2025 13:36:40 +0530",Adobe India Hackathon - Apply now! ﻿ ͏ ﻿ ͏ ﻿ ͏...
5,197cf35863fade10,Adobe India Hackathon,noreply@dare2compete.news,Adobe is hiring interns at INR 1 lakh/Month | ...,"Thu, 03 Jul 2025 13:04:40 +0530","Tushar, increase your chances of selection! ﻿ ..."
6,197cefee67b68ba0,Unstop Insights,noreply@dare2compete.news,[Last Day] IBM’s SkillsBuild virtual internshi...,"Thu, 03 Jul 2025 12:05:01 +0530",Less than 24 hours to go | 4000 seats ﻿ ͏ ﻿ ͏ ...
7,197cee8a247560c1,Zomato,noreply@mailers.zomato.com,"Powered through the week? Now, power up at Cos...","Thu, 03 Jul 2025 06:10:41 +0000 (UTC)",You&#39;ve been in grind mode all week. Time t...
8,197cecaddc2cca3b,Amazon.in,store-news@amazon.in,We found something you might like,"Thu, 3 Jul 2025 05:38:10 +0000",How about another look? Keep shopping your rec...
9,197ce887aad98b83,Unstop Hiring Hub,noreply@dare2compete.news,"Your skills are in demand, Tushar","Thu, 03 Jul 2025 09:55:40 +0530",Let&#39;s match them with the right roles. ﻿ ͏...


### Pre-Processing / Cleaning the data

In [3]:
raw_df["text"] = raw_df["subject"].fillna("") + " " + raw_df["summary"]
# Dropping uneccessary columns
df = raw_df.drop(columns=["name", "senderEmail", "subject", "date", "summary"])
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,messageID,text
0,197cfc8937b8783a,Successful sign-in for tusharmalhan2564@gmail....
1,197cfa018674ba34,You appeared in 3 searches See who searched fo...
2,197cf7635329e0bf,Re: JD | Intern - Full Stack Developer | Vappc...
3,197cf5509cf808fb,JD | Intern - Full Stack Developer | Vappco De...
4,197cf52d3cb22f49,"Tushar, Adobe is looking for you! Internship O..."
5,197cf35863fade10,Adobe is hiring interns at INR 1 lakh/Month | ...
6,197cefee67b68ba0,[Last Day] IBM’s SkillsBuild virtual internshi...
7,197cee8a247560c1,"Powered through the week? Now, power up at Cos..."
8,197cecaddc2cca3b,We found something you might like How about an...
9,197ce887aad98b83,"Your skills are in demand, Tushar Let&#39;s ma..."


### Using 10 predefined labels to weakly classify email data


#### Predefined Classes : 
##### 1. Food Orders
##### 2. Google Forms
##### 3. Login Attempts
##### 4. Invoice Receipts
##### 5. News Letters
##### 6. Travel Bookings
##### 7. LinkedIn
##### 8. Promotions
##### 9. Social Media
##### 10. Others

<br>

### Initially classifing with weak labels with basic if else statements


In [4]:
def weak_classify(email_text):
    text = email_text.lower()
    if "swiggy" in text or "zomato" in text or "blinkit" in text or "zepto" in text or "uber eats" in text or "delivered" in text or "order" in text:
        return "FoodOrders"
    elif "form submission" in text or "response recorded" in text or "google form" in text:
        return "GoogleForm"
    elif "login attempt" in text or "new sign-in" in text or "security alert" in text:
        return "LoginAttempt"
    elif "invoice" in text or "receipt" in text or "payment successful" in text or "transaction" in text:
        return "InvoiceReceipt"
    elif "digest" in text or "quora" in text or "newsletter" in text or "weekly update" in text or "top stories" in text:
        return "NewsLetter"
    elif "flight" in text or "booking" in text or "pnr" in text or "hotel" in text or "reservation" in text:
        return "TravelBooking"
    elif "linkedin" in text or "profile views" in text or "connection request" in text or "job alert" in text:
        return "LinkedIn"
    elif "sale" in text or "offer" in text or "discount" in text or "deal" in text:
        return "Promotions"
    elif "facebook" in text or "instagram" in text or "twitter" in text or "like" in text or "comment" in text or "threads" in text or "reddit" in text:
        return "SocialMedia"
    else:
        return "Others"

#### Using weak_classify to assign labels to unlabeled raw data

In [5]:
df["label"] = df["text"].apply(weak_classify)
df.head(10)

Unnamed: 0,messageID,text,label
0,197cfc8937b8783a,Successful sign-in for tusharmalhan2564@gmail....,Others
1,197cfa018674ba34,You appeared in 3 searches See who searched fo...,Others
2,197cf7635329e0bf,Re: JD | Intern - Full Stack Developer | Vappc...,Others
3,197cf5509cf808fb,JD | Intern - Full Stack Developer | Vappco De...,Others
4,197cf52d3cb22f49,"Tushar, Adobe is looking for you! Internship O...",Others
5,197cf35863fade10,Adobe is hiring interns at INR 1 lakh/Month | ...,Others
6,197cefee67b68ba0,[Last Day] IBM’s SkillsBuild virtual internshi...,Others
7,197cee8a247560c1,"Powered through the week? Now, power up at Cos...",FoodOrders
8,197cecaddc2cca3b,We found something you might like How about an...,SocialMedia
9,197ce887aad98b83,"Your skills are in demand, Tushar Let&#39;s ma...",Others


### Emails form each weak label

In [6]:
print(df["label"].value_counts())

label
Others            368
Promotions         35
NewsLetter         28
FoodOrders         23
SocialMedia        21
LinkedIn           14
GoogleForm          6
InvoiceReceipt      3
TravelBooking       1
LoginAttempt        1
Name: count, dtype: int64


### Removing Weak Labels with very small count beacause they don't really help with classification

In [7]:
counts = df["label"].value_counts();
small_classes = []
for label, count in counts.items():
    if count < 20:
        small_classes.append(label)

df["label"] = df["label"].apply(lambda x: "Others" if x in small_classes else x)

### Downsample "Others" class as it count is much greater than any other class

In [8]:
from sklearn.utils import resample
df_majority = df[df.label == "Others"]
df_minority = df[df.label != "Others"]

df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=40, 
    random_state=10
)

df_balanced = pd.concat([df_minority, df_majority_downsampled])
print(df_balanced["label"].value_counts())

label
Others         40
Promotions     35
NewsLetter     28
FoodOrders     23
SocialMedia    21
Name: count, dtype: int64


### Splitting the data into train and test

In [9]:
from sklearn.model_selection import train_test_split
X = df_balanced["text"]
y = df_balanced["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

### Training a logistic regression model to predict labels for future emails instead of relying on weak labels

<br/>

##### -- Since logistic regression can't directly understand words we need to use something called word embedding that converts words in vectors, more specifically using TF-IDF (Term Frequency Inverse Document Frequency) word embedding which is a frequency based word embedding. TF-IDF calculates how rare a frequent word in a document is in a corpus of documents, can be used to fugure out the keywords in a document.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=10)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

### Performance Report

In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  FoodOrders       1.00      0.67      0.80         3
  NewsLetter       0.86      1.00      0.92         6
      Others       0.64      1.00      0.78         7
  Promotions       0.80      0.67      0.73         6
 SocialMedia       1.00      0.62      0.77         8

    accuracy                           0.80        30
   macro avg       0.86      0.79      0.80        30
weighted avg       0.85      0.80      0.80        30



### Example Data

In [29]:
examples = [
    "Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!",
    "Your response has been recorded for Google Form: Feedback Survey 2025.",
    "New sign-in attempt detected on your account from a new device. Was this you?",
    "Quora Digest: Top stories for you this week — Discover trending questions and answers.",
]

example_vecs = vectorizer.transform(examples)
predicted_labels = clf.predict(example_vecs)
for text, label in zip(examples, predicted_labels):
    print(f"Text: {text}")
    print(f"Predicted label: {label}")


Text: Your Swiggy order #1234 has been delivered successfully. Enjoy your meal!
Predicted label: FoodOrders
Text: Your response has been recorded for Google Form: Feedback Survey 2025.
Predicted label: Others
Text: New sign-in attempt detected on your account from a new device. Was this you?
Predicted label: Others
Text: Quora Digest: Top stories for you this week — Discover trending questions and answers.
Predicted label: NewsLetter
