In [1]:
#training bert for email sentiment analysis
!pip install transformers torch scikit-learn pandas openpyxl



In [5]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/emails.csv")

# Check size
print(df.shape)

df = df[['message']]

df_sample = df.sample(n=20000, random_state=42)

df_sample['original_email'] = df_sample['message']

df_sample = df_sample.reset_index(drop=True)

#Extract email body
def extract_body(email):
    if isinstance(email, str):
        parts = email.split("\n\n", 1)
        if len(parts) > 1:
            return parts[1]
    return email
df_sample['email_body'] = df_sample['original_email'].apply(extract_body)

#cleaning text
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)        # remove URLs
    text = re.sub(r"\n", " ", text)            # remove newlines
    text = re.sub(r"[^a-z\s]", "", text)       # remove symbols
    text = re.sub(r"\s+", " ", text).strip()
    return text
df_sample['clean_email'] = df_sample['email_body'].apply(clean_text)

#removing empty rows
df_sample = df_sample[df_sample['clean_email'].str.len() > 20]
df_sample = df_sample.reset_index(drop=True)

#installing and importing vader
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

#initializing vader
sia = SentimentIntensityAnalyzer()

#get sentiment score
def get_sentiment_score(text):
    return sia.polarity_scores(text)['compound']
df_sample['sentiment_score'] = df_sample['clean_email'].apply(get_sentiment_score)

#converting score to sentiment label
def get_sentiment_label(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"
df_sample['sentiment'] = df_sample['sentiment_score'].apply(get_sentiment_label)

#final columns
df_sample = df_sample[['clean_email','sentiment']]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(517401, 2)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

# Encode labels
label_encoder = LabelEncoder()
df_sample['sentiment'] = label_encoder.fit_transform(df_sample['sentiment'])

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sample['clean_email'], df_sample['sentiment'], test_size=0.2, random_state=42, stratify=df_sample['sentiment']
)

# -----------------------------
# 2. TOKENIZATION
# -----------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128
)

test_encodings = tokenizer(
    test_texts.tolist(),
    truncation=True,
    padding=True,
    max_length=128
)

# -----------------------------
# 3. DATASET CLASS
# -----------------------------
class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
test_dataset = EmailDataset(test_encodings, test_labels)

# -----------------------------
# 4. LOAD MODEL
# -----------------------------
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

# -----------------------------
# 5. TRAINING CONFIG (LIGHT)
# -----------------------------
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/bert_results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    load_best_model_at_end=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# -----------------------------
# 6. TRAIN
# -----------------------------
trainer.train()

MODEL_DIR = "/content/drive/MyDrive/distilbert_email_model"
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.5575
200,0.4744
300,0.4468
400,0.4092
500,0.4537
600,0.3815
700,0.37
800,0.3462
900,0.3479
1000,0.3884


('/content/drive/MyDrive/distilbert_email_model/tokenizer_config.json',
 '/content/drive/MyDrive/distilbert_email_model/special_tokens_map.json',
 '/content/drive/MyDrive/distilbert_email_model/vocab.txt',
 '/content/drive/MyDrive/distilbert_email_model/added_tokens.json',
 '/content/drive/MyDrive/distilbert_email_model/tokenizer.json')

In [15]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import torch

MODEL_DIR = "/content/drive/MyDrive/distilbert_email_model"

print(os.path.exists(MODEL_DIR))

from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

#MODEL_DIR = "/content/drive/MyDrive/distilbert_email_model"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)

# -----------------------------
# 7. PREDICT ON NEW EXCEL
# -----------------------------

new_df = pd.read_excel("/content/drive/MyDrive/sample_input_emails.xlsx")
new_df["email_text"] = new_df["email_text"].astype(str)

new_encodings = tokenizer(
    new_df["email_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=1)

id2label = {
    0: "Negative",
    1: "Neutral",
    2: "Positive"
}

new_df["predicted_sentiment"] = [
    id2label[int(i)] for i in predictions.numpy()
]
# -----------------------------
# 8. SAVE OUTPUT
# -----------------------------
new_df.to_excel("/content/drive/MyDrive/bert_predicted_sentiments.xlsx", index=False)

print("✅ DistilBERT prediction complete. Output saved.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
True
✅ DistilBERT prediction complete. Output saved.
