In [None]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

email_data = pd.read_csv('/content/drive/MyDrive/FYPDataset/email_batch_3.csv')

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers datasets torch scikit-learn pandas tqdm

In [None]:
!pip install datasets

In [None]:
import os

print("Model Exists:", os.path.exists("bert_email_classifier"))
print("Files in Model Folder:", os.listdir("bert_email_classifier") if os.path.exists("bert_email_classifier") else "No model found")


In [None]:
import torch
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

In [None]:
!pip install datasets

#Machine Learning

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset, DatasetDict
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer

from google.colab import drive
drive.mount('/content/drive')
# -------------------------
# 1. Prepare Your Labeled DataFrame
# -------------------------
labeled_path = "/content/drive/MyDrive/FYPDataset/email_batch_1_labeled.csv"
labeled_df = pd.read_csv(labeled_path)

# Assume labeled_df has columns "Combined_Text" and "Predicted_Category"
# Create a mapping from textual labels in "Predicted_Category" to numeric IDs
labels = labeled_df["Predicted_Category"].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map textual labels to numeric labels
labeled_df["Predicted_Category"] = labeled_df["Predicted_Category"].map(label2id)

# Rename the label column to "labels" for Trainer compatibility
labeled_df = labeled_df.rename(columns={"Predicted_Category": "labels"})

# Split the data into training and testing sets
train_df, test_df = train_test_split(labeled_df, test_size=0.2, random_state=42)

# -------------------------
# 2. Compute Class Weights
# -------------------------
# Compute class weights using sklearn
class_weights = compute_class_weight('balanced', classes=np.unique(train_df["labels"]), y=train_df["labels"])
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)

# -------------------------
# 3. Convert DataFrames to Datasets
# -------------------------
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# -------------------------
# 4. Tokenize the Labeled Dataset
# -------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["Combined_Text"], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns; keep only "input_ids", "attention_mask", and "labels"
columns_to_remove = [col for col in dataset["train"].column_names if col not in ["input_ids", "attention_mask", "labels"]]
dataset = dataset.remove_columns(columns_to_remove)
dataset.set_format("torch")

# -------------------------
# 5. Load Pretrained Model for Sequence Classification
# -------------------------
num_labels = len(label2id)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)
model.config.id2label = id2label
model.config.label2id = label2id

# -------------------------
# 6. Define a Metrics Function for Evaluation
# -------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# -------------------------
# 7. Create a Custom Trainer with Weighted Loss
# -------------------------
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# -------------------------
# 8. Set Up Training Arguments
# -------------------------
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # (Deprecated: consider using eval_strategy in newer versions)
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to=[]  # disable wandb logging if not needed
)

# Initialize the custom trainer with class weights
trainer = WeightedLossTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# -------------------------
# 9. Fine-Tune the Model
# -------------------------
trainer.train()

# -------------------------
# 10. Save the Fine-Tuned Model
# -------------------------
fine_tuned_model_path = "./fine_tuned_email_classifier"
trainer.save_model(fine_tuned_model_path)

# -------------------------
# 11. Inference on the Unlabeled Dataset
# -------------------------
unlabeled_path = "/content/drive/MyDrive/FYPDataset/email_batch_2.csv"
unlabeled_df = pd.read_csv(unlabeled_path)

# Tokenize the unlabeled dataset using the same tokenizer
unlabeled_encodings = tokenizer(
    unlabeled_df["Combined_Text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Move the encodings to the same device as the model
device = model.device  # e.g., 'cuda:0'
unlabeled_encodings = {k: v.to(device) for k, v in unlabeled_encodings.items()}

# Run inference
model.eval()
with torch.no_grad():
    outputs = model(**unlabeled_encodings)
    predictions = torch.argmax(outputs.logits, axis=-1).tolist()

# Map numeric predictions back to text labels
predicted_labels = [id2label[pred] for pred in predictions]
unlabeled_df["Predicted_Label"] = predicted_labels

# Save the predictions for the unlabeled dataset
output_path = "/content/drive/MyDrive/FYPDataset/email_batch_2_labeled.csv"
unlabeled_df.to_csv(output_path, index=False)

print("Inference complete. Predicted labels saved to:", output_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Class weights: tensor([0.7890, 1.3652])


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4173,0.468123,0.791,0.79105,0.791101,0.791
2,0.3566,0.453269,0.817,0.815774,0.815619,0.817
3,0.3775,0.483778,0.821,0.821208,0.821458,0.821


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.32 GiB. GPU 0 has a total capacity of 14.74 GiB of which 6.02 GiB is free. Process 2288 has 8.72 GiB memory in use. Of the allocated memory 8.38 GiB is allocated by PyTorch, and 214.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

unlabeled_path = "/content/drive/MyDrive/FYPDataset/email_batch_1.csv"
unlabeled_df = pd.read_csv(unlabeled_path)

# Tokenize the unlabeled dataset using the same tokenizer
unlabeled_encodings = tokenizer(
    unlabeled_df["Combined_Text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Create a TensorDataset for the input tensors
input_ids = unlabeled_encodings["input_ids"]
attention_mask = unlabeled_encodings["attention_mask"]
unlabeled_dataset = TensorDataset(input_ids, attention_mask)

# Define a small batch size (adjust as needed)
batch_size = 8
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=batch_size)

# Ensure the model is on the correct device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_predictions = []
model.eval()

with torch.no_grad():
    for batch in unlabeled_loader:
        batch_input_ids, batch_attention_mask = (b.to(device) for b in batch)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_predictions = torch.argmax(outputs.logits, axis=-1).tolist()
        all_predictions.extend(batch_predictions)

# Map numeric predictions back to text labels
predicted_labels = [id2label[pred] for pred in all_predictions]
unlabeled_df["Predicted_Label"] = predicted_labels

# Save the predictions for the unlabeled dataset
output_path = "/content/drive/MyDrive/FYPDataset/email_batch_1_labeled.csv"
unlabeled_df.to_csv(output_path, index=False)

print("Inference complete. Predicted labels saved to:", output_path)

# Hugging Face - roberta-base

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# ✅ Force GPU usage if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Load Model & Move to GPU
model_name = "roberta-base"
#model = BertForSequenceClassification.from_pretrained(model_name).to("cuda")
#tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Define Categories
categories = [
    "Spam", "Promotion", "Business Communication", "Meeting & Scheduling",
    "General Discussion & Internal Updates", "IT Alerts & System Notifications",
    "Legal & Contractual", "Purely Personal"
]

# ✅ Function to Predict Category (Force GPU Usage)
def classify_email(email_text):
    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)  # Convert to probabilities
    predicted_label = torch.argmax(probabilities, dim=1).item()

    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Probabilities: {probabilities.tolist()}")
    print(f"🔹 Predicted Label Index: {predicted_label} → {categories[predicted_label]}")

    return categories[predicted_label]  # Convert index to category name

# ✅ Apply Classification & Print Scores
email_data["Predicted_Category"] = email_data["Combined_Text"].apply(classify_email)

print("✅ Classification Completed! Results Saved.")

Using device: cpu


model.safetensors:  69%|######9   | 346M/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
email_data['Predicted_Category'].value_counts()

Unnamed: 0_level_0,count
Predicted_Category,Unnamed: 1_level_1
Promotion & Newsletters,2549
Spam,1212
General Discussion & Internal Updates,763
Business Communication,476


In [None]:
random_emails = email_data.sample(n=10)  #10 random emails

for index, row in random_emails.iterrows():
    print("\n📌 **Random Email Selected:**")
    print(f"**From:** {row['From']}")
    print(f"**To:** {row['To']}")
    print(f"**Subject:** {row['Subject']}")
    print(f"**Message:** {row['Message']}")
    print(f"**Email Category:** {row['Predicted_Category']}")
    print("-" * 80)

#GPT - EleutherAI

In [None]:
!huggingface-cli login

In [None]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

email_data = pd.read_csv('/content/drive/MyDrive/FYPDataset/email_batch_3.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
email_data

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load GPT-Neo text-generation pipeline
# Using "EleutherAI/gpt-neo-125M" as an example; you can choose a larger model if resources allow
classifier = pipeline("text-generation", model="EleutherAI/gpt-neo-125M", device=0 if device == "cuda" else -1)

# Define candidate categories (your predefined labels)
categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Internal Communication",  # merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication"
]

def classify_email(email_text):
    # Create a prompt instructing GPT-Neo to classify the email.
    prompt = (
    f"Email: {email_text}\n"
    "The email above is part of a corporate communication system. "
    "Based on its content, choose the most suitable category from the list below:\n"
    f"{', '.join(categories)}.\n"
    "Answer with only the category name."
)


    # Generate a response; adjust max_length if necessary
    generated = generator(prompt, max_new_tokens=40, num_return_sequences=1)[0]['generated_text']

    # Convert generated text to lowercase for case-insensitive matching
    generated_lower = generated.lower()
    predicted = None
    for cat in categories:
        if cat.lower() in generated_lower:
            predicted = cat
            break
    # Default to "Internal Communication" if no candidate label is found
    if predicted is None:
        predicted = "Internal Communication"

    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted}")

    return predicted

# Load your dataset (ensure the CSV file exists at the specified path)
email_data = pd.read_csv("/content/drive/MyDrive/FYPDataset/email_batch_3.csv")

# Process the emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

# Save and download the labeled CSV file
email_data.to_csv("email_batch_3_labeled.csv", index=False)
files.download("email_batch_3_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

# Zero Shot FACEBOOK

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --extra-index-url https://download.pytorch.org/whl/cu118


In [None]:
!nvidia-smi


In [None]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

email_data = pd.read_csv('/content/drive/MyDrive/FYPDataset/email_batch_1.csv')

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

In [None]:
email_data['Predicted_Category'].value_counts()

In [None]:
random_emails = email_data.sample(n=10)  #10 random emails

for index, row in random_emails.iterrows():
    print("\n📌 **Random Email Selected:**")
    print(f"**From:** {row['From']}")
    print(f"**To:** {row['To']}")
    print(f"**Subject:** {row['Subject']}")
    print(f"**Message:** {row['Message']}")
    print(f"**Email Category:** {row['Predicted_Category']}")
    print("-" * 80)

#BERT-based - Not Accurate

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use BERT-based zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="bert-base-uncased", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Processing Emails:   0%|          | 2/20000 [00:03<9:40:51,  1.74s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 3/20000 [00:06<11:44:56,  2.12s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 4/20000 [00:08<12:51:11,  2.31s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 5/20000 [00:11<13:27:47,  2.42s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 6/20000 [00:13<12:08:42,  2.19s/it]


🔹 Predicted Category: Finance & Transactions


Processing Emails:   0%|          | 7/20000 [00:15<11:43:23,  2.11s/it]


🔹 Predicted Category: Finance & Transactions


Processing Emails:   0%|          | 8/20000 [00:17<12:39:11,  2.28s/it]


📌 **Email:** best company survey enron innovative company ameri...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 9/20000 [00:19<12:26:54,  2.24s/it]


📌 **Email:** announcing market virtually untapped like learn co...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 10/20000 [00:21<12:06:30,  2.18s/it]


📌 **Email:** announcing market virtually untapped like learn co...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 11/20000 [00:24<12:02:03,  2.17s/it]


📌 **Email:** announcing market virtually untapped like learn co...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 12/20000 [00:27<13:41:46,  2.47s/it]


📌 **Email:** activity boss nutcracker market invitation nutcrac...
🔹 Predicted Category: Legal & Contractual


Processing Emails:   0%|          | 13/20000 [00:29<13:05:24,  2.36s/it]


📌 **Email:** aggie virus receive aggie virus programming experi...
🔹 Predicted Category: Business Communication


Processing Emails:   0%|          | 14/20000 [00:31<12:04:59,  2.18s/it]


📌 **Email:** aggie virus aggie virus receive programming experi...
🔹 Predicted Category: Business Communication


Processing Emails:   0%|          | 15/20000 [00:32<10:27:19,  1.88s/it]


📌 **Email:** ale document discussion ale documents mr hodge nee...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 16/20000 [00:33<9:00:04,  1.62s/it] 


📌 **Email:** american disability actin fifth circuit featured s...
🔹 Predicted Category: Legal & Contractual


Processing Emails:   0%|          | 17/20000 [00:35<9:28:52,  1.71s/it]


📌 **Email:** crack ken write email file transmit elpaso corpora...
🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 18/20000 [00:36<9:29:04,  1.71s/it]


📌 **Email:** art new timedate art thursday november need bring ...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 19/20000 [00:39<11:03:25,  1.99s/it]


📌 **Email:** autopathing buysells guy currently system design m...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 20/20000 [00:41<10:33:05,  1.90s/it]


📌 **Email:** baker mckenzie elaw alert newsletter get baker mck...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 21/20000 [00:42<10:09:01,  1.83s/it]


📌 **Email:** baker mckenzie elaw alert fyi baker mckenzie law a...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 22/20000 [00:46<12:34:07,  2.26s/it]


📌 **Email:** best play millennium brilliant idea nice jennifer ...
🔹 Predicted Category: Legal & Contractual


Processing Emails:   0%|          | 23/20000 [00:48<12:35:43,  2.27s/it]


📌 **Email:** best practice meeting tuesday feb plan attend prac...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 24/20000 [00:49<10:27:42,  1.89s/it]


📌 **Email:** brokerage agreement meeting meeting move thursday ...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 25/20000 [00:50<9:27:56,  1.71s/it] 


📌 **Email:** brokerage agreement meeting email send find confer...
🔹 Predicted Category: Meeting & Scheduling


Processing Emails:   0%|          | 26/20000 [00:51<8:40:08,  1.56s/it]


📌 **Email:** business brief quick update election activity come...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 27/20000 [00:54<9:27:58,  1.71s/it]


📌 **Email:** chinese wall classroom training chinese wall train...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 28/20000 [00:57<11:25:49,  2.06s/it]


📌 **Email:** christmas around world open invitation hello time ...
🔹 Predicted Category: Promotion & Newsletters





KeyboardInterrupt: 

#roberta-large - Not Accurate

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use RoBERTa-based zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="roberta-large", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")


Using device: cpu


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Processing Emails:   0%|          | 2/20000 [00:26<73:26:29, 13.22s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 3/20000 [00:49<96:01:36, 17.29s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 4/20000 [01:07<98:22:04, 17.71s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 5/20000 [01:26<100:27:56, 18.09s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 6/20000 [01:34<82:07:23, 14.79s/it] 


🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 7/20000 [01:40<65:36:52, 11.81s/it]


🔹 Predicted Category: Personal Communication & Purely Personal


Processing Emails:   0%|          | 7/20000 [01:40<80:06:13, 14.42s/it]


KeyboardInterrupt: 

# distilbert-base-uncased - Very not Accurate

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use DistilBERT-based zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="distilbert-base-uncased", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Processing Emails:   0%|          | 55/20000 [01:35<8:27:24,  1.53s/it]


📌 **Email:** datek online execution report dear mr larry campbe...
🔹 Predicted Category: Promotion & Newsletters


Processing Emails:   0%|          | 55/20000 [01:37<9:48:25,  1.77s/it]


KeyboardInterrupt: 

#sentence-transformers/paraphrase-xlm-r-multilingual-v1

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use Sentence-Transformers-based paraphrase model for sentence similarity (multilingual)
classifier = pipeline("zero-shot-classification", model="sentence-transformers/paraphrase-xlm-r-multilingual-v1", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Processing Emails:   0%|          | 30/20000 [01:25<15:45:11,  2.84s/it]


📌 **Email:** credit seminar tanya legal group like schedule ins...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 30/20000 [01:27<16:11:32,  2.92s/it]


KeyboardInterrupt: 

#microsoft/deberta-v3-large

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use facebook/bart-large-mnli for zero-shot classification
classifier = pipeline("zero-shot-classification", model="microsoft/deberta-v3-large", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Using device: cpu


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.

Processing Emails:   0%|          | 0/20000 [00:00<?, ?it/s][AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

Processing Emails:   0%|          | 2/20000 [00:43<119:33:58, 21.52s/it][A


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Spam


Processing Emails:   0%|          | 2/20000 [01:04<179:31:15, 32.32s/it]


KeyboardInterrupt: 

#xlnet-large-cased

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Use facebook/bart-large-mnli for zero-shot classification
classifier = pipeline("zero-shot-classification", model="xlnet-large-cased", device=0 if device == "cuda" else -1)

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email(email_text):
    result = classifier(email_text, candidate_labels=categories, multi_label=False)
    predicted_label = result["labels"][0]
    print(f"\n📌 **Email:** {email_text[:50]}...")
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

tqdm.pandas(desc="Processing Emails")  # Enable progress bar
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email)

email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Using device: cpu


config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Device set to use cpu
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.

Processing Emails:   0%|          | 0/20000 [00:00<?, ?it/s][AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

Processing Emails:   0%|          | 2/20000 [00:30<85:09:41, 15.33s/it][A


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling



Processing Emails:   0%|          | 3/20000 [00:45<83:00:06, 14.94s/it][A


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling



Processing Emails:   0%|          | 4/20000 [00:58<80:52:31, 14.56s/it][A


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling



Processing Emails:   0%|          | 5/20000 [01:13<81:10:25, 14.61s/it][A


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Meeting & Scheduling



Processing Emails:   0%|          | 6/20000 [01:22<70:39:25, 12.72s/it][A


🔹 Predicted Category: Spam



Processing Emails:   0%|          | 7/20000 [01:30<61:49:27, 11.13s/it][A


🔹 Predicted Category: Spam



Processing Emails:   0%|          | 8/20000 [01:43<65:32:02, 11.80s/it][A


📌 **Email:** best company survey enron innovative company ameri...
🔹 Predicted Category: Finance & Transactions



Processing Emails:   0%|          | 9/20000 [01:57<68:59:26, 12.42s/it][A


📌 **Email:** announcing market virtually untapped like learn co...
🔹 Predicted Category: Business Communication


Processing Emails:   0%|          | 9/20000 [02:01<75:14:03, 13.55s/it]


KeyboardInterrupt: 

#LLama - Need license

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the LLaMA model and tokenizer
model_name = "meta-llama/Llama-3.1-8B"  # Ensure the LLaMA model is accessible
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define your categories
categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email_with_llama(email_text):
    # Construct the prompt for LLaMA
    prompt = f"Classify the following email into one of these categories: {', '.join(categories)}. Email: {email_text}"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the response (adjust max_length as needed)
    outputs = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)

    # Decode and process the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the predicted category from the output (assuming the model generates the category at the end of the prompt)
    predicted_category = output_text.split('Category:')[-1].strip()

    # Print the email and predicted category
    print(f"\n📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
    print(f"🔹 Predicted Category: {predicted_category}")

    return predicted_category

# Example usage
email_data = pd.read_csv("path_to_your_email_data.csv")  # Make sure to load your email data

# Process emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email_with_llama)

# Save the labeled data to a CSV file
email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Using device: cpu


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B.
401 Client Error. (Request ID: Root=1-67d43f58-635588620a1dc91373c41454;2fe4d50b-be21-43aa-8316-01df0e075625)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B is restricted. You must have access to it and be authenticated to access it. Please log in.

#tiiuae/falcon-180B

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load Falcon-180B model and tokenizer from Hugging Face Model Hub
model_name = "tiiuae/falcon-180B"  # Replace with the correct Hugging Face model name if different
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define categories
categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email_with_falcon(email_text):
    # Construct the prompt for Falcon-180B
    prompt = f"Classify the following email into one of these categories: {', '.join(categories)}. Email: {email_text}"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the response (adjust max_length as needed)
    outputs = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)

    # Decode and process the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the predicted category (assuming the model generates the category at the end of the prompt)
    predicted_category = output_text.split('Category:')[-1].strip()

    # Print the email and predicted category
    print(f"\n📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
    print(f"🔹 Predicted Category: {predicted_category}")

    return predicted_category

# Example usage
email_data = pd.read_csv("path_to_your_email_data.csv")  # Load your email data here

# Process emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email_with_falcon)

# Save the labeled data to a CSV file
email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

#Bloom

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load BLOOM model and tokenizer from Hugging Face Model Hub
model_name = "bigscience/bloom-560m"  # You can replace with the larger model like bloom-1b1, bloom-3b, etc.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define categories
categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email_with_bloom(email_text):
    # Construct the prompt for BLOOM
    prompt = f"Classify the following email into one of these categories: {', '.join(categories)}. Email: {email_text}"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the response (adjust max_length as needed)
    outputs = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)

    # Decode and process the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the predicted category from the output (assuming the model generates the category at the end of the prompt)
    predicted_category = output_text.split('Category:')[-1].strip()

    # Print the email and predicted category
    print(f"\n📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
    print(f"🔹 Predicted Category: {predicted_category}")

    return predicted_category

# Process emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email_with_bloom)

# Save the labeled data to a CSV file
email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

Using device: cpu


Processing Emails:   0%|          | 2/20000 [01:59<331:19:37, 59.64s/it]


📌 **Email:** oati etag minimum requirement time run short compa...
🔹 Predicted Category: Classify the following email into one of these categories: Spam, Promotion & Newsletters, Finance & Transactions, Business Communication, Meeting & Scheduling, IT Alerts & System Notifications, Legal & Contractual, Personal Communication & Purely Personal. Email: oati etag minimum requirement time run short company prepare etag minimum required step complete prior march entity update nerc registry tp designate por pod valid tag pse associate scheduling inquire detail oati user function digital certificates certificate access internally determine supervisor contact begin process acquire document send failure result partial lack capability additional information critical etagging issue visit page frank billington manager customer services customer support customer service customer support customer support customer support customer support customer support customer support customer support customer s

Processing Emails:   0%|          | 2/20000 [03:39<609:02:04, 109.64s/it]


KeyboardInterrupt: 

#Qwen

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Force GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load Qwen model and tokenizer from Hugging Face Model Hub (or other platform)
model_name = "qwen/qwen-large"  # Use the actual model name for Qwen
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define categories for classification
categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",  # Merged category for formal business communication & general internal updates.
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email_with_qwen(email_text):
    # Construct the prompt for Qwen
    prompt = f"Classify the following email into one of these categories: {', '.join(categories)}. Email: {email_text}"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the response (adjust max_length as needed)
    outputs = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)

    # Decode and process the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the predicted category from the output (assuming the model generates the category at the end of the prompt)
    predicted_category = output_text.split('Category:')[-1].strip()

    # Print the email and predicted category
    print(f"\n📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
    print(f"🔹 Predicted Category: {predicted_category}")

    return predicted_category

# Process emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email_with_qwen)

# Save the labeled data to a CSV file
email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

#GPT2

In [None]:
from transformers import pipeline
from tqdm import tqdm
from google.colab import files
import torch
import pandas as pd

# Load DistilGPT-2 for text generation or zero-shot classification
classifier = pipeline("zero-shot-classification", model="distilgpt2")

categories = [
    "Spam",
    "Promotion & Newsletters",
    "Finance & Transactions",
    "Business Communication",
    "Meeting & Scheduling",
    "IT Alerts & System Notifications",
    "Legal & Contractual",
    "Personal Communication & Purely Personal"
]

def classify_email_with_distilgpt2(email_text):
    result = classifier(email_text, candidate_labels=categories)
    predicted_label = result["labels"][0]
    print(f"📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
    print(f"🔹 Predicted Category: {predicted_label}")
    return predicted_label

# Process emails with a progress bar
tqdm.pandas(desc="Processing Emails")
email_data["Predicted_Category"] = email_data["Combined_Text"].progress_apply(classify_email_with_distilgpt2)

# Save the labeled data to a CSV file
email_data.to_csv("email_batch_1_labeled.csv", index=False)
files.download("email_batch_1_labeled.csv")

print(f"✅ Classification Completed! Total emails processed: {len(email_data)}")

#Ollama

In [None]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

email_data = pd.read_csv('/content/drive/MyDrive/FYPDataset/email_batch_1.csv')

Mounted at /content/drive


In [None]:
!pip install tqdm pandas requests



In [None]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.4.7


In [None]:
import requests

def label_email_llama3(email_text):
    prompt = f"""Read the email below and classify it into exactly one of the following categories.
    Return only the category name without any explanation.

    Categories:
    - Spam
    - Promotion and Newsletter
    - Business Communication
    - IT Alerts & System Notifications
    - Personal Communication & Purely Personal
    - Legal & Contractual
    - Finance & Transactions
    - Meeting & Scheduling

    Email Content: "{email_text}"

    Category:
    """

    try:
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'llama3.2',
                'prompt': prompt,
                'stream': False,
                'temperature': 0.1,
                'max_tokens': 5
            }
        ).json()

        print("Raw API Response:", response)

        if 'response' in response:
            return response['response'].strip()
        else:
            return f"Error: 'response' key not found. Full response: {response}"

    except Exception as e:
        return f"Request failed: {e}"

test_email = "curve validation report repository all curve validation folder repository setup drive include follow structure bold name item subfolder reporting consolidated ees egm financial enron americas gas power canada broadband services global assets markets excl industrial procedures template procedure rollout document request access base team exhibit ain mail week hopefully mid go forward additional personnel group need fill come tome therm clear right let know question kc"
print(label_email_llama3(test_email))

Raw API Response: {'model': 'llama3.2', 'created_at': '2025-03-15T03:43:16.065550421Z', 'response': 'Spam', 'done': True, 'done_reason': 'stop', 'context': [128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 271, 4518, 279, 2613, 3770, 323, 49229, 433, 1139, 7041, 832, 315, 279, 2768, 11306, 13, 720, 262, 3494, 1193, 279, 5699, 836, 2085, 904, 16540, 382, 262, 29312, 512, 262, 482, 82767, 198, 262, 482, 57204, 323, 39693, 198, 262, 482, 8184, 31966, 198, 262, 482, 8871, 69408, 612, 744, 54038, 198, 262, 482, 19758, 31966, 612, 30688, 398, 19758, 198, 262, 482, 25705, 612, 19735, 940, 198, 262, 482, 23261, 612, 56385, 198, 262, 482, 30155, 612, 328, 45456, 271, 262, 8463, 9059, 25, 330, 51151, 10741, 1934, 12827, 682, 16029, 10741, 8695, 12827, 6642, 6678, 2997, 1833, 6070, 14265, 836, 1537, 1207, 18135, 13122, 60391, 384, 288, 8866, 76, 6020, 665, 2298, 66879, 300, 6962, 2410, 32863, 41925, 3600, 3728, 12032, 11987, 81384, 13

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import requests
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# 📌 Step 3: Load the Dataset from Google Drive
file_path = "/content/drive/MyDrive/FYPDataset/email_batch_1.csv"  # Adjust path if needed
email_data = pd.read_csv(file_path)

# 📌 Step 4: Define the Llama 3.2 API Function
def label_email_llama3(email_text):
    """Calls Llama 3.2 API and prints email sample with category"""
    prompt = f"""Read the email below and classify it into exactly one of the following categories.
    Return only the category name without any explanation.

    Categories:
    - Spam
    - Promotion and Newsletter
    - Business Communication
    - IT Alerts & System Notifications
    - Personal Communication & Purely Personal
    - Legal & Contractual
    - Finance & Transactions
    - Meeting & Scheduling

    Email Content: "{email_text}"

    Category:
    """

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",  # Directly use localhost
            json={
                'model': 'llama3.2',
                'prompt': prompt,
                'stream': False,
                'temperature': 0.1,
                'max_tokens': 5
            },
            timeout=10
        ).json()

        predicted_label = response.get('response', 'Error').strip()

        # 📌 Show Progress in Colab Console
        print(f"📌 **Email:** {email_text[:50]}...")  # Print first 50 characters of the email
        print(f"🔹 Predicted Category: {predicted_label}\n")

        return predicted_label

    except requests.exceptions.RequestException:
        return "Error"

# 📌 Step 5: Apply Parallel Processing for Faster Execution
def process_batch(batch):
    return batch["Message"].progress_apply(label_email_llama3)  # tqdm progress bar

# 📌 Step 6: Run in Batches (Optimized for Colab)
BATCH_SIZE = 500
NUM_WORKERS = 3

tqdm.pandas(desc="🚀 Processing Emails")

with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    results = list(tqdm(executor.map(process_batch,
                                     [email_data.iloc[i:i + BATCH_SIZE] for i in range(0, len(email_data), BATCH_SIZE)]),
                        total=len(email_data) // BATCH_SIZE))

# 📌 Step 7: Store Results in DataFrame
email_data["Ollama_Category"] = [category for batch in results for category in batch]

# 📌 Step 8: Save to Google Drive
output_file = "/content/drive/MyDrive/FYPDataset/email_batch_1_labeled.csv"
email_data.to_csv(output_file, index=False)

# 📊 Show category distribution
print(email_data["Ollama_Category"].value_counts())

print(f"✅ Processed file saved at: {output_file}")

In [None]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

email_data = pd.read_csv('/content/drive/MyDrive/FYPDataset/email_batch_1_labeled.csv')

Mounted at /content/drive


In [None]:
email_data['Ollama_Category'].value_counts()

#Qwen2.5 Model

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
!ollama serve

Couldn't find '/root/.ollama/id_ed25519'. Generating new private key.
Your new public key is: 

ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIO+w4YmeXgsyr3iDMzMdARox+zOK23u7RfD+SfKHRx/T

2025/03/17 01:45:21 routes.go:1230: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:2048 OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://

In [None]:
!ollama pull qwen2.5

In [None]:
!ollama run qwen2.5

[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠇ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠋ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠏ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l

In [None]:
!OLLAMA_BACKEND=cuda ollama run qwen2.5

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?2004h>>> [38;5;245mSend a message (/? for help)[28D[0m[KHi
[?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠼ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hHello[?25l[?25h![?25l[?25h How[?25l[?25h can[?25l[?25h I[?25l[?25h assist[?25l[?25h you[?25l[?25h today[?25l[?25h?[?25l[?25h

[?25l[?25h>>> [38;5;245mSend a message (/? for help)[28D[0m^C


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import requests
import pandas as pd
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

file_path = "/content/drive/MyDrive/FYPDataset/email_batch_9.csv"
email_data = pd.read_csv(file_path)

if "Message" not in email_data.columns:
    raise ValueError("🚨 ERROR: 'Message' column not found in the dataset. Check the CSV file.")

def label_email_qwen2_5(email_text, progress_bar):
    """Calls Qwen2.5 API and forces classification."""
    prompt = f"""You are analyzing emails specifically from the **Enron Corporation**.
    The following email belongs to Enron's historical email dataset.

    Your task is to classify it into exactly one of the following categories.
    **You must choose only one category from the list below** and return only the category name without explanation.
    If uncertain, choose the closest category.

    **Categories (Choose exactly one):**
    - Spam
    - Promotion & Newsletter
    - General Business Communication
    - Internal Policies & HR Updates
    - Meeting & Scheduling
    - Project Management & Strategy
    - Mergers, Partnerships & Alliances
    - IT Alerts & System Notifications
    - Personal Communication & Purely Personal
    - Legal & Contractual
    - Finance & Transactions

    **Email Content (from Enron):**
    "{email_text}"

    **Category (return only one from the list above, nothing else):**
    """

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                'model': 'qwen2.5',
                'prompt': prompt,
                'stream': False,
                'temperature': 0.1,
                'max_tokens': 10
            },
            timeout=15
        ).json()

        predicted_label = response.get('response', 'Error').strip()

        valid_categories = [
            "Spam", "Promotion & Newsletter", "General Business Communication", "Internal Policies & HR Updates",
            "Meeting & Scheduling", "Project Management & Strategy", "Mergers, Partnerships & Alliances",
            "IT Alerts & System Notifications", "Personal Communication & Purely Personal",
            "Legal & Contractual", "Finance & Transactions"
        ]
        if predicted_label not in valid_categories:
            predicted_label = "General Business Communication"

        progress_bar.update(1)

        return predicted_label

    except requests.exceptions.RequestException:
        progress_bar.update(1)
        return "Error"

def process_batch(batch, progress_bar):
    return batch["Message"].apply(lambda email: label_email_qwen2_5(email, progress_bar))

BATCH_SIZE = 500
NUM_WORKERS = 3

with tqdm(total=len(email_data), desc="🚀 Processing Emails") as progress_bar:
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        results = list(executor.map(
            lambda batch: process_batch(batch, progress_bar),
            [email_data.iloc[i:i + BATCH_SIZE] for i in range(0, len(email_data), BATCH_SIZE)]
        ))

email_data["Qwen2.5_Category"] = [category for batch in results for category in batch]

output_file = "/content/drive/MyDrive/FYPDataset/email_batch_9_labeled_qwen2.5.csv"
email_data.to_csv(output_file, index=False)

print(email_data["Qwen2.5_Category"].value_counts())

print(f"✅ Processed {len(email_data)} emails.")
print(f"✅ Processed file saved at: {output_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


🚀 Processing Emails: 100%|██████████| 16477/16477 [1:05:23<00:00,  4.20it/s]


Qwen2.5_Category
General Business Communication              4959
Personal Communication & Purely Personal    4225
Finance & Transactions                      3067
Meeting & Scheduling                        1493
Legal & Contractual                          867
Promotion & Newsletter                       749
IT Alerts & System Notifications             362
Project Management & Strategy                340
Spam                                         264
Internal Policies & HR Updates               105
Mergers, Partnerships & Alliances             46
Name: count, dtype: int64
✅ Processed 16477 emails.
✅ Processed file saved at: /content/drive/MyDrive/FYPDataset/email_batch_9_labeled_qwen2.5.csv


In [None]:
print(email_data["Qwen2.5_Category"].value_counts())

Qwen2.5_Category
General Business Communication              6668
Personal Communication & Purely Personal    5320
Finance & Transactions                      3075
Meeting & Scheduling                        2373
Legal & Contractual                         1557
Project Management & Strategy                362
Promotion & Newsletter                       276
Internal Policies & HR Updates               124
IT Alerts & System Notifications             118
Mergers, Partnerships & Alliances             65
Spam                                          62
Name: count, dtype: int64
