<a href="https://colab.research.google.com/github/Avniiii2606/Email-Classification-using-BERT-LDA/blob/main/Untitled14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM so the dataset stored there is reachable.
drive.mount("/content/drive")

# Sanity checks: show the working directory and confirm the CSV exists on Drive.
!pwd
!ls /content/drive/MyDrive/spam_email_dataset.csv

# Notebook-level path to the dataset; main() passes it to prepare_data().
file_path="/content/drive/MyDrive/spam_email_dataset.csv"

Mounted at /content/drive
/content
/content/drive/MyDrive/spam_email_dataset.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [None]:
def prepare_data(file_path, max_rows=200):
    """Load the e-mail CSV and add an integer-encoded label column.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.
            The data must contain a ``label`` column.
        max_rows: Number of leading rows to keep. Defaults to 200, which is
            what this notebook has always used; pass ``None`` to keep every
            row.

    Returns:
        tuple: ``(df, label_encoder)`` where ``df`` is an independent
        DataFrame carrying a new ``encoded_category`` column and
        ``label_encoder`` is the ``LabelEncoder`` fitted on ``df['label']``.

    Raises:
        KeyError: If the CSV has no ``label`` column.
    """
    # Read dataset
    df_main = pd.read_csv(file_path)
    if 'label' not in df_main.columns:
        raise KeyError("dataset must contain a 'label' column")

    # Take an explicit copy: assigning a new column onto the object returned
    # by ``head()`` is what triggered pandas' SettingWithCopyWarning.
    subset = df_main if max_rows is None else df_main.head(max_rows)
    df = subset.copy()

    # Encode labels
    label_encoder = LabelEncoder()
    df['encoded_category'] = label_encoder.fit_transform(df['label'])

    return df, label_encoder

In [None]:
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes a single e-mail on every lookup.

    Args:
        texts: Indexable collection of e-mail bodies; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of e-mails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the label tensor for ``idx``."""
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]

        # Pad/truncate to a fixed length so default batch collation can stack items.
        tokenized = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D for each returned field.
        sample = {
            field: torch.flatten(tokenized[field])
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(raw_label, dtype=torch.long)
        return sample


In [None]:
class BERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (width of the logits).
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``. The loaded model must expose
            ``pooler_output``, because ``forward`` reads it.
        dropout: Dropout probability applied to the pooled output before the
            linear head. Defaults to 0.3.
    """

    def __init__(self, num_classes, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from its config rather than hard-coding 768,
        # so the head always matches whichever checkpoint was loaded.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head applied to the dropped-out pooled
            encoder output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(self.dropout(pooled_output))


In [None]:
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
            It does not need to support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)``. ``mean_loss`` is the float average
        of the per-batch cross-entropy losses. ``accuracy`` is a 0-dim
        float64 tensor on ``device`` holding the fraction of correct
        predictions. An empty ``data_loader`` yields ``(0.0, tensor(0.))``
        instead of raising.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()  # loop-invariant: build it once

    total_loss = 0.0
    num_batches = 0
    total_predictions = 0
    # Accumulate as a tensor from the start so ``.double()`` is valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += len(labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Count batches here rather than calling len(data_loader), so
        # length-less iterables also work.
        num_batches += 1

    if num_batches == 0:
        return 0.0, correct_predictions.double()

    return total_loss / num_batches, correct_predictions.double() / total_predictions

In [None]:
def evaluate(model, data_loader, device):
    """Compute mean loss and accuracy over ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits. It is switched to eval mode.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
            It does not need to support ``len()``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)``. ``mean_loss`` is the float average
        of the per-batch cross-entropy losses. ``accuracy`` is a 0-dim
        float64 tensor on ``device`` holding the fraction of correct
        predictions. An empty ``data_loader`` yields ``(0.0, tensor(0.))``
        instead of raising.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()  # loop-invariant: build it once

    total_loss = 0.0
    num_batches = 0
    total_predictions = 0
    # Accumulate as a tensor from the start so ``.double()`` is valid even
    # when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += len(labels)
            total_loss += loss.item()
            # Count batches here rather than calling len(data_loader), so
            # length-less iterables also work.
            num_batches += 1

    if num_batches == 0:
        return 0.0, correct_predictions.double()

    return total_loss / num_batches, correct_predictions.double() / total_predictions


In [None]:
def main(data_path=None, epochs=3, batch_size=16, lr=2e-5, seed=42,
         checkpoint_path='best_model.pt'):
    """Fine-tune the BERT classifier on the e-mail dataset and keep the best weights.

    Args:
        data_path: CSV to train on. ``None`` (default) falls back to the
            notebook-level ``file_path`` variable.
        epochs: Number of passes over the training split.
        batch_size: Batch size for both the training and validation loaders.
        lr: Learning rate for AdamW.
        seed: Seed used for the train/validation split and for torch's
            global RNG, so repeated runs see the same shuffling and dropout.
        checkpoint_path: Where the state dict of the best epoch is written.

    Returns:
        tuple: ``(model, tokenizer, label_encoder)`` ready to pass to
        ``predict_email``. When a checkpoint was written during this run,
        the model carries the weights of the best validation epoch.
    """
    if data_path is None:
        data_path = file_path

    # Seed torch so DataLoader shuffling and dropout are repeatable.
    torch.manual_seed(seed)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load and prepare data
    df, label_encoder = prepare_data(data_path)

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].values,
        df['encoded_category'].values,
        test_size=0.2,
        random_state=seed
    )

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Create datasets
    train_dataset = EmailDataset(train_texts, train_labels, tokenizer)
    val_dataset = EmailDataset(val_texts, val_labels, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize model
    model = BERTClassifier(num_classes=len(label_encoder.classes_))
    model = model.to(device)

    # Initialize optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Training loop
    best_accuracy = 0.0
    checkpoint_saved = False

    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')

        # Train
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
        print(f'Training Loss: {train_loss:.3f}')
        print(f'Training Accuracy: {train_acc:.3f}')

        # Evaluate
        val_loss, val_acc = evaluate(model, val_loader, device)
        print(f'Validation Loss: {val_loss:.3f}')
        print(f'Validation Accuracy: {val_acc:.3f}')

        # Compare as a plain float so best_accuracy does not turn from a
        # Python number into a device tensor part-way through the loop.
        val_acc_value = float(val_acc)
        if val_acc_value > best_accuracy:
            best_accuracy = val_acc_value
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

    print(f'\nBest validation accuracy: {best_accuracy:.3f}')

    # Return the best epoch's weights rather than whatever the last epoch produced.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return model, tokenizer, label_encoder


In [None]:
def predict_email(text, model, tokenizer, label_encoder, device, max_len=128):
    """Classify a single e-mail and return its decoded label.

    Args:
        text: E-mail body to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns per-class logits. It is switched to eval mode.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to the original labels.
        device: Device the encoded tensors are moved to.
        max_len: Length the text is padded or truncated to. Defaults to 128;
            set it to the ``max_len`` the training dataset was built with.

    Returns:
        The label produced by ``label_encoder.inverse_transform`` for the
        highest-scoring class.
    """
    model.eval()
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs, dim=1)

    return label_encoder.inverse_transform([prediction.item()])[0]

# Entry point: start the training pipeline when this code runs as the main module.
if __name__ == "__main__":
    main()

Using device: cpu


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['encoded_category'] = label_encoder.fit_transform(df['label'])



Epoch 1/3


Training: 100%|██████████| 10/10 [04:00<00:00, 24.09s/it]


Training Loss: 0.657
Training Accuracy: 0.594


Evaluating: 100%|██████████| 3/3 [00:17<00:00,  5.74s/it]


Validation Loss: 0.640
Validation Accuracy: 0.625

Epoch 2/3


Training: 100%|██████████| 10/10 [03:47<00:00, 22.75s/it]


Training Loss: 0.462
Training Accuracy: 0.775


Evaluating: 100%|██████████| 3/3 [00:17<00:00,  5.72s/it]


Validation Loss: 0.500
Validation Accuracy: 0.625

Epoch 3/3


Training: 100%|██████████| 10/10 [03:42<00:00, 22.29s/it]


Training Loss: 0.321
Training Accuracy: 0.844


Evaluating: 100%|██████████| 3/3 [00:18<00:00,  6.02s/it]


Validation Loss: 0.335
Validation Accuracy: 0.925

Best validation accuracy: 0.925
