<a href="https://colab.research.google.com/github/Avniiii2606/Email-Classification-using-BERT-LDA/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: mount Google Drive at /content/drive so that files stored
# under MyDrive can be read by path from the cells below.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Sanity check (IPython shell magics): print the working directory and list
# the dataset CSV under the Drive mount point before trying to load it.
!pwd
!ls /content/drive/MyDrive/spam_email_dataset.csv

/content
/content/drive/MyDrive/spam_email_dataset.csv


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm

class SpamClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        bert_model: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, bert_model="bert-base-uncased", num_classes=5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)

        # Take the encoder width from the loaded config rather than assuming
        # 768, so checkpoints with a different hidden size work as well.
        hidden_size = self.bert.config.hidden_size

        # Neural network layers after BERT
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),  # Increased dropout for better regularization
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return raw (un-normalised) class logits.

        Args:
            input_ids: Token id tensor forwarded to BERT
                (presumably ``(batch, seq_len)`` -- confirm against caller).
            attention_mask: Mask tensor forwarded to BERT alongside
                ``input_ids``.

        Returns:
            Output of the classification head applied to BERT's
            ``pooler_output``; the last dimension has ``num_classes`` entries.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one email per item.

    Args:
        texts: Iterable of email bodies; each item is converted with ``str``
            before tokenization.
        labels: Iterable of class ids, aligned one-to-one with ``texts``.
        tokenizer: Hugging Face-style tokenizer; it is called directly on
            the text.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        # Materialise both inputs as plain lists so ``[idx]`` in __getitem__
        # is always positional. A pandas Series with a non-default index
        # (e.g. rows left over after a shuffle/split) would otherwise be
        # looked up by index label and raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return it as a dict of tensors.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Calling the tokenizer directly is the supported entry point;
        # ``encode_plus`` is documented as deprecated in its favour.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def prepare_data(df):
    """Normalise the email text and integer-encode the labels.

    The work is done on a copy, so the frame passed in is left unchanged.

    Args:
        df: DataFrame with a string ``text`` column and a ``label`` column.

    Returns:
        Tuple ``(prepared_df, classes)`` where ``prepared_df`` has lower-cased,
        whitespace-collapsed, stripped ``text`` and integer ``label`` values,
        and ``classes`` is the fitted encoder's ``classes_`` array (position
        ``i`` holds the original label encoded as ``i``).
    """
    # Copy first: callers commonly pass a slice such as ``df.head(n)``, and
    # assigning columns on a slice raises pandas' SettingWithCopyWarning and
    # may or may not write through to the parent frame.
    df = df.copy()

    # Clean the text data
    df['text'] = (
        df['text']
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Convert categories to numerical labels
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['label'])

    return df, le.classes_

def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)`` for the pass.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()

    loss_sum = 0.0
    correct = 0
    seen = 0
    num_batches = 0

    progress = tqdm(loader, desc=desc)
    # Gradients are only tracked while training; evaluation runs without them.
    with torch.set_grad_enabled(is_training):
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(outputs, 1)
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1

            # Update progress bar
            progress.set_postfix({'loss': loss.item(),
                                  'accuracy': 100. * correct / seen})

    if seen == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"{desc}: data loader produced no samples")

    return loss_sum / num_batches, 100. * correct / seen


def train_model(model, train_loader, val_loader, num_epochs=5, learning_rate=2e-5):
    """Fine-tune ``model`` and return it with the best-validation weights loaded.

    Uses cross-entropy loss, AdamW, and a ``ReduceLROnPlateau`` scheduler
    (``patience=1``) stepped on the mean validation loss. Runs on CUDA when
    available, otherwise on CPU.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Same batch format, used for validation after each epoch.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Initial AdamW learning rate.

    Returns:
        The model (moved to the selected device) carrying the weights from the
        epoch with the lowest mean validation loss. If no epoch was run, the
        model is returned with its weights unchanged.

    Raises:
        ValueError: If either loader yields no samples.
    """
    import copy  # function-scope import keeps this block self-contained

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1)

    best_val_loss = float('inf')
    best_model_state = None

    for epoch in range(num_epochs):
        # Training
        avg_train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, device,
            desc=f'Training Epoch {epoch+1}/{num_epochs}',
            optimizer=optimizer,
        )

        # Validation
        avg_val_loss, val_accuracy = _run_epoch(
            model, val_loader, criterion, device, desc='Validation',
        )
        scheduler.step(avg_val_loss)

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live parameter
            # tensors, and dict.copy() is shallow, so a plain .copy() would
            # keep tracking later optimizer steps. Deep-copy to freeze the
            # weights of this epoch.
            best_model_state = copy.deepcopy(model.state_dict())

        print(f'\nEpoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Training Accuracy: {train_accuracy:.2f}%')
        print(f'Validation Loss: {avg_val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.2f}%\n')

    # Load best model (nothing to restore when no epoch improved on +inf,
    # e.g. num_epochs == 0 or every validation loss was NaN).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

def predict_email(model, tokenizer, text, device, label_classes, max_length=256):
    """Classify a single email and report the winning class with its probability.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits; expected to already live on ``device``.
        tokenizer: Hugging Face-style tokenizer; it is called directly on
            ``text``.
        text: The email body to classify.
        device: Device the encoded tensors are moved to before inference.
        label_classes: Sequence mapping a class index to its label.
        max_length: Length the input is padded or truncated to. Should match
            the value used when the model was trained (default 256).

    Returns:
        Dict with ``predicted_class`` (the entry of ``label_classes`` at the
        arg-max logit) and ``confidence`` (softmax probability of that class).
    """
    model.eval()
    # Calling the tokenizer directly is the supported entry point;
    # ``encode_plus`` is documented as deprecated in its favour.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probabilities = torch.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

    predicted_index = predicted.item()
    return {
        'predicted_class': label_classes[predicted_index],
        'confidence': probabilities[0][predicted_index].item()
    }

def main(data_path='/content/drive/MyDrive/spam_email_dataset.csv', sample_size=200):
    """Train the classifier on a sample of the dataset, save it, and run a demo prediction.

    Args:
        data_path: CSV file with ``text`` and ``label`` columns. Defaults to
            the Colab Drive location used by this notebook.
        sample_size: Number of leading rows to train on (default 200).

    Side effects:
        Writes ``spam_classifier.pth`` to the working directory and prints
        training progress plus one example prediction.
    """
    # Load the dataset
    df_main = pd.read_csv(data_path)
    # ``head`` returns a slice of df_main; take an explicit copy so the column
    # assignments in prepare_data do not raise SettingWithCopyWarning.
    df = df_main.head(sample_size).copy()

    # Prepare data
    df, label_classes = prepare_data(df)

    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(
        df['text'].values,
        df['label'].values,
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = SpamClassifier(num_classes=len(label_classes))

    # Create datasets and dataloaders
    train_dataset = EmailDataset(X_train, y_train, tokenizer)
    val_dataset = EmailDataset(X_val, y_val, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Train the model
    model = train_model(model, train_loader, val_loader)

    # Save the trained model. The class names are stored as a plain Python
    # list so that reading the checkpoint back does not depend on unpickling
    # a NumPy array.
    torch.save({
        'model_state_dict': model.state_dict(),
        'label_classes': np.asarray(label_classes).tolist()
    }, 'spam_classifier.pth')

    # Example prediction
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sample_email = "Get rich quick! Buy now!"
    result = predict_email(model, tokenizer, sample_email, device, label_classes)
    print(f"Predicted category: {result['predicted_class']}")
    print(f"Confidence: {result['confidence']:.2f}")

# Run the full train / save / demo-predict pipeline when this cell or script
# is executed directly (not when it is imported as a module).
if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try u

Using device: cpu


Training Epoch 1/5: 100%|██████████| 10/10 [09:07<00:00, 54.77s/it, loss=0.673, accuracy=45.6]
Validation: 100%|██████████| 3/3 [00:36<00:00, 12.25s/it, loss=0.681, accuracy=70]



Epoch 1/5:
Training Loss: 0.7048
Training Accuracy: 45.62%
Validation Loss: 0.6826
Validation Accuracy: 70.00%



Training Epoch 2/5: 100%|██████████| 10/10 [08:49<00:00, 52.93s/it, loss=0.638, accuracy=71.2]
Validation: 100%|██████████| 3/3 [00:36<00:00, 12.26s/it, loss=0.634, accuracy=75]



Epoch 2/5:
Training Loss: 0.6650
Training Accuracy: 71.25%
Validation Loss: 0.6366
Validation Accuracy: 75.00%



Training Epoch 3/5: 100%|██████████| 10/10 [08:48<00:00, 52.90s/it, loss=0.568, accuracy=74.4]
Validation: 100%|██████████| 3/3 [00:37<00:00, 12.45s/it, loss=0.561, accuracy=75]



Epoch 3/5:
Training Loss: 0.6076
Training Accuracy: 74.38%
Validation Loss: 0.5738
Validation Accuracy: 75.00%



Training Epoch 4/5: 100%|██████████| 10/10 [08:53<00:00, 53.34s/it, loss=0.519, accuracy=85]
Validation: 100%|██████████| 3/3 [00:36<00:00, 12.27s/it, loss=0.487, accuracy=80]



Epoch 4/5:
Training Loss: 0.5460
Training Accuracy: 85.00%
Validation Loss: 0.5128
Validation Accuracy: 80.00%



Training Epoch 5/5: 100%|██████████| 10/10 [08:50<00:00, 53.04s/it, loss=0.495, accuracy=92.5]
Validation: 100%|██████████| 3/3 [00:38<00:00, 12.86s/it, loss=0.428, accuracy=95]



Epoch 5/5:
Training Loss: 0.4788
Training Accuracy: 92.50%
Validation Loss: 0.4575
Validation Accuracy: 95.00%

Predicted category: spam
Confidence: 0.57
