In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load dataset. Assigning two column names below raises if the CSV does not
# contain exactly two columns (message text first, label second).
df = pd.read_csv(r"D:\FOT\sem4\fda\project\spam_detection\dataset\processed\dataset.csv", encoding='latin-1')
df.columns = ['text', 'label']
# Drop incomplete rows *before* casting: astype(str) would otherwise turn a
# missing message into the literal string "nan" and keep it as a training
# example, and astype(int) cannot convert a missing label.
df = df.dropna(subset=['text', 'label'])
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(int)

# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model configured with two output labels
classifier_kwargs = {"num_labels": 2}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

# Tokenize data
max_length = 128
texts = df['text'].tolist()
labels = df['label'].tolist()

# Split data (80% train / 20% test, fixed seed)
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# passing stratify=labels would keep the class ratio equal in both splits —
# confirm before changing, since it alters which rows land in the test set.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Tokenize split data. Only the two splits are encoded: encoding the full
# dataset as well would duplicate the work and memory without being used.
# Every sequence is padded/truncated to max_length and returned as PyTorch tensors.
tokenizer_kwargs = dict(padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
train_encodings = tokenizer(X_train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(X_test_texts, **tokenizer_kwargs)

# Mini-batch size used by both loaders
batch_size = 8

# Each dataset item is an (input_ids, attention_mask, label) triple.
tensor_keys = ('input_ids', 'attention_mask')

# Training split: reshuffled on every pass over the loader
train_dataset = TensorDataset(
    *(train_encodings[key] for key in tensor_keys),
    torch.tensor(y_train),
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test split: iterated in its stored order
test_dataset = TensorDataset(
    *(test_encodings[key] for key in tensor_keys),
    torch.tensor(y_test),
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Run everything on the CPU
device = torch.device('cpu')
model.to(device)

# Step budget: one optimizer/scheduler step per training batch
num_epochs = 1
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * num_epochs

# AdamW over all model parameters, driven by a linear schedule with no
# warmup steps that spans the whole training run
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

# Histories filled in by the training loop: per-step loss, the step indices,
# and the F1 scores recorded at each periodic evaluation
train_losses, f1_scores, steps = [], [], []

# Interactive figure that the training loop redraws while it runs
plt.ion()
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title='Training Progress', xlabel='Steps', ylabel='Metrics')
# Both curves start empty; their data is assigned later via set_data()
loss_line = ax.plot([], [], 'r-', label='Training Loss')[0]
f1_line = ax.plot([], [], 'b-', label='F1 Score')[0]
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show(block=False)

# Training and evaluation
def _collect_predictions(clf, loader, target_device):
    """Run ``clf`` over every batch of ``loader`` and gather labels and predictions.

    Args:
        clf: Model called as ``clf(input_ids=..., attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        target_device: Device the input tensors are moved to before the call.

    Returns:
        ``(true_labels, predicted_labels)`` as two flat lists, one entry per
        example; predictions are the argmax over the logits.

    Note:
        The model is switched to eval mode and left there. Callers that go on
        training must call ``clf.train()`` afterwards.
    """
    clf.eval()
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for ids, mask, batch_targets in loader:
            logits = clf(
                input_ids=ids.to(target_device),
                attention_mask=mask.to(target_device),
            ).logits
            predicted_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(batch_targets.cpu().numpy())
    return true_labels, predicted_labels


print("Starting training...")
global_step = 0
eval_frequency = max(steps_per_epoch // 10, 1)  # Evaluate every 10% of an epoch

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        # Batch-local names: unpacking into `labels` here would overwrite the
        # module-level list of dataset labels.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass (passing labels makes the model return the loss)
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        # Read the scalar once and reuse it for logging/plotting below.
        loss_value = loss.item()
        epoch_loss += loss_value

        # Backward pass
        loss.backward()

        # Update weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Update progress bar
        progress_bar.set_postfix({"loss": loss_value})

        # Record metrics
        train_losses.append(loss_value)
        steps.append(global_step)

        # Evaluate occasionally (including step 0) on the test loader
        if global_step % eval_frequency == 0:
            all_labels, all_preds = _collect_predictions(model, test_loader, device)

            # Calculate F1 score
            current_f1 = f1_score(all_labels, all_preds, average='macro')
            f1_scores.append(current_f1)

            # Print metrics
            print(f"\nStep {global_step}: Loss = {loss_value:.4f}, F1 Score = {current_f1:.4f}")

            # Update plot. Evaluations happen at every eval_frequency-th step,
            # so that stride over `steps` gives the x positions of f1_scores.
            loss_line.set_data(steps, train_losses)
            f1_line.set_data(steps[::eval_frequency], f1_scores)
            ax.relim()
            ax.autoscale_view()
            fig.canvas.draw()
            fig.canvas.flush_events()

            # Evaluation left the model in eval mode; resume training mode.
            model.train()

        global_step += 1

    # Report the mean loss of the epoch (guarding against an empty loader).
    print(f"Epoch {epoch+1} average training loss: {epoch_loss / max(steps_per_epoch, 1):.4f}")

# Final evaluation on the test loader; all_labels / all_preds are reused by
# the confusion-matrix code further down.
all_labels, all_preds = _collect_predictions(model, test_loader, device)

# Calculate final metrics
final_f1 = f1_score(all_labels, all_preds, average='macro')
print(f"\nFinal F1 Score: {final_f1:.4f}")
print("\nClassification Report:")
print(classification_report(all_labels, all_preds))

# Leave interactive mode and draw the final training curves on a new figure
plt.ioff()
summary_fig, summary_ax = plt.subplots(figsize=(10, 6))
summary_ax.plot(steps, train_losses, 'r-', label='Training Loss')
# F1 was recorded once every eval_frequency steps, starting at the first step
eval_steps = steps[::eval_frequency]
summary_ax.plot(eval_steps, f1_scores, 'b-', label='F1 Score')
summary_ax.set_title('Training Progress')
summary_ax.set_xlabel('Steps')
summary_ax.set_ylabel('Metrics')
summary_ax.legend()
summary_ax.grid(True)
# Write the image to disk before showing it
summary_fig.savefig('training_progress.png')
plt.show()

# Confusion matrix of the final predictions, rendered as an annotated heatmap
cm = confusion_matrix(all_labels, all_preds)
class_names = ['Ham', 'Spam']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
cm_fig.savefig('confusion_matrix.png')
plt.show()

# Persist the model's state_dict (its parameters/buffers) to disk
trained_weights = model.state_dict()
torch.save(trained_weights, "roberta_spam_model.pt")
print("Model saved as 'roberta_spam_model.pt'")

# Example prediction function
def predict_text(texts):
    """Classify one or more messages with the module-level model.

    Args:
        texts: A single message (``str``) or an iterable of messages.

    Returns:
        A 1-D NumPy integer array holding one predicted class index per
        message (argmax over the model logits). The example usage in this
        file prints ``1`` as "Spam" and any other value as "Ham". An empty
        input yields an empty array.
    """
    # A bare string is one message, not an iterable of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: return early instead of handing an empty batch
    # to the tokenizer and the model.
    if not texts:
        return np.empty(0, dtype=np.int64)

    model.eval()
    # Same fixed-length encoding as used for the training/test splits.
    encoded_inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    return preds

# Two sample messages to run through predict_text as a quick demo
test_messages = [
    "Congratulations! You've won a free iPhone! Click here to claim your prize now!",
    "Hi, can you please send me the meeting notes from yesterday? Thanks!"
]

predictions = predict_text(test_messages)
for message, label_id in zip(test_messages, predictions):
    # Show at most the first 50 characters, marking truncation with an ellipsis
    preview = message[:50]
    if len(message) > 50:
        preview += '...'
    # Class index 1 is reported as spam, anything else as ham
    verdict = 'Spam' if label_id == 1 else 'Ham'
    print(f"Message: {preview}")
    print(f"Prediction: {verdict}\n")