In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau
# ------------------------------
# Focal Loss Class
# ------------------------------
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        """
        Args:
            alpha (float): Weighting factor for the rare class.
            gamma (float): Focusing parameter to down-weight easy examples.
            reduction (str): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs (Tensor): Logits tensor of shape (N, C), where C is the number of classes.
            targets (Tensor): Ground truth labels of shape (N,).
        Returns:
            Tensor: Loss value.
        """
        # Compute the standard cross entropy loss without reduction
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        # Compute the probability of the true class
        pt = torch.exp(-ce_loss)
        # Compute the focal loss
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss



# -----------------------------
# 1. Text Preprocessing Classes
# -----------------------------
class TextPreprocessor:
    def __init__(self, max_vocab_size=15000, max_seq_length=128):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_counts = Counter()
        
    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        text = ' '.join(text.split())
        return text
        
    def fit(self, texts):
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            self.word_counts.update(words)
        
        # Reserve two indices for PAD and UNK tokens
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        for word in vocab_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, texts):
        sequences = []
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            # Truncate or pad sequences
            seq = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words[:self.max_seq_length]]
            seq = seq + [self.word2idx['<PAD>']] * (self.max_seq_length - len(seq))
            sequences.append(seq)
        return torch.tensor(sequences)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# -----------------------------
# 2. Define the Model
# -----------------------------
class ImprovedTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(ImprovedTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.7) # increased rate
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True,
                            dropout=0.2 if num_layers > 1 else 0)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.5) # increased rate
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.5) # increased rate
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)
        
    def attention_net(self, lstm_output):
        # Compute attention weights and context vector
        attn_scores = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(attn_scores, dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention_net(lstm_out)
        x = self.fc1(attn_out)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# -----------------------------
# 3. Function to Load Pretrained Embeddings (GloVe)
# -----------------------------
def load_pretrained_embeddings(embedding_path, word2idx, embed_dim):
    print("Loading pretrained embeddings...")
    # Initialize embeddings with a uniform distribution
    embeddings = np.random.uniform(-0.05, 0.05, (len(word2idx), embed_dim))
    embeddings[word2idx['<PAD>']] = np.zeros(embed_dim)
    found = 0
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embed_dim:
                continue  # Skip if dimensions mismatch
            if word in word2idx:
                embeddings[word2idx[word]] = vector
                found += 1
    print(f"Found {found} pretrained embeddings out of {len(word2idx)} words")
    return torch.tensor(embeddings, dtype=torch.float)

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, preprocessor, config):
    best_f1 = 0
    patience = 5
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []

        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{num_epochs}, LR: {current_lr:.6f}:')
        
        for batch_texts, batch_labels in train_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())
        
        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in val_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                predictions = torch.argmax(outputs, dim=1)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(batch_labels.cpu().numpy())
        
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')
        train_accuracy = accuracy_score(train_labels, train_preds)
        
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro', zero_division=1)
        val_accuracy = accuracy_score(val_labels, val_preds)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')
        print(f'Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n')
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'model_state_dict': model.state_dict(),
                'preprocessor': preprocessor,
                'config': config
            }, 'best_model.pt')
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print(f'Early stopping triggered after epoch {epoch+1}')
            break

# -----------------------------
# 5. Configuration and Data Loading
# -----------------------------
config = {
    'max_vocab_size': 15000,
    'max_seq_length': 128,
    'embed_dim': 300,  # Must match the dimension of the pretrained embeddings
    'hidden_dim': 256,
    'batch_size': 16,
    'learning_rate': 0.001,
    'num_epochs': 20,
    'num_lstm_layers': 2,
    'pretrained_embedding_path': '/Users/User/CSProjects/CSC392_AI_agent/emphatic-AI-Winter2025/glove.6B/glove.6B.300d.txt'  # Update this path as needed
}

# Load your datasets (assumed to be in TSV format)
train_df = pd.read_csv('train_synthetic75.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Map your string labels to numerical values
label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}
train_df['Detected as'] = train_df['Detected as'].map(label_mapping)
test_df['Detected as'] = test_df['Detected as'].map(label_mapping)

# -----------------------------
# 6. Preprocessing
# -----------------------------
preprocessor = TextPreprocessor(max_vocab_size=config['max_vocab_size'], 
                                max_seq_length=config['max_seq_length'])
preprocessor.fit(train_df['Sentence'])

X_train = preprocessor.transform(train_df['Sentence'])
X_val = preprocessor.transform(test_df['Sentence'])
y_train = torch.tensor(train_df['Detected as'].values)
y_val = torch.tensor(test_df['Detected as'].values)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], num_workers=0)

# -----------------------------
# 7. Model, Optimizer, Scheduler Setup
# -----------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ImprovedTextClassifier(
    vocab_size=len(preprocessor.word2idx),
    embed_dim=config['embed_dim'],
    hidden_dim=config['hidden_dim'],
    num_classes=2,
    num_layers=config['num_lstm_layers']
).to(device)

# Load pretrained embeddings and replace the embedding layer weights
pretrained_weights = load_pretrained_embeddings(config['pretrained_embedding_path'], 
                                                preprocessor.word2idx, 
                                                config['embed_dim'])
model.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=0)

# Compute class weights to help with class imbalance

labels = y_train.numpy()
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# class_counts = torch.bincount(y_train)
# class_weights = 1.0 / class_counts.float()
# class_weights = class_weights / class_weights.sum()
# class_weights = class_weights.to(device)

#criterion = FocalLoss() #nn.CrossEntropyLoss(weight=class_weights)
labels = y_train.numpy()
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
#  = nn.CrossEntropyLoss(weight=class_weights)
criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')  # Increase gamma for more focus on hard cases



optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.05) # increased weight decay here so ones with larger weight are penalized
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1,threshold=0.01, verbose=True)

# -----------------------------
# 8. Train the Model
# -----------------------------
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
            config['num_epochs'], device, preprocessor, config)


Loading pretrained embeddings...
Found 933 pretrained embeddings out of 1274 words




Epoch 1/20, LR: 0.001000:
Epoch 1/20:
Train Loss: 0.1196, Accuracy: 0.5610, F1: 0.5500
Val Loss: 0.0855, Accuracy: 0.5616, F1: 0.3596
Val Precision: 0.7808, Recall: 0.5000

Epoch 2/20, LR: 0.001000:
Epoch 2/20:
Train Loss: 0.1298, Accuracy: 0.5171, F1: 0.5123
Val Loss: 0.0870, Accuracy: 0.4384, F1: 0.3048
Val Precision: 0.7192, Recall: 0.5000

Epoch 3/20, LR: 0.001000:
Epoch 3/20:
Train Loss: 0.1008, Accuracy: 0.5659, F1: 0.5436
Val Loss: 0.0854, Accuracy: 0.5753, F1: 0.4389
Val Precision: 0.5868, Recall: 0.5225

Epoch 4/20, LR: 0.000500:
Epoch 4/20:
Train Loss: 0.1117, Accuracy: 0.5171, F1: 0.4870
Val Loss: 0.0833, Accuracy: 0.5890, F1: 0.4249
Val Precision: 0.7887, Recall: 0.5312

Epoch 5/20, LR: 0.000500:
Epoch 5/20:
Train Loss: 0.1171, Accuracy: 0.5171, F1: 0.5075
Val Loss: 0.0832, Accuracy: 0.6438, F1: 0.5524
Val Precision: 0.7452, Recall: 0.5972

Epoch 6/20, LR: 0.000500:
Epoch 6/20:
Train Loss: 0.1177, Accuracy: 0.5171, F1: 0.4923
Val Loss: 0.0844, Accuracy: 0.6027, F1: 0.6024
V

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau
# ------------------------------
# Focal Loss Class
# ------------------------------
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        """
        Args:
            alpha (float): Weighting factor for the rare class.
            gamma (float): Focusing parameter to down-weight easy examples.
            reduction (str): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs (Tensor): Logits tensor of shape (N, C), where C is the number of classes.
            targets (Tensor): Ground truth labels of shape (N,).
        Returns:
            Tensor: Loss value.
        """
        # Compute the standard cross entropy loss without reduction
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        # Compute the probability of the true class
        pt = torch.exp(-ce_loss)
        # Compute the focal loss
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss



# -----------------------------
# 1. Text Preprocessing Classes
# -----------------------------
class TextPreprocessor:
    def __init__(self, max_vocab_size=15000, max_seq_length=128):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_counts = Counter()
        
    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        text = ' '.join(text.split())
        return text
        
    def fit(self, texts):
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            self.word_counts.update(words)
        
        # Reserve two indices for PAD and UNK tokens
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        for word in vocab_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, texts):
        sequences = []
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            # Truncate or pad sequences
            seq = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words[:self.max_seq_length]]
            seq = seq + [self.word2idx['<PAD>']] * (self.max_seq_length - len(seq))
            sequences.append(seq)
        return torch.tensor(sequences)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# -----------------------------
# 2. Define the Model
# -----------------------------
class ImprovedTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(ImprovedTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.7) # increased rate
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True,
                            dropout=0.2 if num_layers > 1 else 0)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.5) # increased rate
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.5) # increased rate
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)
        
    def attention_net(self, lstm_output):
        # Compute attention weights and context vector
        attn_scores = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(attn_scores, dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention_net(lstm_out)
        x = self.fc1(attn_out)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# -----------------------------
# 3. Function to Load Pretrained Embeddings (GloVe)
# -----------------------------
def load_pretrained_embeddings(embedding_path, word2idx, embed_dim):
    print("Loading pretrained embeddings...")
    # Initialize embeddings with a uniform distribution
    embeddings = np.random.uniform(-0.05, 0.05, (len(word2idx), embed_dim))
    embeddings[word2idx['<PAD>']] = np.zeros(embed_dim)
    found = 0
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embed_dim:
                continue  # Skip if dimensions mismatch
            if word in word2idx:
                embeddings[word2idx[word]] = vector
                found += 1
    print(f"Found {found} pretrained embeddings out of {len(word2idx)} words")
    return torch.tensor(embeddings, dtype=torch.float)

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, preprocessor, config):
    best_f1 = 0
    patience = 5
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []

        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{num_epochs}, LR: {current_lr:.6f}:')
        
        for batch_texts, batch_labels in train_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())
        
        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in val_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                predictions = torch.argmax(outputs, dim=1)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(batch_labels.cpu().numpy())
        
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')
        train_accuracy = accuracy_score(train_labels, train_preds)
        
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro', zero_division=1)
        val_accuracy = accuracy_score(val_labels, val_preds)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')
        print(f'Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n')
        
        # if val_f1 > best_f1:
        #     best_f1 = val_f1
        #     torch.save({
        #         'model_state_dict': model.state_dict(),
        #         'preprocessor': preprocessor,
        #         'config': config
        #     }, 'best_model.pt')
        #     patience_counter = 0
        # else:
        #     patience_counter += 1
            
        # if patience_counter >= patience:
        #     print(f'Early stopping triggered after epoch {epoch+1}')
        #     break

# -----------------------------
# 5. Configuration and Data Loading
# -----------------------------
config = {
    'max_vocab_size': 15000,
    'max_seq_length': 128,
    'embed_dim': 300,  # Must match the dimension of the pretrained embeddings
    'hidden_dim': 256,
    'batch_size': 16,
    'learning_rate': 0.001,
    'num_epochs': 50,
    'num_lstm_layers': 2,
    'pretrained_embedding_path': '/Users/User/CSProjects/CSC392_AI_agent/emphatic-AI-Winter2025/glove.6B/glove.6B.300d.txt'  # Update this path as needed
}

# Load your datasets (assumed to be in TSV format)
train_df = pd.read_csv('train_synthetic75.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Map your string labels to numerical values
label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}
train_df['Detected as'] = train_df['Detected as'].map(label_mapping)
test_df['Detected as'] = test_df['Detected as'].map(label_mapping)

# -----------------------------
# 6. Preprocessing
# -----------------------------
preprocessor = TextPreprocessor(max_vocab_size=config['max_vocab_size'], 
                                max_seq_length=config['max_seq_length'])
preprocessor.fit(train_df['Sentence'])

X_train = preprocessor.transform(train_df['Sentence'])
X_val = preprocessor.transform(test_df['Sentence'])
y_train = torch.tensor(train_df['Detected as'].values)
y_val = torch.tensor(test_df['Detected as'].values)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], num_workers=0)

# -----------------------------
# 7. Model, Optimizer, Scheduler Setup
# -----------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ImprovedTextClassifier(
    vocab_size=len(preprocessor.word2idx),
    embed_dim=config['embed_dim'],
    hidden_dim=config['hidden_dim'],
    num_classes=2,
    num_layers=config['num_lstm_layers']
).to(device)

# Load pretrained embeddings and replace the embedding layer weights
pretrained_weights = load_pretrained_embeddings(config['pretrained_embedding_path'], 
                                                preprocessor.word2idx, 
                                                config['embed_dim'])
model.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=0)

# Compute class weights to help with class imbalance

labels = y_train.numpy()
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# class_counts = torch.bincount(y_train)
# class_weights = 1.0 / class_counts.float()
# class_weights = class_weights / class_weights.sum()
# class_weights = class_weights.to(device)

#criterion = FocalLoss() #nn.CrossEntropyLoss(weight=class_weights)
labels = y_train.numpy()
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
#  = nn.CrossEntropyLoss(weight=class_weights)
criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')  # Increase gamma for more focus on hard cases



optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.05) # increased weight decay here so ones with larger weight are penalized
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1,threshold=0.01, verbose=True)

# -----------------------------
# 8. Train the Model
# -----------------------------
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
            config['num_epochs'], device, preprocessor, config)


Loading pretrained embeddings...
Found 933 pretrained embeddings out of 1274 words
Epoch 1/50, LR: 0.001000:




Epoch 1/50:
Train Loss: 0.1532, Accuracy: 0.5122, F1: 0.5283
Val Loss: 0.0854, Accuracy: 0.5616, F1: 0.3596
Val Precision: 0.7808, Recall: 0.5000

Epoch 2/50, LR: 0.001000:
Epoch 2/50:
Train Loss: 0.1105, Accuracy: 0.6146, F1: 0.5730
Val Loss: 0.0855, Accuracy: 0.5616, F1: 0.3596
Val Precision: 0.7808, Recall: 0.5000

Epoch 3/50, LR: 0.001000:
Epoch 3/50:
Train Loss: 0.0982, Accuracy: 0.5951, F1: 0.5951
Val Loss: 0.0852, Accuracy: 0.5616, F1: 0.3596
Val Precision: 0.7808, Recall: 0.5000

Epoch 4/50, LR: 0.000500:
Epoch 4/50:
Train Loss: 0.1259, Accuracy: 0.5805, F1: 0.5700
Val Loss: 0.0860, Accuracy: 0.5616, F1: 0.5473
Val Precision: 0.6184, Recall: 0.5926

Epoch 5/50, LR: 0.000500:
Epoch 5/50:
Train Loss: 0.0959, Accuracy: 0.6049, F1: 0.6049
Val Loss: 0.0903, Accuracy: 0.4521, F1: 0.3315
Val Precision: 0.7222, Recall: 0.5122

Epoch 6/50, LR: 0.000250:
Epoch 6/50:
Train Loss: 0.1159, Accuracy: 0.5512, F1: 0.5446
Val Loss: 0.0872, Accuracy: 0.4521, F1: 0.4495
Val Precision: 0.4658, Reca