In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold

# -----------------------------
# 1. Text Preprocessing Classes
# -----------------------------
class TextPreprocessor:
    def __init__(self, max_vocab_size=15000, max_seq_length=128):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_counts = Counter()
        
    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        text = ' '.join(text.split())
        return text
        
    def fit(self, texts):
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            self.word_counts.update(words)
        
        # Reserve two indices for PAD and UNK tokens
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        for word in vocab_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, texts):
        sequences = []
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            # Truncate or pad sequences
            seq = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words[:self.max_seq_length]]
            padded_sequences = np.zeros((len(sequences), self.max_seq_length), dtype=np.int64)
            for i, seq in enumerate(sequences):
                padded_sequences[i, :len(seq)] = seq
        return torch.tensor(sequences)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = self.labels = torch.tensor(labels, dtype=torch.long) # crossEntropyLoss

        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# -----------------------------
# 2. Define the Model
# -----------------------------
class ImprovedTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(ImprovedTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.2 if num_layers > 1 else 0.1)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True,
                            dropout=0.2 if num_layers > 1 else 0)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)
        
    def attention_net(self, lstm_output):
        # Compute attention weights and context vector
        energy = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention_net(lstm_out)
        x = self.fc1(attn_out)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# -----------------------------
# 3. Function to Load Pretrained Embeddings (GloVe)
# -----------------------------
def load_pretrained_embeddings(embedding_path, word2idx, embed_dim):
    print("Loading pretrained embeddings...")
    # Initialize embeddings with a uniform distribution
    embeddings = np.random.uniform(-0.05, 0.05, (len(word2idx), embed_dim))
    embeddings[word2idx['<PAD>']] = np.zeros(embed_dim)
    found = 0
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embed_dim:
                continue  # Skip if dimensions mismatch
            if word in word2idx:
                embeddings[word2idx[word]] = vector
                found += 1
    print(f"Found {found} pretrained embeddings out of {len(word2idx)} words")
    return torch.tensor(embeddings, dtype=torch.float)

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, preprocessor, config):
    best_f1 = 0
    patience = 5
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        
        for batch_texts, batch_labels in train_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())
        
        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in val_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                predictions = torch.argmax(outputs, dim=1)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(batch_labels.cpu().numpy())
        
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')
        train_accuracy = accuracy_score(train_labels, train_preds)
        
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro', zero_division=1)
        val_accuracy = accuracy_score(val_labels, val_preds)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')
        print(f'Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n')
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'model_state_dict': model.state_dict(),
                'preprocessor': preprocessor,
                'config': config
            }, 'best_model.pt')
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:

            print(f'Early stopping triggered after epoch {epoch+1}')
            break

# -----------------------------
# 5. Configuration and Data Loading
# -----------------------------
config = {
    'max_vocab_size': 15000,
    'max_seq_length': 128,
    'embed_dim': 300,  # Must match the dimension of the pretrained embeddings
    'hidden_dim': 256,
    'batch_size': 16,
    'learning_rate': 0.001,
    'num_epochs': 20,
    'num_lstm_layers': 2,
    'pretrained_embedding_path': '/Users/User/CSProjects/CSC392_AI_agent/emphatic-AI-Winter2025/glove.6B/glove.6B.300d.txt'  # Update this path as needed
}  

# Load your datasets (assumed to be in TSV format)
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Map your string labels to numerical values
label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}
train_df['Detected as'] = train_df['Detected as'].map(label_mapping)
test_df['Detected as'] = test_df['Detected as'].map(label_mapping)

# -----------------------------
# 6. Preprocessing
# -----------------------------
preprocessor = TextPreprocessor(max_vocab_size=config['max_vocab_size'], 
                                max_seq_length=config['max_seq_length'])
preprocessor.fit(train_df['Sentence'])

X_train = preprocessor.transform(train_df['Sentence'])
X_val = preprocessor.transform(test_df['Sentence'])
y_train = torch.tensor(train_df['Detected as'].values)
y_val = torch.tensor(test_df['Detected as'].values)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

x_np = X_train.numpy()
y_np = y_train.numpy()

# ----------------------------
# 7. 10-Fold Cross Validation
# ----------------------------

num_folds = 10
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(x_np, y_np)): 
    print(f"\n------- Fold {fold+1} / {num_folds} --------- ")

    # -----------------------------
    # 1. Split Data
    # -----------------------------
    X_train_fold = torch.tensor(x_np[train_idx])
    y_train_fold = torch.tensor(y_np[train_idx])
    X_val_fold = torch.tensor(x_np[val_idx])
    y_val_fold = torch.tensor(y_np[val_idx])

    train_dataset = TextDataset(X_train_fold, y_train_fold)
    val_dataset = TextDataset(X_val_fold, y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # -----------------------------
    # 2. Initialize Model, Optimizer, and Criterion Inside the Loop
    # -----------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ImprovedTextClassifier(
        vocab_size=len(preprocessor.word2idx),
        embed_dim=config['embed_dim'],
        hidden_dim=config['hidden_dim'],
        num_classes=2,
        num_layers=config['num_lstm_layers']
    ).to(device)

    # Load pretrained embeddings and replace the embedding layer weights
    pretrained_weights = load_pretrained_embeddings(config['pretrained_embedding_path'], 
                                                    preprocessor.word2idx, 
                                                    config['embed_dim'])
    model.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=0)

    # Compute class weights to handle class imbalance
    small_factor = 1e-4
    class_counts = torch.bincount(y_train_fold) + small_factor  # Avoid division by zero
    class_weights = 1.0 / class_counts.float()
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.01)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    # -----------------------------
    # 3. Train the Model
    # -----------------------------
    train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                config['num_epochs'], device, preprocessor, config)

    # Load the best model for this fold
    with torch.serialization.safe_globals([TextPreprocessor]):
        checkpoint = torch.load('best_model.pt', weights_only=False)
    
    model.load_state_dict(checkpoint['model_state_dict'])

    # -----------------------------
    # 4. Evaluate the Model
    # -----------------------------
    model.eval()
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch_texts, batch_labels in val_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            outputs = model(batch_texts)
            predictions = torch.argmax(outputs, dim=1)
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())

    # Compute metrics
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Fold {fold+1} Metrics:")
    print(f"Accuracy: {val_accuracy:.4f}, F1-score: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n")

    fold_results.append((val_accuracy, val_f1, val_precision, val_recall))

# -----------------------------
# 5. Display Overall Results
# -----------------------------
avg_accuracy = sum([x[0] for x in fold_results]) / num_folds
avg_f1 = sum([x[1] for x in fold_results]) / num_folds
avg_precision = sum([x[2] for x in fold_results]) / num_folds
avg_recall = sum([x[3] for x in fold_results]) / num_folds

print("\n========== Final Cross-Validation Results ==========")
print(f"Avg Accuracy: {avg_accuracy:.4f}, Avg F1-score: {avg_f1:.4f}, Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}")

KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold

# -----------------------------
# 1. Text Preprocessing Classes
# -----------------------------
class TextPreprocessor:
    def __init__(self, max_vocab_size=15000, max_seq_length=128):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_counts = Counter()
        
    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        text = ' '.join(text.split())
        return text
        
    def fit(self, texts):
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            self.word_counts.update(words)
        
        # Reserve two indices for PAD and UNK tokens
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        for word in vocab_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, texts):
        sequences = []
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            # Truncate or pad sequences
            seq = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words[:self.max_seq_length]]
            seq = seq + [self.word2idx['<PAD>']] * (self.max_seq_length - len(seq))
            sequences.append(seq)
        return torch.tensor(sequences)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# -----------------------------
# 2. Define the Model
# -----------------------------
class ImprovedTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(ImprovedTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.2 if num_layers > 1 else 0.1)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True,
                            dropout=0.2 if num_layers > 1 else 0)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)
        
    def attention_net(self, lstm_output):
        # Compute attention weights and context vector
        energy = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention_net(lstm_out)
        x = self.fc1(attn_out)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# -----------------------------
# 3. Function to Load Pretrained Embeddings (GloVe)
# -----------------------------
def load_pretrained_embeddings(embedding_path, word2idx, embed_dim):
    print("Loading pretrained embeddings...")
    # Initialize embeddings with a uniform distribution
    embeddings = np.random.uniform(-0.05, 0.05, (len(word2idx), embed_dim))
    embeddings[word2idx['<PAD>']] = np.zeros(embed_dim)
    found = 0
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embed_dim:
                continue  # Skip if dimensions mismatch
            if word in word2idx:
                embeddings[word2idx[word]] = vector
                found += 1
    print(f"Found {found} pretrained embeddings out of {len(word2idx)} words")
    return torch.tensor(embeddings, dtype=torch.float)

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, preprocessor, config):
    best_f1 = 0
    #patience = 5
    # patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        
        for batch_texts, batch_labels in train_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())
        
        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in val_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                predictions = torch.argmax(outputs, dim=1)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(batch_labels.cpu().numpy())
        
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')
        train_accuracy = accuracy_score(train_labels, train_preds)
        
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro', zero_division=1)
        val_accuracy = accuracy_score(val_labels, val_preds)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')
        print(f'Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n')
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'model_state_dict': model.state_dict(),
                'preprocessor': preprocessor,
                'config': config
            }, 'best_model.pt')
            #patience_counter = 0
            #patience_counter += 1
            
        # if patience_counter >= patience:

        #     print(f'Early stopping triggered after epoch {epoch+1}')
        #     break

# -----------------------------
# 5. Configuration and Data Loading
# -----------------------------
config = {
    'max_vocab_size': 15000,
    'max_seq_length': 128,
    'embed_dim': 300,  # Must match the dimension of the pretrained embeddings
    'hidden_dim': 256,
    'batch_size': 16,
    'learning_rate': 0.001,
    'num_epochs': 50,
    'num_lstm_layers': 2,
    'pretrained_embedding_path': '/Users/User/CSProjects/CSC392_AI_agent/emphatic-AI-Winter2025/glove.6B/glove.6B.300d.txt'  # Update this path as needed
}  

# Load your datasets (assumed to be in TSV format)
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Map your string labels to numerical values
label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}
train_df['Detected as'] = train_df['Detected as'].map(label_mapping)
test_df['Detected as'] = test_df['Detected as'].map(label_mapping)

# -----------------------------
# 6. Preprocessing
# -----------------------------
preprocessor = TextPreprocessor(max_vocab_size=config['max_vocab_size'], 
                                max_seq_length=config['max_seq_length'])
preprocessor.fit(train_df['Sentence'])

X_train = preprocessor.transform(train_df['Sentence'])
X_val = preprocessor.transform(test_df['Sentence'])
y_train = torch.tensor(train_df['Detected as'].values)
y_val = torch.tensor(test_df['Detected as'].values)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

x_np = X_train.numpy()
y_np = y_train.numpy()

# ----------------------------
# 7. 10-Fold Cross Validation
# ----------------------------

num_folds = 10
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(x_np, y_np)): 
    print(f"\n------- Fold {fold+1} / {num_folds} --------- ")

    # -----------------------------
    # 1. Split Data
    # -----------------------------
    X_train_fold = torch.tensor(x_np[train_idx])
    y_train_fold = torch.tensor(y_np[train_idx])
    X_val_fold = torch.tensor(x_np[val_idx])
    y_val_fold = torch.tensor(y_np[val_idx])

    train_dataset = TextDataset(X_train_fold, y_train_fold)
    val_dataset = TextDataset(X_val_fold, y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # -----------------------------
    # 2. Initialize Model, Optimizer, and Criterion Inside the Loop
    # -----------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ImprovedTextClassifier(
        vocab_size=len(preprocessor.word2idx),
        embed_dim=config['embed_dim'],
        hidden_dim=config['hidden_dim'],
        num_classes=2,
        num_layers=config['num_lstm_layers']
    ).to(device)

    # Load pretrained embeddings and replace the embedding layer weights
    pretrained_weights = load_pretrained_embeddings(config['pretrained_embedding_path'], 
                                                    preprocessor.word2idx, 
                                                    config['embed_dim'])
    model.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=0)

    # Compute class weights to handle class imbalance
    small_factor = 1e-4
    class_counts = torch.bincount(y_train_fold) + small_factor  # Avoid division by zero
    class_weights = 1.0 / class_counts.float()
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.01)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    # -----------------------------
    # 3. Train the Model
    # -----------------------------
    train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                config['num_epochs'], device, preprocessor, config)

    # Load the best model for this fold
    with torch.serialization.safe_globals([TextPreprocessor]):
        checkpoint = torch.load('best_model.pt', weights_only=False)
    
    model.load_state_dict(checkpoint['model_state_dict'])

    # -----------------------------
    # 4. Evaluate the Model
    # -----------------------------
    model.eval()
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch_texts, batch_labels in val_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            outputs = model(batch_texts)
            predictions = torch.argmax(outputs, dim=1)
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())

    # Compute metrics
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Fold {fold+1} Metrics:")
    print(f"Accuracy: {val_accuracy:.4f}, F1-score: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n")

    fold_results.append((val_accuracy, val_f1, val_precision, val_recall))

# -----------------------------
# 5. Display Overall Results
# -----------------------------
avg_accuracy = sum([x[0] for x in fold_results]) / num_folds
avg_f1 = sum([x[1] for x in fold_results]) / num_folds
avg_precision = sum([x[2] for x in fold_results]) / num_folds
avg_recall = sum([x[3] for x in fold_results]) / num_folds

print("\n========== Final Cross-Validation Results ==========")
print(f"Avg Accuracy: {avg_accuracy:.4f}, Avg F1-score: {avg_f1:.4f}, Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}")


------- Fold 1 / 10 --------- 
Loading pretrained embeddings...
Found 781 pretrained embeddings out of 999 words




Epoch 1/50:
Train Loss: 0.7015, Accuracy: 0.5680, F1: 0.3864
Val Loss: 0.6878, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6060, Accuracy: 0.6640, F1: 0.6500
Val Loss: 0.6859, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5496, Accuracy: 0.7200, F1: 0.7287
Val Loss: 0.6887, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.4751, Accuracy: 0.7840, F1: 0.7769
Val Loss: 0.6839, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.3849, Accuracy: 0.8320, F1: 0.8293
Val Loss: 0.6917, Accuracy: 0.6429, F1: 0.6410
Val Precision: 0.6778, Recall: 0.6667

Epoch 6/50:
Train Loss: 0.2100, Accuracy: 0.9520, F1: 0.9508
Val Loss: 0.7215, Accuracy: 0.7857, F1: 0.7754
Val Precision: 0.7889, Recall: 0.7708

Epoch 7/50:
Train Loss: 0.2019, Accuracy: 0.9440, F1: 0.9421
Val Loss: 1.4007, Accuracy: 0.6429, F1: 0.6410
Val Precis



Epoch 1/50:
Train Loss: 0.7198, Accuracy: 0.4720, F1: 0.4107
Val Loss: 0.6954, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.7143, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6222, Accuracy: 0.6240, F1: 0.6619
Val Loss: 0.7187, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.7143, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5148, Accuracy: 0.7600, F1: 0.7761
Val Loss: 0.7340, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.7143, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.4336, Accuracy: 0.7760, F1: 0.7778
Val Loss: 0.7028, Accuracy: 0.5000, F1: 0.4974
Val Precision: 0.5222, Recall: 0.5208

Epoch 5/50:
Train Loss: 0.2996, Accuracy: 0.8880, F1: 0.8852
Val Loss: 0.7068, Accuracy: 0.3571, F1: 0.2632
Val Precision: 0.1923, Recall: 0.4167

Epoch 6/50:
Train Loss: 0.1879, Accuracy: 0.9200, F1: 0.9194
Val Loss: 0.7248, Accuracy: 0.6429, F1: 0.6410
Val Precision: 0.6429, Recall: 0.6458

Epoch 7/50:
Train Loss: 0.1965, Accuracy: 0.9360, F1: 0.9310
Val Loss: 0.7473, Accuracy: 0.5714, F1: 0.5625
Val Precis



Epoch 1/50:
Train Loss: 0.7481, Accuracy: 0.5920, F1: 0.6107
Val Loss: 0.6899, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6543, Accuracy: 0.5680, F1: 0.5574
Val Loss: 0.6919, Accuracy: 0.5714, F1: 0.3636
Val Precision: 0.7857, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5499, Accuracy: 0.7680, F1: 0.7642
Val Loss: 0.6998, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.7143, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.4370, Accuracy: 0.8240, F1: 0.8226
Val Loss: 0.7099, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.7143, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.3454, Accuracy: 0.8560, F1: 0.8500
Val Loss: 0.6458, Accuracy: 0.7143, F1: 0.6889
Val Precision: 0.7250, Recall: 0.6875

Epoch 6/50:
Train Loss: 0.2807, Accuracy: 0.9120, F1: 0.9060
Val Loss: 0.6182, Accuracy: 0.6429, F1: 0.6410
Val Precision: 0.6778, Recall: 0.6667

Epoch 7/50:
Train Loss: 0.2092, Accuracy: 0.9440, F1: 0.9402
Val Loss: 0.5835, Accuracy: 0.7857, F1: 0.7846
Val Precis



Epoch 1/50:
Train Loss: 0.8290, Accuracy: 0.4480, F1: 0.4889
Val Loss: 0.6909, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6662, Accuracy: 0.6400, F1: 0.6667
Val Loss: 0.6821, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.6034, Accuracy: 0.6800, F1: 0.6774
Val Loss: 0.6803, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.5533, Accuracy: 0.7600, F1: 0.7368
Val Loss: 0.7021, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.3798, Accuracy: 0.8560, F1: 0.8475
Val Loss: 0.7575, Accuracy: 0.2857, F1: 0.2708
Val Precision: 0.2667, Recall: 0.2857

Epoch 6/50:
Train Loss: 0.3840, Accuracy: 0.8480, F1: 0.8319
Val Loss: 0.6998, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.2308, Recall: 0.4286

Epoch 7/50:
Train Loss: 0.2168, Accuracy: 0.9200, F1: 0.9138
Val Loss: 0.5128, Accuracy: 0.7857, F1: 0.7846
Val Precis



Epoch 1/50:
Train Loss: 0.7427, Accuracy: 0.5280, F1: 0.5042
Val Loss: 0.6916, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6173, Accuracy: 0.6880, F1: 0.6929
Val Loss: 0.6900, Accuracy: 0.5714, F1: 0.4750
Val Precision: 0.7692, Recall: 0.5714

Epoch 3/50:
Train Loss: 0.5156, Accuracy: 0.7920, F1: 0.7937
Val Loss: 0.6869, Accuracy: 0.5000, F1: 0.4759
Val Precision: 0.5000, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.3327, Accuracy: 0.8880, F1: 0.8852
Val Loss: 0.6862, Accuracy: 0.5000, F1: 0.4269
Val Precision: 0.5000, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.2323, Accuracy: 0.8960, F1: 0.8960
Val Loss: 0.6861, Accuracy: 0.5000, F1: 0.4269
Val Precision: 0.5000, Recall: 0.5000

Epoch 6/50:
Train Loss: 0.2549, Accuracy: 0.9040, F1: 0.9000
Val Loss: 0.6747, Accuracy: 0.4286, F1: 0.4167
Val Precision: 0.4222, Recall: 0.4286

Epoch 7/50:
Train Loss: 0.1381, Accuracy: 0.9360, F1: 0.9310
Val Loss: 0.8235, Accuracy: 0.5714, F1: 0.5625
Val Precis



Epoch 1/50:
Train Loss: 0.7352, Accuracy: 0.5440, F1: 0.6014
Val Loss: 0.6930, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6727, Accuracy: 0.6160, F1: 0.6250
Val Loss: 0.6982, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5212, Accuracy: 0.7840, F1: 0.7840
Val Loss: 0.6836, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.3240, Accuracy: 0.8880, F1: 0.8833
Val Loss: 0.6780, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.4068, Accuracy: 0.8320, F1: 0.8108
Val Loss: 0.7509, Accuracy: 0.5000, F1: 0.4759
Val Precision: 0.5000, Recall: 0.5000

Epoch 6/50:
Train Loss: 0.4210, Accuracy: 0.8480, F1: 0.8348
Val Loss: 0.6013, Accuracy: 0.7143, F1: 0.6889
Val Precision: 0.8182, Recall: 0.7143

Epoch 7/50:
Train Loss: 0.2127, Accuracy: 0.9360, F1: 0.9322
Val Loss: 0.9870, Accuracy: 0.5714, F1: 0.4750
Val Precis



Epoch 1/50:
Train Loss: 0.7302, Accuracy: 0.5200, F1: 0.5946
Val Loss: 0.6919, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6087, Accuracy: 0.6480, F1: 0.6812
Val Loss: 0.6931, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.4979, Accuracy: 0.7760, F1: 0.7778
Val Loss: 0.7004, Accuracy: 0.4286, F1: 0.3778
Val Precision: 0.3939, Recall: 0.4286

Epoch 4/50:
Train Loss: 0.3233, Accuracy: 0.8800, F1: 0.8760
Val Loss: 0.7666, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.2308, Recall: 0.4286

Epoch 5/50:
Train Loss: 0.2828, Accuracy: 0.8880, F1: 0.8833
Val Loss: 0.8191, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 6/50:
Train Loss: 0.2264, Accuracy: 0.9280, F1: 0.9256
Val Loss: 0.8893, Accuracy: 0.2857, F1: 0.2857
Val Precision: 0.2857, Recall: 0.2857

Epoch 7/50:
Train Loss: 0.1935, Accuracy: 0.9600, F1: 0.9587
Val Loss: 1.0873, Accuracy: 0.3571, F1: 0.3262
Val Precis



Epoch 1/50:
Train Loss: 0.7082, Accuracy: 0.5440, F1: 0.5043
Val Loss: 0.6938, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.7081, Accuracy: 0.6080, F1: 0.5739
Val Loss: 0.6963, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.2308, Recall: 0.4286

Epoch 3/50:
Train Loss: 0.5631, Accuracy: 0.7600, F1: 0.7500
Val Loss: 0.7010, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.2308, Recall: 0.4286

Epoch 4/50:
Train Loss: 0.4358, Accuracy: 0.8000, F1: 0.7863
Val Loss: 0.6883, Accuracy: 0.4286, F1: 0.3000
Val Precision: 0.2308, Recall: 0.4286

Epoch 5/50:
Train Loss: 0.2713, Accuracy: 0.9120, F1: 0.9120
Val Loss: 0.6462, Accuracy: 0.5714, F1: 0.4750
Val Precision: 0.7692, Recall: 0.5714

Epoch 6/50:
Train Loss: 0.2369, Accuracy: 0.9200, F1: 0.9153
Val Loss: 0.6185, Accuracy: 0.6429, F1: 0.6410
Val Precision: 0.6458, Recall: 0.6429

Epoch 7/50:
Train Loss: 0.1806, Accuracy: 0.9200, F1: 0.9153
Val Loss: 1.4266, Accuracy: 0.6429, F1: 0.6257
Val Precis



Epoch 1/50:
Train Loss: 0.7126, Accuracy: 0.5440, F1: 0.5128
Val Loss: 0.6914, Accuracy: 0.7143, F1: 0.6889
Val Precision: 0.8182, Recall: 0.7143

Epoch 2/50:
Train Loss: 0.6135, Accuracy: 0.6800, F1: 0.6610
Val Loss: 0.6976, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5365, Accuracy: 0.7760, F1: 0.7742
Val Loss: 0.6890, Accuracy: 0.5000, F1: 0.3333
Val Precision: 0.7500, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.3348, Accuracy: 0.8720, F1: 0.8667
Val Loss: 0.7330, Accuracy: 0.5714, F1: 0.5625
Val Precision: 0.5778, Recall: 0.5714

Epoch 5/50:
Train Loss: 0.1909, Accuracy: 0.9440, F1: 0.9381
Val Loss: 0.8009, Accuracy: 0.2857, F1: 0.2708
Val Precision: 0.2667, Recall: 0.2857

Epoch 6/50:
Train Loss: 0.1983, Accuracy: 0.9280, F1: 0.9231
Val Loss: 0.7938, Accuracy: 0.5714, F1: 0.5333
Val Precision: 0.6061, Recall: 0.5714

Epoch 7/50:
Train Loss: 0.1532, Accuracy: 0.9440, F1: 0.9431
Val Loss: 1.1920, Accuracy: 0.5000, F1: 0.4759
Val Precis



Epoch 1/50:
Train Loss: 0.7171, Accuracy: 0.5397, F1: 0.5606
Val Loss: 0.6904, Accuracy: 0.5385, F1: 0.3500
Val Precision: 0.7692, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.6581, Accuracy: 0.6270, F1: 0.6713
Val Loss: 0.6937, Accuracy: 0.4615, F1: 0.3158
Val Precision: 0.7308, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5107, Accuracy: 0.7460, F1: 0.7460
Val Loss: 0.6833, Accuracy: 0.5385, F1: 0.3500
Val Precision: 0.7692, Recall: 0.5000

Epoch 4/50:
Train Loss: 0.4630, Accuracy: 0.8254, F1: 0.8281
Val Loss: 0.7339, Accuracy: 0.4615, F1: 0.3158
Val Precision: 0.7308, Recall: 0.5000

Epoch 5/50:
Train Loss: 0.4220, Accuracy: 0.8095, F1: 0.8065
Val Loss: 0.6545, Accuracy: 0.4615, F1: 0.3158
Val Precision: 0.2500, Recall: 0.4286

Epoch 6/50:
Train Loss: 0.2006, Accuracy: 0.9683, F1: 0.9661
Val Loss: 0.7229, Accuracy: 0.3077, F1: 0.2909
Val Precision: 0.2875, Recall: 0.2976

Epoch 7/50:
Train Loss: 0.1458, Accuracy: 0.9603, F1: 0.9593
Val Loss: 1.5312, Accuracy: 0.4615, F1: 0.3158
Val Precis

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import sys


# -----------------------------
# 1. Text Preprocessing Classes
# -----------------------------
class TextPreprocessor:
    def __init__(self, max_vocab_size=15000, max_seq_length=128):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_counts = Counter()
        
    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        text = ' '.join(text.split())
        return text
        
    def fit(self, texts):
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            self.word_counts.update(words)
        
        # Reserve two indices for PAD and UNK tokens
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        for word in vocab_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, texts):
        sequences = []
        for text in texts:
            cleaned_text = self.clean_text(text)
            words = cleaned_text.split()
            # Truncate or pad sequences
            seq = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words[:self.max_seq_length]]
            seq = seq + [self.word2idx['<PAD>']] * (self.max_seq_length - len(seq))
            sequences.append(seq)
        return torch.tensor(sequences)

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# -----------------------------
# 2. Define the Model
# -----------------------------
class ImprovedTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2):
        super(ImprovedTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.2 if num_layers > 1 else 0.1)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True,
                            dropout=0.2 if num_layers > 1 else 0)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(hidden_dim // 2, num_classes)
        
    def attention_net(self, lstm_output):
        # Compute attention weights and context vector
        energy = torch.tanh(self.attention(lstm_output))
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention_net(lstm_out)
        x = self.fc1(attn_out)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# -----------------------------
# 3. Function to Load Pretrained Embeddings (GloVe)
# -----------------------------
def load_pretrained_embeddings(embedding_path, word2idx, embed_dim):
    print("Loading pretrained embeddings...")
    sys.stdout.flush()
    # Initialize embeddings with a uniform distribution
    embeddings = np.random.uniform(-0.05, 0.05, (len(word2idx), embed_dim))
    embeddings[word2idx['<PAD>']] = np.zeros(embed_dim)
    found = 0
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] != embed_dim:
                continue  # Skip if dimensions mismatch
            if word in word2idx:
                embeddings[word2idx[word]] = vector
                found += 1
    print(f"Found {found} pretrained embeddings out of {len(word2idx)} words", flush=True)
    sys.stdout.flush()
    return torch.tensor(embeddings, dtype=torch.float)

# -----------------------------
# 4. Training Function
# -----------------------------
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, preprocessor, config):
    best_f1 = 0
    #patience = 5
    # patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        
        for batch_texts, batch_labels in train_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            train_preds.extend(predictions.cpu().numpy())
            train_labels.extend(batch_labels.cpu().numpy())
        
        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_texts, batch_labels in val_loader:
                batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
                outputs = model(batch_texts)
                loss = criterion(outputs, batch_labels)
                val_loss += loss.item()
                predictions = torch.argmax(outputs, dim=1)
                val_preds.extend(predictions.cpu().numpy())
                val_labels.extend(batch_labels.cpu().numpy())
        
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')
        train_accuracy = accuracy_score(train_labels, train_preds)
        
        val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro', zero_division=1)
        val_accuracy = accuracy_score(val_labels, val_preds)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}:', flush=True)
        sys.stdout.flush()

        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, F1: {train_f1:.4f}', flush=True)
        sys.stdout.flush()

        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}', flush=True)
        sys.stdout.flush()
        print(f'Val Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n', flush=True)
        sys.stdout.flush()
        
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'model_state_dict': model.state_dict(),
                'preprocessor': preprocessor,
                'config': config
            }, 'best_model.pt')
            #patience_counter = 0
            #patience_counter += 1
            
        # if patience_counter >= patience:

        #     print(f'Early stopping triggered after epoch {epoch+1}')
        #     break

# -----------------------------
# 5. Configuration and Data Loading
# -----------------------------
config = {
    'max_vocab_size': 15000,
    'max_seq_length': 128,
    'embed_dim': 300,  # Must match the dimension of the pretrained embeddings
    'hidden_dim': 256,
    'batch_size': 16,
    'learning_rate': 0.001,
    'num_epochs': 50,
    'num_lstm_layers': 2,
    'pretrained_embedding_path': '/Users/User/CSProjects/CSC392_AI_agent/emphatic-AI-Winter2025/glove.6B/glove.6B.300d.txt'  # Update this path as needed
}  

# Load your datasets (assumed to be in TSV format)
train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

# Map your string labels to numerical values
label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}
train_df['Detected as'] = train_df['Detected as'].map(label_mapping)
test_df['Detected as'] = test_df['Detected as'].map(label_mapping)

# -----------------------------
# 6. Preprocessing
# -----------------------------
preprocessor = TextPreprocessor(max_vocab_size=config['max_vocab_size'], 
                                max_seq_length=config['max_seq_length'])
preprocessor.fit(train_df['Sentence'])

X_train = preprocessor.transform(train_df['Sentence'])
X_val = preprocessor.transform(test_df['Sentence'])
y_train = torch.tensor(train_df['Detected as'].values)
y_val = torch.tensor(test_df['Detected as'].values)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

x_np = X_train.numpy()
y_np = y_train.numpy()

# ----------------------------
# 7. 10-Fold Cross Validation
# ----------------------------

num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(x_np, y_np)): #here
    print(f"\n------- Fold {fold+1} / {num_folds} --------- ", flush=True)
    sys.stdout.flush()


    # -----------------------------
    # 1. Split Data
    # -----------------------------
    X_train_fold = torch.tensor(x_np[train_idx])
    y_train_fold = torch.tensor(y_np[train_idx])
    X_val_fold = torch.tensor(x_np[val_idx])
    y_val_fold = torch.tensor(y_np[val_idx])

    train_dataset = TextDataset(X_train_fold, y_train_fold)
    val_dataset = TextDataset(X_val_fold, y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # -----------------------------
    # 2. Initialize Model, Optimizer, and Criterion Inside the Loop
    # -----------------------------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = ImprovedTextClassifier(
        vocab_size=len(preprocessor.word2idx),
        embed_dim=config['embed_dim'],
        hidden_dim=config['hidden_dim'],
        num_classes=2,
    ).to(device)

    # Load pretrained embeddings and replace the embedding layer weights
    pretrained_weights = load_pretrained_embeddings(config['pretrained_embedding_path'], 
                                                    preprocessor.word2idx, 
                                                    config['embed_dim'])
    model.embedding = nn.Embedding.from_pretrained(pretrained_weights, freeze=False, padding_idx=0)

    # Compute class weights to handle class imbalance
    small_factor = 1e-4
    class_counts = torch.bincount(y_train_fold) + small_factor  # Avoid division by zero
    class_weights = 1.0 / class_counts.float()
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)

    # criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=0.01)
    # scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)


    # -----------------------------
    # 3. Train the Model
    # -----------------------------
    train_model(model, train_loader, val_loader, criterion, optimizer, scheduler,
                config['num_epochs'], device, preprocessor, config)

    # Load the best model for this fold
    with torch.serialization.safe_globals([TextPreprocessor]):
        checkpoint = torch.load('best_model.pt', weights_only=False)

    # Save best model after training
    # torch.save(model.state_dict(), f'best_model_fold{fold+1}.pt')

    # # Load best model for evaluation
    # model.load_state_dict(torch.load(f'best_model_fold{fold+1}.pt'))


    #model.load_state_dict(checkpoint['model_state_dict'])

    # -----------------------------
    # 4. Evaluate the Model
    # -----------------------------
    model.eval()
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch_texts, batch_labels in val_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            outputs = model(batch_texts)
            predictions = torch.argmax(outputs, dim=1)
            val_preds.extend(predictions.cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())

    # Compute metrics
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Fold {fold+1} Metrics:", flush=True)
    sys.stdout.flush()
    print(f"Accuracy: {val_accuracy:.4f}, F1-score: {val_f1:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}\n", flush=True)
    sys.stdout.flush()


    fold_results.append((val_accuracy, val_f1, val_precision, val_recall))

# -----------------------------
# 5. Display Overall Results
# -----------------------------

avg_results = np.mean(fold_results, axis=0)
print(f"\nAverage Metrics Over {num_folds} Folds:")
sys.stdout.flush()

print(f"Accuracy: {avg_results[0]:.4f}, F1-score: {avg_results[1]:.4f}, Precision: {avg_results[2]:.4f}, Recall: {avg_results[3]:.4f}")
sys.stdout.flush()


avg_accuracy = sum([x[0] for x in fold_results]) / num_folds
avg_f1 = sum([x[1] for x in fold_results]) / num_folds
avg_precision = sum([x[2] for x in fold_results]) / num_folds
avg_recall = sum([x[3] for x in fold_results]) / num_folds

print("\n========== Final Cross-Validation Results ==========")
sys.stdout.flush()

print(f"Avg Accuracy: {avg_accuracy:.4f}, Avg F1-score: {avg_f1:.4f}, Avg Precision: {avg_precision:.4f}, Avg Recall: {avg_recall:.4f}")
sys.stdout.flush()



------- Fold 1 / 5 --------- 
Loading pretrained embeddings...
Found 781 pretrained embeddings out of 999 words
Epoch 1/50:
Train Loss: 0.7789, Accuracy: 0.4955, F1: 0.5254
Val Loss: 0.6883, Accuracy: 0.5357, F1: 0.3488
Val Precision: 0.7679, Recall: 0.5000

Epoch 2/50:
Train Loss: 0.5917, Accuracy: 0.7027, F1: 0.7080
Val Loss: 0.6902, Accuracy: 0.5357, F1: 0.3488
Val Precision: 0.7679, Recall: 0.5000

Epoch 3/50:
Train Loss: 0.5127, Accuracy: 0.7748, F1: 0.7706
Val Loss: 0.6946, Accuracy: 0.5357, F1: 0.5303
Val Precision: 0.5500, Recall: 0.5462

Epoch 4/50:
Train Loss: 0.3718, Accuracy: 0.8649, F1: 0.8544
Val Loss: 0.6756, Accuracy: 0.7857, F1: 0.7846
Val Precision: 0.7846, Recall: 0.7846

Epoch 5/50:
Train Loss: 0.2396, Accuracy: 0.9099, F1: 0.9038
Val Loss: 0.6605, Accuracy: 0.8214, F1: 0.8212
Val Precision: 0.8214, Recall: 0.8231

Epoch 6/50:
Train Loss: 0.3198, Accuracy: 0.8649, F1: 0.8624
Val Loss: 0.6719, Accuracy: 0.6429, F1: 0.6410
Val Precision: 0.6578, Recall: 0.6513

Epoch