In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
warnings.filterwarnings('ignore')

# Compute device: prefer CUDA when it is available, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Run configuration (tunable knobs gathered in one place)
RANDOM_SEED = 42        # seed shared by torch, numpy and the train/val split
NUM_CLASSES = 2         # output width of the classification head
MAX_LEN = 64            # token length every sample is padded/truncated to
BATCH_SIZE = 16         # samples per DataLoader batch
EPOCHS = 10             # full passes over the training set
LEARNING_RATE = 2e-5    # AdamW step size

# Seed the random number generators for reproducibility
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Load dataset
# The CSV location can be overridden with the AMBIGUITY_DATA_PATH environment
# variable so the notebook also runs outside the original author's machine;
# the original absolute path is kept as the fallback to stay backward-compatible.
DATA_PATH = os.environ.get(
    'AMBIGUITY_DATA_PATH',
    '/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/data/synthetic_data/ambiguous_prompts_dataset_distinct.csv',
)
df = pd.read_csv(DATA_PATH)

# Fail early, with the offending path in the message, if the columns used
# below are absent (same exception type the column lookup would raise).
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset at {DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# Split dataset: 80% train / 20% validation, stratified on the label so both
# splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'], df['label'],
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['label']
)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create PyTorch Dataset
class AmbiguityDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        texts: Collection of raw texts supporting ``len()`` and positional
            ``.iloc`` access (e.g. a pandas Series).
        labels: Collection of class ids supporting positional ``.iloc`` access.
        tokenizer: Object exposing ``encode_plus``; used to turn a text into
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every encoded sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict.

        Keys: ``'text'`` (str), ``'input_ids'`` and ``'attention_mask'``
        (flattened tensors from the tokenizer) and ``'label'`` (long tensor).
        """
        # Positional lookup, so a shuffled pandas index (as produced by
        # train_test_split) does not matter.
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        sample = {'text': raw_text}
        # The tokenizer output is flattened to 1-D; the DataLoader adds the
        # batch dimension back when collating.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap each split in a tokenizing Dataset
train_dataset = AmbiguityDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = AmbiguityDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_len=MAX_LEN)

# Batch the datasets: training batches are reshuffled every epoch,
# validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Create classifier model
class AmbiguityClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear head.

    Args:
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        # The head consumes the encoder's pooled output, so its input width
        # is taken from the encoder configuration.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            pooled encoder output.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoder_out.pooler_output)
        logits = self.classifier(regularised)
        return logits

# Build the classifier and move its parameters to the selected device
model = AmbiguityClassifier(num_classes=NUM_CLASSES)
model = model.to(device)

# Training objective (cross-entropy over the class logits) and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
def train_model():
    """Fine-tune the global ``model`` and checkpoint the best epoch.

    Runs ``EPOCHS`` passes over ``train_loader``; after each pass the model is
    evaluated on ``val_loader`` and the train loss, validation loss and
    validation accuracy are printed. The model's ``state_dict`` is written to
    ``'best_model_state.bin'`` whenever the epoch is the best seen so far.

    "Best" means higher validation accuracy, with ties broken by lower
    validation loss. Without the tie-break, a run whose accuracy saturates in
    the first epoch keeps the first-epoch weights even though later epochs
    reach a much lower validation loss.

    Returns:
        None. Results are reported via ``print`` and the checkpoint file.
    """
    best_accuracy = 0.0
    best_val_loss = float('inf')

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        # Training phase
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct_preds = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                # Keep the running count as a plain Python int so the
                # accuracy (and best_accuracy) are ordinary floats rather
                # than device tensors.
                correct_preds += (preds == labels).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct_preds / len(val_dataset)

        print(f'Train loss: {avg_train_loss:.4f}')
        print(f'Val loss: {avg_val_loss:.4f}')
        print(f'Val accuracy: {val_accuracy:.4f}\n')

        # Checkpoint on better accuracy, or on equal accuracy with lower loss.
        # Starting best_val_loss at +inf also means the first epoch with a
        # finite validation loss is always saved, so a checkpoint file exists
        # even if accuracy never rises above 0.
        is_more_accurate = val_accuracy > best_accuracy
        is_tie_with_lower_loss = (
            val_accuracy == best_accuracy and avg_val_loss < best_val_loss
        )
        if is_more_accurate or is_tie_with_lower_loss:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_accuracy
            best_val_loss = avg_val_loss

    print(f'Best validation accuracy: {best_accuracy:.4f}')

# Start training: runs every epoch and writes the best checkpoint to 'best_model_state.bin'
train_model()

# Evaluation
def evaluate_model(model_path):
    """Load a checkpoint into the global ``model`` and report validation metrics.

    Iterates over ``val_loader``, collects the true and predicted class ids,
    and prints scikit-learn's ``classification_report`` for them.

    Args:
        model_path: Path to a ``state_dict`` file written by ``torch.save``.

    Returns:
        None. The report is printed.
    """
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only machine.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only collected for the report, never fed to the
            # model, so they are not moved to the device.
            labels = batch['label']

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)

            y_true.extend(labels.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    print(classification_report(y_true, y_pred))

# Reload the checkpoint saved during training and print per-class metrics on the validation set
evaluate_model('best_model_state.bin')

# Example prediction
def predict_ambiguity(text):
    """Classify a single text with the global ``model``.

    The text is tokenized with the same settings used for the training
    samples (padded/truncated to ``MAX_LEN``) and run through the model in
    evaluation mode.

    Args:
        text: The text to classify.

    Returns:
        ``'Ambiguous'`` when the predicted class id is 1, otherwise ``'Clear'``.
    """
    model.eval()

    tokenizer_kwargs = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(text, **tokenizer_kwargs)

    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(ids, mask)
        predicted_class = torch.max(logits, dim=1).indices

    if predicted_class.item() == 1:
        return 'Ambiguous'
    return 'Clear'

# Smoke-test the prediction helper on a single hand-written prompt
test_text = "How do I fix my computer?"
print(f"Input: '{test_text}'\nPrediction: {predict_ambiguity(test_text)}")

Epoch 1/10
----------
Train loss: 0.3600
Val loss: 0.2187
Val accuracy: 1.0000

Epoch 2/10
----------
Train loss: 0.1664
Val loss: 0.1129
Val accuracy: 1.0000

Epoch 3/10
----------
Train loss: 0.0891
Val loss: 0.0498
Val accuracy: 1.0000

Epoch 4/10
----------
Train loss: 0.0408
Val loss: 0.0226
Val accuracy: 1.0000

Epoch 5/10
----------
Train loss: 0.0232
Val loss: 0.0128
Val accuracy: 1.0000

Epoch 6/10
----------
Train loss: 0.0141
Val loss: 0.0080
Val accuracy: 1.0000

Epoch 7/10
----------
Train loss: 0.0098
Val loss: 0.0058
Val accuracy: 1.0000

Epoch 8/10
----------
Train loss: 0.0069
Val loss: 0.0047
Val accuracy: 1.0000

Epoch 9/10
----------
Train loss: 0.0057
Val loss: 0.0037
Val accuracy: 1.0000

Epoch 10/10
----------
Train loss: 0.0047
Val loss: 0.0031
Val accuracy: 1.0000

Best validation accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        10

  

In [8]:
# Probe the classifier with a crisis message.
# predict_ambiguity only ever returns 'Ambiguous' or 'Clear', so its result
# describes ambiguity only and is not a risk or urgency assessment.
# NOTE(review): the recorded output for this input is 'Clear' — confirm that
# nothing downstream treats 'Clear' as meaning the message is safe to handle
# without escalation.
test_text = "I'm feeling suicidal. I'm going to kill myself now"
print(f"Input: '{test_text}'\nPrediction: {predict_ambiguity(test_text)}")

Input: 'I'm feeling suicidal. I'm going to kill myself now'
Prediction: Clear


: 