In [1]:
import pandas as pd
from datasets import Dataset

# Load your existing dataset
df = pd.read_csv('/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/data/synthetic_data/ambiguous_prompts_dataset_distinct.csv')

# Format for generative fine-tuning
formatted_data = []
for _, row in df.iterrows():
    text = row['text']
    is_ambiguous = row['label'] == 1
    
    # Create instruction-based examples
    instruction = "Analyze the following prompt and determine if it is ambiguous. If it is ambiguous, explain why and suggest a clarifying question."
    
    if is_ambiguous:
        output = "This prompt is ambiguous. "
        # You can add specific reasons based on patterns in the text
        if "this" in text.lower() or "that" in text.lower():
            output += "It contains demonstrative pronouns without clear referents. "
        elif len(text.split()) < 5:
            output += "It is too brief and lacks necessary context. "
        else:
            output += "It lacks specific details needed for a clear response. "
        
        output += "A good clarifying question would be: "
        if "this" in text.lower() or "that" in text.lower():
            output += f"\"Could you specify what you're referring to in your prompt?\""
        elif "how" in text.lower():
            output += f"\"Could you provide more details about what you're trying to accomplish?\""
        else:
            output += f"\"Could you provide more specific information about your request?\""
    else:
        output = "This prompt is clear and specific. No clarification is needed."
    
    formatted_data.append({
        "instruction": instruction,
        "input": text,
        "output": output
    })

# Create a Hugging Face dataset
gen_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

# Split into train and validation
gen_dataset = gen_dataset.train_test_split(test_size=0.1)

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import nltk
from nltk.tokenize import word_tokenize
import os
import gc
import random

# Download NLTK resources (only need to run once)
nltk.download('punkt', quiet=True)

# Set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Configuration
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
MODEL_NAME = "distilbert-base-uncased"  # Lightweight model
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_DIR = "ambiguity_model"

# Create output directory
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Load and prepare the dataset
print("Loading dataset...")
df = pd.read_csv('/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/data/synthetic_data/ambiguous_prompts_dataset_distinct.csv')

# Define ambiguity types for feature engineering
ambiguity_types = [
    "referential",    # Unclear what a pronoun or descriptor refers to
    "lexical",        # Words with multiple possible meanings
    "syntactic",      # Sentence structure creates multiple interpretations
    "scope",          # Unclear scope of quantifiers or modifiers
    "underspecified", # Missing necessary details
    "vague_term"      # Using inherently vague terms (e.g., "soon", "large")
]

# Function to analyze text and identify potential ambiguities
def analyze_ambiguity(text):
    detected_types = []
    ambiguous_elements = []
    
    # Simple rule-based detection
    tokens = word_tokenize(text.lower())
    
    # Check for referential ambiguity
    if any(word in tokens for word in ["it", "this", "that", "these", "those"]):
        detected_types.append("referential")
        ambiguous_elements.append("unclear pronoun reference")
    
    # Check for vague terms
    vague_terms = ["soon", "many", "few", "several", "some", "a lot", "big", "small"]
    if any(term in ' '.join(tokens) for term in vague_terms):
        detected_types.append("vague_term")
        for term in vague_terms:
            if term in ' '.join(tokens):
                ambiguous_elements.append(f"vague term: {term}")
    
    # Check for underspecification
    if len(tokens) < 7:
        detected_types.append("underspecified")
        ambiguous_elements.append("too brief")
    
    return detected_types, ambiguous_elements

# Enhance dataset with features
print("Enhancing dataset with features...")
features = []
for _, row in df.iterrows():
    text = row['text']
    is_ambiguous = row['label'] == 1
    
    # Analyze ambiguity
    detected_types, ambiguous_elements = analyze_ambiguity(text)
    
    # Create feature dict
    feature = {
        "text": text,
        "is_ambiguous": is_ambiguous,
        "ambiguity_types": detected_types,
        "ambiguous_elements": ambiguous_elements,
        "word_count": len(text.split())
    }
    features.append(feature)

# Create enhanced DataFrame
enhanced_df = pd.DataFrame(features)

# Split into train and validation sets
train_df, val_df = train_test_split(enhanced_df, test_size=0.2, random_state=42, stratify=enhanced_df['is_ambiguous'])

print(f"Training with {len(train_df)} examples, validating with {len(val_df)} examples")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create dataset class
class AmbiguityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = AmbiguityDataset(
    texts=train_df['text'].tolist(),
    labels=train_df['is_ambiguous'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

val_dataset = AmbiguityDataset(
    texts=val_df['text'].tolist(),
    labels=val_df['is_ambiguous'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2
)

# Load model
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(DEVICE)

# Handle 16-bit precision for efficiency
fp16 = False
if torch.cuda.is_available():
    try:
        from torch.cuda.amp import autocast
        fp16 = True
        print("Using mixed precision training")
    except ImportError:
        print("Mixed precision training not available")

# Prepare optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training function
def train_epoch(model, data_loader, optimizer, scheduler, device, fp16):
    model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0
    
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Clear gradients
        optimizer.zero_grad()
        
        # Forward pass with or without mixed precision
        if fp16:
            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
        
        # Get predictions
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += len(labels)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        
        losses.append(loss.item())
        
        # Release memory
        del input_ids, attention_mask, labels, outputs, loss
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    return correct_predictions.double() / total_predictions, np.mean(losses)

# Evaluation function
def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0
    
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            
            correct_predictions += torch.sum(preds == labels)
            total_predictions += len(labels)
            
            losses.append(loss.item())
            
            all_predictions.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            
            # Release memory
            del input_ids, attention_mask, labels, outputs, loss
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    
    return (
        correct_predictions.double() / total_predictions,
        np.mean(losses),
        all_predictions,
        all_labels
    )

# Train the model
print("Starting training...")
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    
    # Train
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=DEVICE,
        fp16=fp16
    )
    
    print(f'Train loss: {train_loss} | Train accuracy: {train_acc}')
    
    # Evaluate
    val_acc, val_loss, predictions, truth = eval_model(
        model=model,
        data_loader=val_loader,
        device=DEVICE
    )
    
    print(f'Val loss: {val_loss} | Val accuracy: {val_acc}')
    print('\n')
    
    # Save the best model
    if val_acc > best_accuracy:
        print(f"Saving best model with accuracy: {val_acc}")
        best_accuracy = val_acc
        # Save the model
        torch.save(model.state_dict(), f'{OUTPUT_DIR}/best_model.bin')
        # Save the tokenizer
        tokenizer.save_pretrained(f'{OUTPUT_DIR}')
    
    # Force garbage collection
    gc.collect()

# Load the best model for final evaluation
model.load_state_dict(torch.load(f'{OUTPUT_DIR}/best_model.bin'))

# Final evaluation
val_acc, val_loss, predictions, truth = eval_model(
    model=model,
    data_loader=val_loader,
    device=DEVICE
)

# Print classification report
print("Classification Report:")
print(classification_report(truth, predictions))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(truth, predictions))

# Save model and tokenizer
print("Saving model...")
model.save_pretrained(f'{OUTPUT_DIR}/final_model')
tokenizer.save_pretrained(f'{OUTPUT_DIR}/final_model')

print("Training complete!")

# Example of how to use the trained model for inference
def predict_ambiguity(text, model, tokenizer, device):
    # Prepare the text
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Get prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
    
    # Get confidence scores
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    confidence = probs[0][preds[0]].item()
    
    # Analyze ambiguity types if predicted as ambiguous
    is_ambiguous = bool(preds[0].item())
    
    if is_ambiguous:
        detected_types, ambiguous_elements = analyze_ambiguity(text)
        
        # Generate clarifying question based on ambiguity type
        clarifying_question = "Could you please clarify your request with more specific details?"
        
        if "referential" in detected_types:
            clarifying_question = "What specifically are you referring to in your request?"
        elif "vague_term" in detected_types:
            vague_words = [elem.split(": ")[1] for elem in ambiguous_elements if "vague term" in elem]
            if vague_words:
                clarifying_question = f"Could you be more specific about what you mean by '{vague_words[0]}'?"
        elif "underspecified" in detected_types:
            clarifying_question = "Could you provide more details about what you're trying to accomplish?"
        
        return {
            "is_ambiguous": True,
            "confidence": confidence,
            "ambiguity_types": detected_types,
            "ambiguous_elements": ambiguous_elements,
            "clarifying_question": clarifying_question
        }
    else:
        return {
            "is_ambiguous": False,
            "confidence": confidence
        }

# Example usage
print("\nTesting the model with example prompts:")
test_prompts = [
    "Write a summary of this article.",
    "Create a detailed analysis of the impact of climate change on global agriculture over the past decade.",
    "How do I do that?",
    "Can you explain the process of photosynthesis in detail?",
    "Fix it for me."
]

for prompt in test_prompts:
    result = predict_ambiguity(prompt, model, tokenizer, DEVICE)
    print(f"\nPrompt: {prompt}")
    print(f"Is ambiguous: {result['is_ambiguous']} (confidence: {result['confidence']:.2f})")
    if result['is_ambiguous']:
        print(f"Ambiguity types: {', '.join(result['ambiguity_types'])}")
        print(f"Clarifying question: {result['clarifying_question']}")

Loading dataset...
Enhancing dataset with features...
Training with 130 examples, validating with 33 examples
Loading tokenizer...




Loading model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...
Epoch 1/3




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'AmbiguityDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

: 