In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configuration
RANDOM_SEED = 42
BATCH_SIZE = 16
MAX_LEN = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
NUM_CLASSES = 2

# Set random seeds
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load dataset
df = pd.read_csv('/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/data/synthetic_data/ambiguous_prompts_dataset_distinct.csv')

# Split dataset
X_train, X_val, y_train, y_val = train_test_split(
    df['text'], df['label'],
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['label']
)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create PyTorch Dataset
class AmbiguityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create data loaders
train_dataset = AmbiguityDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = AmbiguityDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE
)

# Create classifier model
class AmbiguityClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AmbiguityClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

model = AmbiguityClassifier(NUM_CLASSES).to(device)

# Set up training
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss().to(device)

# Training loop
def train_model():
    best_accuracy = 0
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)
        
        # Training phase
        model.train()
        train_loss = 0
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
        avg_train_loss = train_loss / len(train_loader)
        
        # Validation phase
        model.eval()
        val_loss = 0
        correct_preds = 0
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()
                
                _, preds = torch.max(outputs, dim=1)
                correct_preds += torch.sum(preds == labels)
                
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct_preds.double() / len(val_dataset)
        
        print(f'Train loss: {avg_train_loss:.4f}')
        print(f'Val loss: {avg_val_loss:.4f}')
        print(f'Val accuracy: {val_accuracy:.4f}\n')
        
        if val_accuracy > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_accuracy
            
    print(f'Best validation accuracy: {best_accuracy:.4f}')

# Start training
train_model()

# Evaluation
def evaluate_model(model_path):
    model.load_state_dict(torch.load(model_path))
    model.eval()
    
    y_true = []
    y_pred = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            
    print(classification_report(y_true, y_pred))

evaluate_model('best_model_state.bin')

# Example prediction
def predict_ambiguity(text):
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, prediction = torch.max(outputs, dim=1)
        
    return 'Ambiguous' if prediction.item() == 1 else 'Clear'

# Test with sample input
test_text = "How do I fix my computer?"
print(f"Input: '{test_text}'\nPrediction: {predict_ambiguity(test_text)}")

Epoch 1/10
----------
Train loss: 0.3600
Val loss: 0.2187
Val accuracy: 1.0000

Epoch 2/10
----------
Train loss: 0.1664
Val loss: 0.1129
Val accuracy: 1.0000

Epoch 3/10
----------
Train loss: 0.0891
Val loss: 0.0498
Val accuracy: 1.0000

Epoch 4/10
----------
Train loss: 0.0408
Val loss: 0.0226
Val accuracy: 1.0000

Epoch 5/10
----------
Train loss: 0.0232
Val loss: 0.0128
Val accuracy: 1.0000

Epoch 6/10
----------
Train loss: 0.0141
Val loss: 0.0080
Val accuracy: 1.0000

Epoch 7/10
----------
Train loss: 0.0098
Val loss: 0.0058
Val accuracy: 1.0000

Epoch 8/10
----------
Train loss: 0.0069
Val loss: 0.0047
Val accuracy: 1.0000

Epoch 9/10
----------
Train loss: 0.0057
Val loss: 0.0037
Val accuracy: 1.0000

Epoch 10/10
----------
Train loss: 0.0047
Val loss: 0.0031
Val accuracy: 1.0000

Best validation accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        10

  

In [2]:
test_text = "I'm confused"
print(f"Input: '{test_text}'\nPrediction: {predict_ambiguity(test_text)}")

Input: 'I'm confused'
Prediction: Clear


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import spacy
import re

class ClarificationNeedDetector:
    def __init__(self):
        # Load language model for advanced NLP features
        self.nlp = spacy.load('en_core_web_sm')
        
        # Features to extract
        self.features = {
            'ambiguity_score': self.calculate_ambiguity,
            'vagueness_score': self.calculate_vagueness,
            'complexity_score': self.calculate_complexity,
            'context_sufficiency': self.check_context_sufficiency
        }
    
    def preprocess_text(self, text):
        """
        Preprocess input text for feature extraction
        
        Args:
            text (str): Input query
        
        Returns:
            str: Preprocessed text
        """
        # Convert to lowercase
        text = text.lower()
        
        # Remove special characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        return text
    
    def calculate_ambiguity(self, text):
        """
        Calculate text ambiguity based on multiple potential interpretations
        
        Args:
            text (str): Input query
        
        Returns:
            float: Ambiguity score
        """
        doc = self.nlp(text)
        
        # Check for multiple possible interpretations
        # Look at POS tag diversity and potential word sense ambiguity
        pos_diversity = len(set(token.pos_ for token in doc))
        ambiguous_words = sum(1 for token in doc if len(token._.loaded_lexemes) > 1)
        
        return (pos_diversity + ambiguous_words) / len(doc.tokens)
    
    def calculate_vagueness(self, text):
        """
        Measure vagueness through qualitative language
        
        Args:
            text (str): Input query
        
        Returns:
            float: Vagueness score
        """
        vague_words = [
            'thing', 'stuff', 'something', 'anything', 
            'somewhere', 'somehow', 'kind of', 'sort of'
        ]
        
        vague_word_count = sum(word in text.lower() for word in vague_words)
        return vague_word_count / len(text.split())
    
    def calculate_complexity(self, text):
        """
        Assess query complexity through linguistic features
        
        Args:
            text (str): Input query
        
        Returns:
            float: Complexity score
        """
        doc = self.nlp(text)
        
        # Measure clause complexity
        clause_count = sum(1 for token in doc if token.dep_ == 'ROOT')
        
        # Check for embedded clauses or complex syntax
        embedded_clause_complexity = sum(
            1 for token in doc 
            if token.dep_ in ['acl', 'advcl', 'ccomp', 'xcomp']
        )
        
        return (clause_count + embedded_clause_complexity) / len(doc)
    
    def check_context_sufficiency(self, text):
        """
        Evaluate if query provides sufficient context
        
        Args:
            text (str): Input query
        
        Returns:
            float: Context sufficiency score
        """
        doc = self.nlp(text)
        
        # Check for specific context markers
        context_markers = ['because', 'since', 'although', 'however']
        context_marker_count = sum(1 for token in doc if token.text.lower() in context_markers)
        
        # Check for named entities which provide context
        named_entity_count = len(list(doc.ents))
        
        return (context_marker_count + named_entity_count) / len(doc.tokens)
    
    def extract_features(self, texts):
        """
        Extract features for multiple texts
        
        Args:
            texts (list): List of input queries
        
        Returns:
            pd.DataFrame: Extracted features
        """
        feature_matrix = []
        
        for text in texts:
            preprocessed_text = self.preprocess_text(text)
            text_features = {
                name: func(preprocessed_text) 
                for name, func in self.features.items()
            }
            feature_matrix.append(text_features)
        
        return pd.DataFrame(feature_matrix)
    
    def train_clarification_model(self, training_data, labels):
        """
        Train a classification model to predict clarification need
        
        Args:
            training_data (list): Training queries
            labels (list): Binary labels (need clarification or not)
        
        Returns:
            Pipeline: Trained ML pipeline
        """
        # Extract features
        X = self.extract_features(training_data)
        y = labels
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Create ML pipeline
        pipeline = Pipeline([
            ('classifier', RandomForestClassifier(n_estimators=100))
        ])
        
        # Fit model
        pipeline.fit(X_train, y_train)
        
        # Evaluate
        y_pred = pipeline.predict(X_test)
        print(classification_report(y_test, y_pred))
        
        return pipeline
    
    def predict_clarification_need(self, query, model):
        """
        Predict whether a query needs clarification
        
        Args:
            query (str): Input query
            model (Pipeline): Trained classification model
        
        Returns:
            bool: Whether clarification is needed
        """
        features = self.extract_features([query])
        prediction = model.predict(features)
        return bool(prediction[0])

# Example usage
detector = ClarificationNeedDetector()

# Simulated training data
training_queries = [
    "Tell me about that thing",
    "What's the best way to solve this problem?",
    "I need help with my project",
    "Explain the process of photosynthesis in plants"
]
training_labels = [1, 1, 1, 0]  # 1 means needs clarification

# Train model
clarification_model = detector.train_clarification_model(
    training_queries, 
    training_labels
)

# Test prediction
test_query = "Tell me about something"
needs_clarification = detector.predict_clarification_need(
    test_query, 
    clarification_model
)
print(f"Query needs clarification: {needs_clarification}")

NameError: name 'clarification_model' is not defined