In [1]:
import re

import pandas as pd
from datasets import Dataset

# Load your existing dataset
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the repository so the notebook runs on other machines.
DATA_PATH = '/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/data/synthetic_data/ambiguous_prompts_dataset_distinct.csv'
df = pd.read_csv(DATA_PATH)

# Seed for the train/validation split so Restart & Run All yields the same split.
SPLIT_SEED = 42

# Instruction shared by every generated example.
INSTRUCTION = "Analyze the following prompt and determine if it is ambiguous. If it is ambiguous, explain why and suggest a clarifying question."

# Whole-word, case-insensitive patterns. A plain substring test would also fire
# on unrelated words ("how" inside "show"/"however", "that" inside "thatch").
_DEMONSTRATIVE_RE = re.compile(r"\b(?:this|that)\b", re.IGNORECASE)
_HOW_RE = re.compile(r"\bhow\b", re.IGNORECASE)


def _build_output(text, is_ambiguous):
    """Build the target explanation for a single prompt.

    Args:
        text: The raw prompt string.
        is_ambiguous: True when the prompt is labelled ambiguous.

    Returns:
        For clear prompts, a fixed "no clarification needed" sentence. For
        ambiguous prompts, a sentence naming a heuristic reason followed by a
        suggested clarifying question.
    """
    if not is_ambiguous:
        return "This prompt is clear and specific. No clarification is needed."

    if _DEMONSTRATIVE_RE.search(text):
        # A demonstrative pronoun decides both the reason and the question.
        reason = "It contains demonstrative pronouns without clear referents. "
        question = "\"Could you specify what you're referring to in your prompt?\""
    else:
        if len(text.split()) < 5:
            reason = "It is too brief and lacks necessary context. "
        else:
            reason = "It lacks specific details needed for a clear response. "

        if _HOW_RE.search(text):
            question = "\"Could you provide more details about what you're trying to accomplish?\""
        else:
            question = "\"Could you provide more specific information about your request?\""

    return (
        "This prompt is ambiguous. "
        + reason
        + "A good clarifying question would be: "
        + question
    )


# Format for generative fine-tuning: one instruction/input/output record per row.
formatted_data = [
    {
        "instruction": INSTRUCTION,
        "input": text,
        "output": _build_output(text, label == 1),
    }
    for text, label in zip(df["text"], df["label"])
]

# Create a Hugging Face dataset
full_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

# Split into train and validation (10% held out); seeded for reproducibility.
gen_dataset = full_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [None]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint to fine-tune. This is the smallest FLAN-T5 size; other sizes
# (e.g. -base or -large) can be swapped in depending on available resources.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Prepare the inputs for the model
def preprocess_function(examples, max_input_length=512, max_target_length=512):
    """Tokenize a batch of instruction examples for seq2seq fine-tuning.

    Each prompt is rendered as "Instruction: {instruction}\\nInput: {input}"
    and tokenized as the encoder input; the "output" strings are tokenized as
    the labels.

    Sequences are truncated but deliberately NOT padded here. Padding every
    example to the maximum length makes each training step pay for the full
    length even when the text is only a few dozen tokens long. The
    DataCollatorForSeq2Seq used by the trainer pads each batch to its longest
    member and fills label padding with -100.

    Args:
        examples: Batch mapping with "instruction", "input" and "output"
            entries, each a list of strings (one item per example).
        max_input_length: Truncation limit, in tokens, for the prompt.
        max_target_length: Truncation limit, in tokens, for the target.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the target token ids.
    """
    # Format: "Instruction: {instruction}\nInput: {input}"
    prompts = [
        f"Instruction: {instruction}\nInput: {input_text}"
        for instruction, input_text in zip(examples["instruction"], examples["input"])
    ]

    model_inputs = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
    )

    # Set up the labels (decoder targets)
    targets = tokenizer(
        examples["output"],
        max_length=max_target_length,
        truncation=True,
    )

    # Defensive: if a pad id still shows up in the targets, replace it with
    # -100 so it is ignored in the loss calculation.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in token_ids]
        for token_ids in targets["input_ids"]
    ]

    return model_inputs

# Tokenize every split in batches. The raw text columns are removed so only the
# features produced by preprocess_function remain.
raw_columns = gen_dataset["train"].column_names
tokenized_datasets = gen_dataset.map(
    preprocess_function,
    remove_columns=raw_columns,
    batched=True,
)

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Training configuration: evaluate once per epoch, keep at most three
# checkpoints, log to TensorBoard.
# NOTE(review): fp16 is switched on whenever CUDA is available; T5-family
# models are reported to be numerically unstable in fp16 -- confirm the loss
# stays finite on GPU runs.
use_fp16 = torch.cuda.is_available()
training_args = Seq2SeqTrainingArguments(
    output_dir="./ambiguity-detection-model",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
    fp16=use_fp16,
    report_to="tensorboard",
)

# Wire the model, data splits, tokenizer and collator into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side.
final_model_dir = "./ambiguity-detection-model-final"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)



tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]



  0%|          | 0/57 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
