# Change prompts here

In [6]:
# Check what resources are available
import torch
import os
import platform


# Report the compute resources visible to this kernel: the CUDA devices when
# there are any, otherwise the host CPU and operating system.
if not torch.cuda.is_available():
    print("No GPUs available. Using CPU resources.")
    print(f"CPU: {platform.processor()} ({os.cpu_count()} cores)")
    print(f"Operating System: {platform.system()} {platform.release()}")
else:
    gpu_count = torch.cuda.device_count()
    print("GPUs are available!")
    print(f"Number of GPUs: {gpu_count}")
    for gpu_index in range(gpu_count):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")



No GPUs available. Using CPU resources.
CPU: x86_64 (12 cores)
Operating System: Linux 5.15.0-112-generic


In [7]:

# Prompt variants for the redaction experiment. Each entry is a
# (variant label, prompt text) pair; ids are assigned 1..N in listing order.
prompts = [
    {"id": prompt_id, "type": variant, "text": text}
    for prompt_id, (variant, text) in enumerate(
        [
            ("Full Prompt", "What are potential diagnoses for a 22-year-old male with persistent chest pain, night sweats, and weight loss?"),
            ("Redacted Age", "What are potential diagnoses for a male with persistent chest pain, night sweats, and weight loss?"),
            ("Redacted Gender", "What are potential diagnoses for a 22-year-old patient with persistent chest pain, night sweats, and weight loss?"),
            ("Fully Redacted", "What are potential diagnoses for a patient with persistent chest pain, night sweats, and weight loss?"),
        ],
        start=1,
    )
]

# Additive Variant
# prompts = [
#     {"id": 1, "type": "Full Prompt", "text": "Male, age 22, Caucasian, American, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"},
#     {"id": 2, "type": "Redacted Age", "text": "Male, Caucasian, American, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"},
#     {"id": 3, "type": "Redacted Gender", "text": "Age 22, Caucasian, American, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"},
#     {"id": 4, "type": "Redacted Race", "text": "Male, age 22, American, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"},
#     {"id": 5, "type": "Redacted Nationality", "text": "Male, age 22, Caucasian, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"},
#     {"id": 6, "type": "Additive Variant", "text": "Female, age 35, Hispanic, Canadian, symptoms: cough, fever, fatigue. What is this person likely to be diagnosed with?"}
# ]



In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

def generate_and_save_outputs(model_name, prompts, output_csv, num_samples=50, max_length=300, temperature=0.7, top_k=50, top_p=0.95, seed=None):
    """Sample several generations per prompt from a Hugging Face model and save them to CSV.

    The tokenizer and model are loaded from ``model_name``. Names containing
    "flan" (case-insensitive) are loaded as sequence-to-sequence models; all
    others are loaded as causal language models. Each prompt is tokenized once
    and then sampled ``num_samples`` times.

    Args:
        model_name: Hub identifier or local path passed to ``from_pretrained``.
        prompts: Iterable of dicts with the keys ``"id"``, ``"type"`` and ``"text"``.
        output_csv: Path of the CSV file to write.
        num_samples: Number of independent samples drawn per prompt.
        max_length: Forwarded to ``generate`` as ``max_length``. Per the
            transformers documentation this is the total sequence length, so
            for decoder-only models it includes the prompt tokens.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus (top-p) sampling cutoff.
        seed: Optional integer. When given, the torch RNG is seeded before
            sampling so repeated runs can reproduce their outputs. ``None``
            (the default) leaves the RNG state untouched.

    Returns:
        None. The CSV has one row per (prompt, sample) with the columns
        "Prompt ID", "Prompt Type", "Input Prompt", "Model Output", "Sample ID".
        The header row is written even when ``prompts`` is empty.
    """
    if seed is not None:
        # Imported locally so this cell does not depend on another cell having imported torch.
        import torch
        torch.manual_seed(seed)

    print(f"Loading model and tokenizer for {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if "flan" in model_name.lower():
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)
    print("Model and tokenizer loaded successfully!")

    # Prefer the tokenizer's own pad token (T5 defines one); fall back to EOS
    # only for tokenizers that have no pad token (e.g. the GPT-2 family).
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    columns = ["Prompt ID", "Prompt Type", "Input Prompt", "Model Output", "Sample ID"]
    results = []
    for prompt in prompts:
        # Tokenize once per prompt; only the sampling is repeated.
        inputs = tokenizer(prompt["text"], return_tensors="pt")
        for sample_id in range(1, num_samples + 1):
            # `early_stopping` is deliberately not passed: it only controls
            # beam-search termination and has no effect when sampling.
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                no_repeat_ngram_size=3,  # Prevent repetitive phrases
                pad_token_id=pad_token_id,
            )
            decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            results.append({
                "Prompt ID": prompt["id"],
                "Prompt Type": prompt["type"],
                "Input Prompt": prompt["text"],
                "Model Output": decoded_output,
                "Sample ID": sample_id
            })

    # Explicit columns keep the CSV schema stable even when there are no rows.
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"Results saved to: {output_csv}")


## Inference scripts for three models

In [9]:
# Run Flan-T5-Small over every prompt variant and store the samples as CSV.
model_name, output_csv = "google/flan-t5-small", "flan_t5_small_outputs.csv"

generate_and_save_outputs(model_name, prompts, output_csv)


Loading model and tokenizer for google/flan-t5-small...
Model and tokenizer loaded successfully!
Results saved to: flan_t5_small_outputs.csv


In [4]:
# Run DistilGPT-2 over every prompt variant and store the samples as CSV.
model_name, output_csv = "distilgpt2", "distilgpt2_outputs.csv"

generate_and_save_outputs(model_name, prompts, output_csv)


Loading model and tokenizer for distilgpt2...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model and tokenizer loaded successfully!


KeyboardInterrupt: 

In [9]:
# Run GPT-Neo (125M) over every prompt variant and store the samples as CSV.
model_name, output_csv = "EleutherAI/gpt-neo-125M", "gpt_neo_125m_outputs.csv"

generate_and_save_outputs(model_name, prompts, output_csv)


Loading model and tokenizer for EleutherAI/gpt-neo-125M...
Model and tokenizer loaded successfully!
Results saved to: gpt_neo_125m_outputs.csv
