In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["WANDB_DISABLED"] = "true"


In [2]:
import json
import pandas as pd
from tqdm import tqdm
import datasets
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch
import numpy as np

E0000 00:00:1752902076.214484      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752902076.277487      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def load_jsonl(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

In [4]:
def to_prompt(example):
    options_text = "\n".join([f"{key}. {val}" for key, val in example['options'].items()])
    return f"Question: {example['question']}\nOptions:\n{options_text}\nAnswer:"

In [5]:
def prepare_dataset(file_path):
    examples = load_jsonl(file_path)
    return Dataset.from_list([
        {
            "id": ex["id"],
            "prompt": to_prompt(ex),
            "completion": ex["answer"]
        }
        for ex in examples
    ])

In [6]:
train_easy = prepare_dataset("/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/train_easy.jsonl")
train_medium = prepare_dataset("/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/train_medium.jsonl")


In [7]:

train_dataset = datasets.concatenate_datasets([train_easy, train_medium])
train_dataset = train_dataset.shuffle(seed=42).select(range(30000))###########


In [8]:
model_id = "/kaggle/input/phi-3-5-mini-instruct/phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)


In [9]:
# --- Improved tokenization with label masking ---
def tokenize(example):
    prompt = example['prompt']
    completion = example['completion']

    prompt_ids = tokenizer(prompt, truncation=True, max_length=512, add_special_tokens=False)
    completion_ids = tokenizer(completion, truncation=True, max_length=16, add_special_tokens=False)

    input_ids = prompt_ids['input_ids'] + completion_ids['input_ids']
    attention_mask = [1] * len(input_ids)
    # Mask prompt tokens in the label
    labels = [-100] * len(prompt_ids['input_ids']) + completion_ids['input_ids']
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [10]:
tokenized_dataset = train_dataset.map(tokenize, remove_columns=train_dataset.column_names)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [11]:
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [12]:
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.float16)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

In [14]:
training_args = TrainingArguments(
    output_dir="./llm-checkpoints",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_dir="./logs",
    save_steps=1000,
    save_total_limit=2,
    fp16=True,
    logging_steps=10,
    report_to="none",
    gradient_accumulation_steps=8,
    seed=42,  # for reproducibility
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=collator
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Step,Training Loss
10,1.4444
20,1.2967
30,1.1175
40,1.1014
50,1.0211
60,1.046
70,1.0493
80,0.9856
90,1.0051
100,1.038


In [None]:
def predict_and_save(test_path, output_csv, max_choices=4):
    examples = load_jsonl(test_path)
    results = []
    model.eval()

    for ex in tqdm(examples):
        prompt = to_prompt(ex)
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            outputs = model.generate(input_ids, max_new_tokens=1)
        pred_token = tokenizer.decode(outputs[0][-1], skip_special_tokens=True).strip().upper()

        # Robust fallback: match by key or value, try to recover
        chosen = None
        for k, v in ex["options"].items():
            if pred_token == k.upper() or pred_token == v.upper():
                chosen = k
                break
        if chosen is None:
            # Try partial match
            for k, v in ex["options"].items():
                if pred_token in k.upper() or pred_token in v.upper():
                    chosen = k
                    break
        if chosen is None:
            chosen = list(ex["options"].keys())[0]

        results.append({"id": ex["id"], "answer": chosen})
        pd.DataFrame(results).to_csv(output_csv, index=False)

In [None]:
predict_and_save("/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/test_easy.jsonl", "pred_easy.csv", max_choices=4)
predict_and_save("/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/test_medium.jsonl", "pred_medium.csv", max_choices=4)
predict_and_save("/kaggle/input/goedel-machines-x-iitm-clinical-llm-challenge/test_hard.jsonl", "pred_hard.csv", max_choices=10)


In [None]:
pred_easy = pd.read_csv("pred_easy.csv")
pred_medium = pd.read_csv("pred_medium.csv")
pred_hard = pd.read_csv("pred_hard.csv")

In [None]:
submission = pd.concat([pred_easy, pred_medium, pred_hard])
submission.to_csv("submission.csv", index=False)
print("submission.csv created with", len(submission), "entries.")