In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: run `nvidia-smi` and show its output in the cell.
!nvidia-smi

In [None]:
# import pandas as pd

In [None]:
# List the entries of the current working directory (pure Python, so it does not
# depend on any IPython alias being defined).
sorted(os.listdir())

In [None]:
# Read the competition's train.csv from the Kaggle input directory into `df`.
df= pd.read_csv('../input/map-charting-student-math-misunderstandings/train.csv')

In [None]:
# Preview the first three rows; a bare last expression uses the notebook's rich table display.
df.head(3)

In [None]:
# Collect the distinct misconception labels (in first-seen order) from rows whose
# Category text contains "Misconception". Vectorised instead of a per-row
# `df.iloc[i]` + `list.count` scan, which was quadratic in the number of rows.
# Missing Category values are treated as "no match", and missing labels are dropped
# so the later `isin(misconceptions)` filter cannot match every NaN row.
has_misconception = df['Category'].str.contains('Misconception', regex=False, na=False)
misconceptions = (
    df.loc[has_misconception, 'Misconception']
    .dropna()
    .drop_duplicates()
    .tolist()
)
print(len(misconceptions))

In [None]:
# Keep only the rows whose Misconception value is one of the labels collected above in `misconceptions`.
df_with_misconceptions = df[df['Misconception'].isin(misconceptions)]

In [None]:
# print(df_with_misconceptions.head(3))
# print(len(df_with_misconceptions))

In [None]:
# df_with_misconceptions.to_csv('train_with_misconceptions.csv')

In [None]:
!pip install -q transformers datasets accelerate peft bitsandbytes trl

In [None]:
!pip install tensorboard

In [None]:
import transformers
import peft
import trl

# Report the installed version of each fine-tuning library.
for _label, _module in (("Transformers", transformers), ("PEFT", peft), ("TRL", trl)):
    print(f"{_label} version: {_module.__version__}")

In [None]:
import torch

# Report whether CUDA is usable, how many devices torch sees, and the name of device 0.
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)
print("CUDA Device Count:", torch.cuda.device_count())

if not cuda_ready:
    print("No GPU detected!")
else:
    print("GPU Name:", torch.cuda.get_device_name(0))

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import numpy as np


In [None]:
def preprocess_function(examples, tokenizer, max_length=1024):
    """Build one yes/no prompt per example and tokenize the whole batch.

    Each prompt is delimited with ``<|im_start|>`` / ``<|im_end|>`` markers and ends
    with an open ``assistant`` turn, asking whether the student's explanation shows
    the row's misconception.

    Args:
        examples: Column-oriented batch (column name -> list of values) providing
            ``QuestionText``, ``MC_Answer``, ``StudentExplanation``, ``Category``
            and ``Misconception``.
        tokenizer: Callable invoked as
            ``tokenizer(prompts, max_length=max_length, truncation=True)``.
        max_length: Length limit forwarded to the tokenizer. Defaults to 1024,
            the value that used to be hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the list of prompts.

    NOTE(review): with ``truncation=True`` an over-long prompt may lose its trailing
    ``<|im_start|>assistant`` header, depending on the tokenizer's truncation side —
    confirm prompts stay below ``max_length``.
    """
    # The system turn is identical for every example, so build it once.
    # The trailing spaces before two of the newlines are part of the original prompt text.
    system_turn = (
        '<|im_start|>system\n'
        'You are an expert educational psychologist and domain expert. \n'
        "Given a multiple-choice question, the student's chosen answer, and the student's explanation, \n"
        "explain step-by-step (Chain-of-Thought) whether the provided candidate misconception matches the student's explanation.\n"
        'Show your reasoning, then give a final answer on the last line as exactly one word: "Yes" or "No".<|im_end|>\n'
    )

    prompts = []
    for q_text, mc_answer, explanation, category, misconception in zip(
        examples['QuestionText'],
        examples['MC_Answer'],
        examples['StudentExplanation'],
        examples['Category'],
        examples['Misconception'],
    ):
        user_turn = (
            '<|im_start|>user\n'
            f"Question: {q_text}\n"
            f"Student's Answer: {mc_answer}\n"
            f"Student's Explanation: {explanation}\n"
            "\n"
            f"Does the explanation above demonstrate the misconception '{misconception}' "
            f"from the category '{category}'?<|im_end|>\n"
        )
        prompts.append(system_turn + user_turn + '<|im_start|>assistant\n')

    return tokenizer(prompts, max_length=max_length, truncation=True)


In [None]:
@dataclass
class ContrastiveDataCollator:
    """Collate tokenized prompts into a padded ``[originals | partners]`` batch.

    For a batch of ``n > 1`` features the output holds ``2 * n`` rows: the ``n``
    features in their given order, followed by one randomly chosen *other* feature
    of the same batch for each of them. Only ``input_ids`` are forwarded to
    ``tokenizer.pad`` in that case. A batch of zero or one feature is padded and
    returned unchanged.

    NOTE(review): the second half consists of other examples' own, unmodified
    prompts — it does not pair an explanation with a different misconception.
    Confirm this is the intended notion of a "negative" for the contrastive loss.
    """

    # Only its ``pad`` method is used here.
    tokenizer: AutoTokenizer

    def __call__(self, features: List[Dict[str, any]]) -> Dict[str, any]:
        count = len(features)

        # Nothing to contrast against: pad what we were given and return it.
        if count <= 1:
            return self.tokenizer.pad(features, padding=True, return_tensors="pt")

        # A random offset in [1, count - 1] guarantees the partner is never the example itself.
        partners = []
        for idx in range(count):
            shift = np.random.randint(1, count)
            partners.append(features[(idx + shift) % count])

        stacked_ids = [item["input_ids"] for item in features + partners]
        return self.tokenizer.pad(
            {"input_ids": stacked_ids},
            padding=True,
            return_tensors="pt",
        )

In [None]:
class ContrastiveTrainer(Trainer):
    """Trainer whose loss is a pairwise hinge on "Yes" minus "No" next-token logits.

    The batch is expected to be laid out as ``[positives | negatives]`` with both
    halves the same size (this is what ``ContrastiveDataCollator`` produces).

    Args:
        yes_token_id: Vocabulary index whose logit counts as the "Yes" score.
        no_token_id: Vocabulary index whose logit counts as the "No" score.
        margin: Hinge margin; defaults to 0.5.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, yes_token_id, no_token_id, margin=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.yes_token_id = yes_token_id
        self.no_token_id = no_token_id
        self.margin = margin

    # override
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return ``mean(max(0, margin - pos + neg))`` over the batch's pairs.

        ``pos`` / ``neg`` are the (Yes - No) logit differences at the last real
        token of each row in the first / second half of the batch. Extra keyword
        arguments passed by the training loop are accepted and ignored.

        Returns:
            ``loss`` or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        logits = outputs.logits
        num_rows, seq_len = logits.shape[0], logits.shape[1]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to the final position of every row.
            last_token_logits = logits[:, -1, :]
        else:
            # Pick the last attended position per row. With right padding the final
            # column of a shorter row is a pad token, so `logits[:, -1]` would score
            # padding instead of the end of the prompt. This works for either padding side.
            positions = torch.arange(seq_len, device=attention_mask.device)
            last_positions = (attention_mask.long() * positions).max(dim=1).values
            row_ids = torch.arange(num_rows, device=logits.device)
            last_token_logits = logits[row_ids, last_positions.to(logits.device)]

        scores = last_token_logits[:, self.yes_token_id] - last_token_logits[:, self.no_token_id]

        # batch = pos + neg
        num_pairs = num_rows // 2
        if num_pairs == 0:
            # A one-row batch (e.g. the odd final batch) has no pair to rank. The
            # parent implementation cannot be used because the inputs carry no labels,
            # so contribute a zero loss that is still attached to the graph.
            loss = scores.sum() * 0.0
        else:
            positive_scores = scores[:num_pairs]
            negative_scores = scores[num_pairs:2 * num_pairs]
            # hinge loss
            loss = torch.clamp(self.margin - positive_scores + negative_scores, min=0).mean()

        return (loss, outputs) if return_outputs else loss

In [None]:
def train_ranker(df, model_name="Qwen/Qwen2.5-7B-Instruct", output_dir="./qwen2-7b-ranker-finetuned"):
    """Fine-tune LoRA adapters on a 4-bit causal LM with ``ContrastiveTrainer``.

    Steps: load the tokenizer, look up the first token ids of "Yes" and "No",
    turn ``df`` into tokenized prompts via ``preprocess_function``, load the
    quantized model, attach LoRA adapters, train for one epoch and save the
    adapters under ``<output_dir>/final_adapters``.

    Args:
        df: DataFrame with the columns read by ``preprocess_function``.
        model_name: Checkpoint passed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the previously hard-coded value.
        output_dir: Directory for trainer output and the final adapters.
            Defaults to the previously hard-coded value.

    Returns:
        None.
    """
    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # The loss scores the logits at the final sequence position, so padding must go
    # on the left; otherwise shorter prompts in a batch end in pad tokens.
    tokenizer.padding_side = "left"

    # yes,no token id (first sub-token of each word)
    yes_token_id = tokenizer.encode("Yes", add_special_tokens=False)[0]
    no_token_id = tokenizer.encode("No", add_special_tokens=False)[0]
    print(f"Token ID for 'Yes': {yes_token_id}, 'No': {no_token_id}")

    # load dataset
    raw_dataset = Dataset.from_pandas(df)

    tokenized_dataset = raw_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer),
        batched=True,
        remove_columns=raw_dataset.column_names,  # drop the original columns
    )

    # load model with LoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to
    # run; confirm it is actually required for this model before keeping it.
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_config, device_map="auto", trust_remote_code=True
    )

    # config PEFT(LoRA)
    model = prepare_model_for_kbit_training(model)
    peft_config = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False  # turn the cache flag off for training

    # conf training
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,  # batch size=2 (the collator doubles it)
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        num_train_epochs=1,
        logging_steps=10,
        save_strategy="epoch",
        fp16=True,
        dataloader_num_workers=0,
        # A trailing batch with a single example cannot form a positive/negative
        # pair, so drop the incomplete last batch instead of feeding it to the loss.
        dataloader_drop_last=True,
        report_to="tensorboard",
    )

    # initialize datacollator and trainer
    data_collator = ContrastiveDataCollator(tokenizer=tokenizer)

    trainer = ContrastiveTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        yes_token_id=yes_token_id,
        no_token_id=no_token_id,
    )

    # start training
    print("Start fine-tuning với Contrastive Loss...\n")
    trainer.train()
    print("Training completed.\n")

    # save model
    final_adapter_dir = f"{output_dir}/final_adapters"
    trainer.save_model(final_adapter_dir)
    print(f"Adapters saved at: {final_adapter_dir}")



In [None]:
# Run the fine-tuning on the rows that carry a misconception label.
train_ranker(df_with_misconceptions)

In [None]:
# Zip the saved adapter directory and show a link to the archive in the cell output.
!zip -r qwen2_7b_ranker_finetuned.zip /kaggle/working/qwen2-7b-ranker-finetuned/final_adapters
from IPython.display import FileLink
FileLink('qwen2_7b_ranker_finetuned.zip')