In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
!pip install peft transformers -q

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits and tag each row with its origin
# (train_test == 1 for training rows, 0 for test rows).
training = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv').assign(train_test=1)
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv').assign(train_test=0)

# Both splits stacked into one frame; the original row labels of each split are kept.
all_data = pd.concat(objs=[training, test])

print("Import Data Complete")

In [None]:
# Overview of the training frame: columns, dtypes and non-null counts.
training.info()

In [None]:
# Peek at the first 10 rows of the training frame.
training.head(10)

In [None]:
# Peek at the last 10 rows of the training frame.
training.tail(10)

In [None]:
# Duplicate-id check: number of rows minus number of distinct ids.
id_column = training["id"]
total_id = id_column.shape[0]
total_unique_id = id_column.drop_duplicates().shape[0]

print("Total number of 'id' duplicates:")
print(total_id - total_unique_id)

In [None]:
# Count missing cells across the whole training frame.
# `isnull` is pandas' alias for `isna`, so the two totals below are always equal;
# both are still computed and reported.
missing_per_column = training.isna().sum()
nan_count = missing_per_column.sum()

null_per_column = training.isnull().sum()
null_count = null_per_column.sum()

print('Number of NaN values:', nan_count)
print('Number of null values:', null_count)

In [None]:
# Check that model_a and model_b draw from the same pool of LLMs and list that pool.
LLM_a = training["model_a"].unique()
LLM_b = training["model_b"].unique()

total_unique_model_a = len(LLM_a)
total_unique_model_b = len(LLM_b)

print("Total number of 'model_a' and 'model_b' unique values:")
print('model_a =', total_unique_model_a)
print('model_b =', total_unique_model_b)

# The full set of LLMs is the UNION of both columns: an intersection would silently
# drop any model that only ever appears on one side. Sorting (by string form, so a
# stray non-string value cannot break the comparison) makes the output reproducible.
LLM = sorted(set(LLM_a) | set(LLM_b), key=str)

# Models present in exactly one of the two columns; an empty list means the two
# columns are consistent with each other.
LLM_one_sided = sorted(set(LLM_a) ^ set(LLM_b), key=str)

print("total number of LLMs =", len(LLM))
print('LLMs utilized:', LLM)
print("LLMs present in only one of model_a / model_b:", LLM_one_sided)

In [None]:
# Duplicate-prompt check: the same prompt may have been given to more than one pair of LLMs.
prompt_column = training["prompt"]
total_prompt = prompt_column.shape[0]
total_unique_prompt = prompt_column.nunique(dropna=False)

print("Total number of 'prompt' duplicates:")
print(total_prompt - total_unique_prompt)

# Recorded result: 57477 observations and 5743 duplicate prompts, with no duplicate ids.
# Repeated prompts are therefore acceptable and need no further data cleaning.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
#clean cells with text data
def preprocess_text(text):
    """Lower-case ``text``, strip digits and punctuation, and split it into word tokens.

    Parameters
    ----------
    text : str
        Raw text. Any value that is not a ``str`` (for example ``None`` or the
        float NaN that marks a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        Tokens produced by ``nltk.word_tokenize``; an empty list for non-string input.
    """
    # Missing cells have no .lower(); return "no tokens" instead of raising.
    if not isinstance(text, str):
        return []

    #convert text to lower case
    text = text.lower()
    #remove digits, then every character that is neither a word character nor whitespace
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    #tokenize the text
    return nltk.word_tokenize(text)

# Lazily filled cache of the English stop-word set, so the corpus is read only once
# instead of on every call.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used;
        it is loaded on first use and cached for later calls.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    return [word for word in text if word not in stop_words]


def lemmatization(text):
    """Lemmatize every token in ``text`` with NLTK's WordNet lemmatizer.

    ``text`` is an iterable of word tokens; the result is a list holding the
    lemma of each token, in the same order.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()

    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))

    return lemmas


def clean_text(text):
    """Run the whole cleaning pipeline on one text and return it as a single string.

    Stages, in order: ``preprocess_text`` (lower-casing, digit and punctuation
    removal, tokenisation), ``remove_stopwords``, ``lemmatization``. The
    resulting tokens are joined back together with single spaces.
    """
    tokens = preprocess_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatization(tokens)

    return ' '.join(tokens)

# Clean every free-text column of the training frame in place.
for text_column in ("prompt", "response_a", "response_b"):
    training[text_column] = training[text_column].apply(clean_text)


print("Clean Cells with Text Data Complete")

In [None]:
import matplotlib.pyplot as plt

# Bar chart: how often each LLM appears in the "model_a" column.
result_model_a = training["model_a"].value_counts()
print("model a:", result_model_a)

## Matplotlib barchart:
print("-----")
print("Matplotlib barchart, model a:")

barWidth = 0.45
label_style = {"fontweight": "bold", "fontsize": 15}

plt.figure(figsize=(15, 7))
plt.bar(result_model_a.index, result_model_a.values, barWidth, color='r')

plt.xlabel('LLMs', **label_style)
plt.ylabel('Counts', **label_style)
plt.title('LLMs Value Counts - model a', **label_style)

# Model names are long, so the tick labels are drawn vertically.
plt.xticks(rotation=90)

plt.show()

In [None]:
# Bar chart: how often each LLM appears in the "model_b" column.
result_model_b = training["model_b"].value_counts()
print("model b:", result_model_b)

## Matplotlib barchart:
print("-----")
print("Matplotlib barchart, model b:")

barWidth = 0.45
label_style = {"fontweight": "bold", "fontsize": 15}

plt.figure(figsize=(15, 7))
plt.bar(result_model_b.index, result_model_b.values, barWidth, color='g')

plt.xlabel('LLMs', **label_style)
plt.ylabel('Counts', **label_style)
plt.title('LLMs Value Counts - model b', **label_style)

# Model names are long, so the tick labels are drawn vertically.
plt.xticks(rotation=90)

plt.show()

In [None]:
#barchart - model winner: winner model a, winner model b or winner tie
def which_winner(value):
    """Collapse one row's one-hot winner columns into a single class label.

    Parameters
    ----------
    value : mapping
        A row (e.g. a pandas Series or a dict) exposing the keys
        ``"winner_model_a"``, ``"winner_model_b"`` and ``"winner_tie"``.

    Returns
    -------
    int or None
        ``0`` if model a won, ``1`` if model b won, ``2`` for a tie, and
        ``None`` when none of the three flags equals 1. The flags are checked
        in that order, so the first one set decides the label.

    The input row is only read, never modified.
    """
    if value["winner_model_a"] == 1:
        #winner model a
        return 0
    if value["winner_model_b"] == 1:
        #winner model b
        return 1
    if value["winner_tie"] == 1:
        #winner tie
        return 2
    return None

# Single label per row, derived from the three one-hot winner columns.
training["winner"] = training.apply(which_winner, axis=1)

# Readable name for each label, used for the counts and the chart below.
training["winner_model"] = training["winner"].astype(str)
for label_code, label_name in (("0", "model a"), ("1", "model b"), ("2", "winner tie")):
    training.loc[training["winner_model"] == label_code, "winner_model"] = label_name

result_model_winner = training["winner_model"].value_counts()
print("model winner:", result_model_winner)

print("-----")
print("Matplotlib barchart, model winner:")

barWidth = 0.75
label_style = {"fontweight": "bold", "fontsize": 15}

plt.figure(figsize=(8, 7))
plt.bar(result_model_winner.index, result_model_winner.values, barWidth, color='b')

plt.xlabel('Model winner', **label_style)
plt.ylabel('Counts', **label_style)
plt.title('LLMs Value Counts - model winner', **label_style)

plt.show()

In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, LoraConfig, TaskType
from datasets import Dataset

# NUM_LABELS is the number of winner classes: (0: model a, 1: model b, 2: winner tie)
NUM_LABELS = 3
# Path to the DeBERTa base model (pre-trained weights)
BASE_MODEL_PATH = "/kaggle/input/deberta-v3-base-for-llm-comp/deberta-v3-base-local"
# Path to the LoRA adapter weights (lightweight fine-tuned weights)
LORA_WEIGHTS_PATH = "/kaggle/input/deberta-lora-weight-2/deberta_lora_weights"

# Model input: prompt and both responses in one string, separated by " [SEP] ".
train_df = training.copy()
train_df["text"] = (
    train_df["prompt"]
    + " [SEP] " + train_df["response_a"]
    + " [SEP] " + train_df["response_b"]
)
train_df["labels"] = train_df["winner"].astype(int)

train_columns = ['text', 'labels', 'id']
hf_train_dataset = Dataset.from_pandas(train_df[train_columns])

# Upper bound on tokens per example; longer inputs are truncated when tokenised.
MAX_LENGTH = 512
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

def tokenize_function(examples):
    """Tokenise the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Inputs longer than ``MAX_LENGTH`` tokens are truncated; no padding is
    requested here. Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        truncation=True,
    )
    return encoded

# Tokenise the training set in batches; the raw text column is dropped afterwards.
# NOTE(review): tokenized_train_dataset is not referenced by any later cell visible
# in this notebook — confirm whether this (potentially slow) step is still needed.
tokenized_train_dataset = hf_train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)
tokenized_train_dataset.set_format("torch")

print("DeBERTa 데이터 토큰화 완료")

# Base classifier with a NUM_LABELS-way head; size mismatches against the stored
# checkpoint are tolerated instead of raising.
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_PATH,
    ignore_mismatched_sizes=True,
    num_labels=NUM_LABELS,
)

# Attach the fine-tuned LoRA adapter and switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, LORA_WEIGHTS_PATH)
model.eval()

print("LoRA DeBERTa 모델 로드 및 추론 모드 설정 완료")

In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
import os
from transformers import DataCollatorWithPadding

# Environment switches for the tokenizer, OpenMP and Weights & Biases, plus
# in-process data loading (no worker subprocesses).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["WANDB_DISABLED"] = "true"
DATALOADER_NUM_WORKERS = 0

# Build the model input for the test set the same way as for training:
# prompt and both responses joined with " [SEP] ".
# NOTE(review): the training text columns are passed through clean_text earlier in
# this notebook, while the test text is used raw here — confirm this matches how the
# LoRA adapter was trained.
test_df = test.copy()
test_df["text"] = test_df["prompt"] + " [SEP] " + test_df["response_a"] + " [SEP] " + test_df["response_b"]

hf_test_dataset = Dataset.from_pandas(test_df[['text', 'id']])
tokenized_test_dataset = hf_test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_test_dataset.set_format("torch")

# Inference with Trainer (model prediction)
INFERENCE_BATCH_SIZE = 128
# Half-precision evaluation is only supported on CUDA devices; requesting it on a
# CPU-only session makes TrainingArguments raise, so enable it only when a GPU exists.
USE_FP16 = torch.cuda.is_available()
inference_args = TrainingArguments(
    output_dir="./deberta_inference_results",
    per_device_eval_batch_size=INFERENCE_BATCH_SIZE,
    dataloader_num_workers=DATALOADER_NUM_WORKERS,
    fp16=USE_FP16,
    # Explicitly disable every logging integration (wandb, tensorboard, ...).
    report_to="none",
)
# NOTE(review): Trainer is expected to move the model to its own device afterwards,
# so the .to('cpu') here presumably has no lasting effect — confirm.
model = model.to('cpu').eval()

# The examples were tokenised without padding, so each batch is padded dynamically.
# Passing the collator explicitly gives the same padding the Trainer derives from a
# tokenizer, without relying on the `tokenizer=` keyword of Trainer.
trainer = Trainer(
    model=model,
    args=inference_args,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# Run inference (equivalent to predict_proba)
raw_predictions = trainer.predict(tokenized_test_dataset)

# Extract probabilities (convert the logits to probabilities with softmax)
logits = raw_predictions.predictions
probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

# One probability row per test example, otherwise ids and predictions would be misaligned.
assert probabilities.shape[0] == len(test_df), "prediction count does not match the test set"

value_test_y_probabilities = probabilities

print('Model winner prediction, probability', value_test_y_probabilities)

# Column order follows the label encoding: 0 = model a, 1 = model b, 2 = tie.
output = pd.DataFrame({
    'id': test_df.id.values,
    'winner_model_a': value_test_y_probabilities[:, 0],
    'winner_model_b': value_test_y_probabilities[:, 1],
    'winner_tie': value_test_y_probabilities[:, 2]
})

output.to_csv('submission.csv', index=False)

print("DeBERTa LoRA 추론 및 Submission 파일 생성 완료")