In [1]:
# !pip install kagglehub
# !pip install wandb



In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import kagglehub 
from sklearn.metrics import mean_squared_error, mean_absolute_error
import re
from nltk import word_tokenize

In [3]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("wandb_api_key") 

wandb.login(key=my_secret)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtheepob[0m ([33mtheepob-chulalongkorn-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
path = kagglehub.competition_download('nlp-2025-midterm-kaggle-asas')
df = pd.read_csv(path+'/train.csv')

df["text"] = df["question"] + " [SEP] " + df["answer"]

In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"], df["score"], test_size=0.2, random_state=42)
train_texts = df["text"]
train_labels = df["score"]

In [6]:
model_name = "airesearch/wangchanberta-base-att-spm-uncased"  # Thai transformer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

In [7]:
def tokenize_function(texts):
    return tokenizer(texts["text"], padding="max_length", truncation=True, max_length=256)

In [8]:
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

train_dataset = train_dataset.map(tokenize_function)
val_dataset = val_dataset.map(tokenize_function)

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

In [9]:
from datasets import Value

train_dataset = train_dataset.cast_column("label", Value("float32"))
val_dataset = val_dataset.cast_column("label", Value("float32"))

Casting the dataset:   0%|          | 0/362 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/73 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    run_name="score_prediction_run",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("start")
trainer.train()

start


Epoch,Training Loss,Validation Loss
1,No log,3.305556
2,No log,2.283731
3,No log,2.232065
4,No log,2.246105
5,No log,0.949166
6,No log,0.800766
7,No log,0.828536
8,No log,0.499457
9,No log,0.645906
10,No log,0.521263


TrainOutput(global_step=460, training_loss=1.8508554209833559, metrics={'train_runtime': 134.6005, 'train_samples_per_second': 26.894, 'train_steps_per_second': 3.418, 'total_flos': 476226734315520.0, 'train_loss': 1.8508554209833559, 'epoch': 10.0})

In [12]:
test_df = pd.read_csv(path+"/test.csv")


test_df["text"] = test_df["question"] + " [SEP] " + test_df["answer"]

In [13]:
test_encodings = tokenizer(list(test_df["text"]), padding="max_length", truncation=True, max_length=256, return_tensors="pt")

# Convert to tensor
test_inputs = {key: test_encodings[key] for key in test_encodings if key in ["input_ids", "attention_mask"]}

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
test_inputs = {key: test_inputs[key].to(device) for key in test_inputs}

In [15]:
model.eval()
with torch.no_grad():
    outputs = model(**test_inputs)
    predictions = outputs.logits.squeeze().cpu().numpy()

In [16]:
for i in range(len(predictions)):
    if(predictions[i]<0):
        predictions[i] = 0
    if(predictions[i]>5):
        predictions[i] = 5
    if(predictions[i]%1 >= 0.7):
        predictions[i] = np.ceil(predictions[i])
    if(predictions[i]%1 <= 0.1):
        predictions[i] = np.floor(predictions[i])
    

test_df["score"] = predictions

test_df = test_df[["ID","score"]]
test_df.to_csv('pred.csv', index=False)

In [17]:
print(predictions)

[3.27674    5.         1.3158408  5.         5.         4.
 4.         3.         5.         3.2774699  2.546417   5.
 0.4243345  0.39207804 3.         4.5289826  0.         2.6818724
 5.         0.6082124  4.1233726  5.         3.         3.
 5.         0.         0.58596814 4.388336   2.1272936  0.22516319
 0.         4.649349   4.         2.5875802  5.         0.4570518
 2.54733    4.409026   4.5199823  0.3075513  4.         1.6733953
 5.         1.         3.3135676  2.6075206  5.         3.
 3.         5.         2.         3.         5.         2.2906504
 5.         5.         5.         3.2911398  5.         0.29516318
 1.2865821  2.         1.         2.5253701  0.4096644  4.
 4.320124   5.         5.         5.         4.6281605  4.303789
 1.6291145  5.         2.389432   3.         2.638037   4.
 5.         5.         3.         4.         4.51411    4.1412764
 1.         2.         1.1119697  1.1106194  2.         5.        ]
