In [1]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

2025-03-16 15:06:04.064366: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-16 15:06:04.071293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742112364.079576    8317 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742112364.082068    8317 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-16 15:06:04.090734: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Hub identifier of the pretrained checkpoint; the same name is used below to
# load both the tokenizer and the sequence-classification model.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
# Notebook-level tokenizer, read as a global by `tokenize_function`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
def clean_text(text):
    """Return `text` with leading and trailing whitespace removed.

    Missing values are mapped to an empty string instead of raising: an empty
    CSV cell is loaded by pandas as a float NaN, which has no `.strip()`.
    Any other non-string value is converted with `str()` before stripping.

    Args:
        text: The raw answer value (normally a `str`).

    Returns:
        The stripped string, or "" when `text` is None or NaN.
    """
    if text is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(text, float) and text != text:
        return ""
    return str(text).strip()

def read_dataset(qset, data_dir="./dataset/processed", merge_val_into_train=True):
    """Load the train/valid/test CSV files for one question set.

    Reads `train_Q{qset}.csv`, `valid_Q{qset}.csv` and `test_Q{qset}.csv` from
    `data_dir`. The 'answer' column is cleaned with `clean_text`, the 'score'
    column is cast to float32, and the test file's 'ID' column is returned
    unchanged.

    Args:
        qset: Question-set identifier interpolated into the file names.
        data_dir: Directory holding the processed CSV files.
        merge_val_into_train: When True (the default, matching the previous
            behavior) the validation rows are appended to the training rows.
            In that case the validation split is also part of the training
            data, so a validation loss computed on it is not a held-out
            estimate. Pass False to keep the two splits disjoint.

    Returns:
        A tuple `(train_texts, train_labels, val_texts, val_labels,
        test_texts, test_ids)` of pandas Series.
    """
    train_df = pd.read_csv(f"{data_dir}/train_Q{qset}.csv")
    test_df = pd.read_csv(f"{data_dir}/test_Q{qset}.csv")
    val_df = pd.read_csv(f"{data_dir}/valid_Q{qset}.csv")

    train_texts = train_df['answer'].apply(clean_text)
    train_labels = train_df['score'].astype(np.float32)
    val_texts = val_df['answer'].apply(clean_text)
    val_labels = val_df['score'].astype(np.float32)
    test_texts = test_df['answer'].apply(clean_text)
    test_ids = test_df['ID']

    if merge_val_into_train:
        # ignore_index=True gives the merged Series a fresh 0..n-1 index; without
        # it the labels of both source frames are kept side by side, so the same
        # index label can refer to a training row and a validation row.
        train_texts = pd.concat([train_texts, val_texts], ignore_index=True)
        train_labels = pd.concat([train_labels, val_labels], ignore_index=True)

    return train_texts, train_labels, val_texts, val_labels, test_texts, test_ids

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly 505 tokens.

    Args:
        examples: Mapping with a "text" entry (a batch when used with
            `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoding_options = dict(padding="max_length", truncation=True, max_length=505)
    return tokenizer(texts, **encoding_options)

# Select Question Set

In [33]:
# Question set processed by this run; also used for the export file name later.
question_set = 4

(
    train_texts,
    train_labels,
    val_texts,
    val_labels,
    test_texts,
    test_ids,
) = read_dataset(question_set)

# Build each split and tokenize it right away. Train and validation carry the
# regression target under "label"; the test split carries the row "ID" instead.
train_dataset = Dataset.from_dict(
    {"text": train_texts, "label": train_labels}
).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict(
    {"text": val_texts, "label": val_labels}
).map(tokenize_function, batched=True)
test_dataset = Dataset.from_dict(
    {"text": test_texts, "ID": test_ids}
).map(tokenize_function, batched=True)

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [6]:
# Sanity check: vocabulary size, special-token ids, and one tokenized training
# example, each printed on its own line.
print(
    tokenizer.vocab_size,
    tokenizer.all_special_ids,
    train_dataset[34],
    sep="\n",
)

25005
[5, 6, 3, 1, 25004, 0, 2, 8]
{'text': 'เพราะการนำ A/B testing มาใช้โดยให้ A test เป็นการวางตำแหน่งของโฆษณาไว้ที่เดิม และ B test เป็นการเปลี่ยนตำแหน่งของโฆษณา นั้นจะช่วยให้สามารถนำมาเปรียบเทียบกันได้ว่าการ test ทั้งสองแบบที่ผ่าน randomness มาแบบเดียวกันนั้น แบบใดสามารถเพิ่มจำนวนการคลิกโฆษณาของผู้ใช้ได้มากกว่ากัน', 'label': 1.0, 'input_ids': [5, 474, 4166, 10, 3, 101, 3, 10, 14152, 919, 10, 2530, 4537, 10, 3, 10, 14152, 11764, 727, 340, 16, 2153, 4603, 543, 222, 10, 3, 10, 14152, 10, 17, 1509, 340, 16, 2153, 10, 72, 7009, 6978, 26, 2437, 17559, 2275, 24, 10, 14152, 10, 859, 84, 9180, 10, 12118, 21191, 14141, 10, 26, 84, 276, 72, 10, 84, 609, 151, 286, 207, 24, 4475, 2153, 19038, 8124, 67, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [40]:
# alll = []
# for i in range(len(train_dataset)):
#     for j in range(len(train_dataset[i]['input_ids'])):
#         alll.append(train_dataset[i]['input_ids'][j])
        # if train_dataset[i]['input_ids'][j] >= 25005:
        #     print(i, j)
            # break



In [41]:
# print(np.unique(np.array(alll)))
# print(len(np.unique(np.array(alll))))

In [42]:
# for i in np.unique(np.array(alll)):
#     model(torch.tensor([[i]], device='cuda'))

# Train Model

In [36]:
# The classification head is not part of the checkpoint and is randomly
# initialized on load (see the "newly initialized" warning this cell prints),
# so seed torch first to make that initialization repeatable across runs.
torch.manual_seed(42)

# num_labels=1 requests a single-output head; the predictions printed later are
# one float per example.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Use the GPU when one is available instead of failing on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Hyperparameters for the fine-tuning run, collected in one place.
# NOTE(review): with save_strategy="no" and no load_best_model_at_end,
# metric_for_best_model presumably does not change which weights are kept
# (the model after the last epoch is used) — confirm.
training_kwargs = dict(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,  # 10 -> 20 -> 20
    run_name="run",
    metric_for_best_model="eval_loss",
    lr_scheduler_type="cosine",
)
training_args = TrainingArguments(**training_kwargs)

# Evaluation runs on val_dataset at the end of every epoch (eval_strategy="epoch").
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Kept as the last expression so the notebook displays the returned TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.272119
2,No log,0.327013
3,No log,0.082587
4,No log,0.228686
5,No log,0.066508
6,No log,0.117042
7,No log,0.078869
8,No log,0.053803
9,No log,0.083976
10,No log,0.163678


TrainOutput(global_step=60, training_loss=0.10666704972585042, metrics={'train_runtime': 28.7782, 'train_samples_per_second': 63.242, 'train_steps_per_second': 2.085, 'total_flos': 472310936970600.0, 'train_loss': 0.10666704972585042, 'epoch': 20.0})

In [42]:
# Switch the model to evaluation mode, then run the trainer over the test split.
# The result's `.predictions` attribute is what the following cells print and export.
model.eval()
predictions = trainer.predict(test_dataset)

In [43]:
# Show the test IDs followed by the raw (unclipped) model outputs.
for shown in (test_dataset['ID'], predictions.predictions):
    print(shown)


[363, 366, 373, 376, 377, 380, 383, 386, 393, 396, 397, 399, 404, 410, 411, 419, 423, 432, 439, 444, 445, 451]
[[5.176763 ]
 [5.2175183]
 [5.1630387]
 [4.9565325]
 [4.7626443]
 [4.7150226]
 [5.062947 ]
 [5.116655 ]
 [4.758366 ]
 [5.076514 ]
 [1.5369546]
 [4.5852284]
 [5.003554 ]
 [3.8017702]
 [5.1215777]
 [4.058332 ]
 [1.3813007]
 [4.441548 ]
 [2.6489275]
 [4.7530046]
 [5.2296114]
 [5.108672 ]]


In [47]:
# NOTE(review): this always-failing assert looks like a deliberate stop so that
# "Run All" halts here, before the export cells that follow — confirm.
assert False

AssertionError: 

# Export Prediction

In [44]:
# Write this question set's predictions to ./output_tf/Q{question_set}.csv with
# columns ID and Score, clipping every score to the range [0, 5].
output_path = f"./output_tf/Q{question_set}.csv"
coll = ['ID', 'Score']

# Read the ID column once rather than re-fetching it on every loop iteration.
test_id_list = list(test_dataset['ID'])

# predictions.predictions holds one single-element row per example (see the
# printed output above); take that element. Casting to float64 first keeps the
# values written to the CSV the same as when they were collected one by one.
raw_scores = np.asarray(predictions.predictions, dtype=np.float64)[:, 0]
clipped_scores = np.clip(raw_scores, 0, 5)

# Create the output directory on first use so a fresh checkout can export.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df = pd.DataFrame({coll[0]: test_id_list, coll[1]: clipped_scores})
df.to_csv(output_path, index=False)

In [45]:
# Combine the per-question-set exports into one file, ordered by ID.
sets = [1, 2, 3, 4]
coll = ['ID', 'Score']

all_out = []
for s in sets:
    output_path = f"./output_tf/Q{s}.csv"
    df = pd.read_csv(output_path)
    id_values = df['ID'].tolist()
    score_values = df['Score'].tolist()
    # One [ID, Score] pair per row of this question set's file.
    all_out.extend([row_id, row_score] for row_id, row_score in zip(id_values, score_values))

# Stable in-place sort on the ID only, so rows with equal IDs keep file order.
all_out.sort(key=lambda pair: pair[0])

df = pd.DataFrame(all_out, columns=coll)
df.to_csv('./output_tf/all.csv', index=False)
print(df)

     ID     Score
0   362  3.421832
1   363  5.000000
2   364  3.506828
3   365  5.000000
4   366  5.000000
..  ...       ...
85  447  1.933424
86  448  0.764716
87  449  1.222936
88  450  1.270083
89  451  5.000000

[90 rows x 2 columns]


In [None]:
# NOTE(review): this always-failing assert looks like a deliberate stop so that
# "Run All" does not continue into the restore cell that follows — confirm.
assert False

# Restore

If the new predictions for a question set turn out worse than the previous ones, restore that question set's scores from the previous combined output (`all_old.csv`).

In [49]:
# Overwrite the scores in one per-question-set file with the scores that the
# same IDs have in an older combined export.
restore_file = "./output_tf/Q1.csv"
from_file = "./output_tf/all_old.csv"

df = pd.read_csv(restore_file)
df2 = pd.read_csv(from_file)

# Look scores up by ID instead of scanning forward through df2: this does not
# depend on both files being sorted the same way. If an ID occurs more than
# once in the old file, its first occurrence is used.
old_scores = df2.drop_duplicates(subset='ID', keep='first').set_index('ID')['Score']

# Fail with a clear message before touching the file if any ID cannot be restored.
missing_ids = df.loc[~df['ID'].isin(old_scores.index), 'ID'].tolist()
if missing_ids:
    raise KeyError(f"IDs in {restore_file} not found in {from_file}: {missing_ids}")

restored_scores = df['ID'].map(old_scores)

# Report each replacement as "ID: current score -> restored score".
for row_id, current_score, previous_score in zip(df['ID'], df['Score'], restored_scores):
    print(f"{row_id}: {current_score} -> {previous_score}")

df['Score'] = restored_scores
df.to_csv(restore_file, index=False)

372
372
2.0915958881378174 2.0915958881378174

374
374
1.0348477363586426 1.0348477363586426

375
375
0.6046752333641052 0.6046752333641052

381
381
1.5637779235839844 1.5637779235839844

390
390
1.01014506816864 1.01014506816864

395
395
2.3856542110443115 2.3856542110443115

401
401
1.934255719184876 1.934255719184876

402
402
4.538719654083252 4.538719654083252

403
403
2.670741081237793 2.670741081237793

405
405
0.8457199335098267 0.8457199335098267

413
413
0.7458707094192505 0.7458707094192505

421
421
1.0416152477264404 1.0416152477264404

422
422
1.340225338935852 1.340225338935852

424
424
1.058066487312317 1.058066487312317

426
426
0.5104495286941528 0.5104495286941528

430
430
5.0 5.0

437
437
3.375184774398804 3.375184774398804

442
442
3.966780662536621 3.966780662536621

446
446
1.9142019748687744 1.9142019748687744

448
448
0.7647156715393066 0.7647156715393066

449
449
1.2229357957839966 1.2229357957839966

450
450
1.2700825929641724 1.2700825929641724

