In [241]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [242]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read JSON/ Map Class

In [243]:
json_path = "/content/drive/MyDrive/SuperAI/train.json"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)


In [244]:
records = []
for entry in data:
    # Get all quotes from the "quotes" list
    quotes_text = []
    for q in entry["quotes"]:
        quotes_text.append(q["quote"])

    # Combine into one string (separated by space or newline)
    combined_quotes = " ".join(quotes_text)

    # Add a record to our list
    records.append({
        "text": combined_quotes,
        "label": entry["case"]
    })

df = pd.DataFrame(records)
print(df.head())


                                                text  label
0  Excuse me? WeII, you Iook normaI. I'm sad. WeI...  Case2
1  You’re not perfect, sport, and let me save you...  Case1
2  I can’t concentrate on anything. we need to ch...  Case7
3  I was starving. Do you want me to make you ano...  Case5
4  Sometimes I wish I could just go to sleep unti...  Case1


In [245]:
mapping_dict = {
    "Case1": 0,
    "Case2": 1,
    "Case3": 2,
    "Case4": 3,
    "Case5": 4,
    "Case6": 5,
    "Case7": 6,
    "Case8": 7,
    "Case9": 8
}


df["label"] = df["label"].map(mapping_dict)


# train val splilt 80:20

In [246]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)


# Convert to Hugging Face Dataset

In [247]:
pip install datasets



In [248]:
# Hugging Face Dataset
from datasets import Dataset
# Convert pyhton dataframe to Hugging Face arrow dataset
hg_train_data = Dataset.from_pandas(train_data)
hg_val_data = Dataset.from_pandas(val_data)


In [249]:
# Length of the Dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 84.



{'text': "I'm not here. This isn't happening. You can't reason your way out of this.",
 'label': 5,
 '__index_level_0__': 55}

In [250]:
# Validate the record in pandas dataframe
df.iloc[[55]]

Unnamed: 0,text,label
55,I'm not here. This isn't happening. You can't ...,5


# Tokenize text

In [251]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

In [252]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [253]:
# Mapping between special tokens and their IDs.
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unkown token is {tokenizer.unk_token_id}.')
print(f'The seperator token is {tokenizer.sep_token} and the ID for the seperator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

The unknown token is [UNK] and the ID for the unkown token is 100.
The seperator token is [SEP] and the ID for the seperator token is 102.
The pad token is [PAD] and the ID for the pad token is 0.
The sentence level classification token is [CLS] and the ID for the classification token is 101.
The mask token is [MASK] and the ID for the mask token is 103.


In [254]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["text"],
                     max_length=32,
                     truncation=True,
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_val = hg_val_data.map(tokenize_dataset)


Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [255]:
# Take a look at the data
print(dataset_train)
print(dataset_val)

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 84
})
Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 21
})


In [256]:
# Check the first record
dataset_train[0]

{'text': "I'm not here. This isn't happening. You can't reason your way out of this.",
 'label': 5,
 '__index_level_0__': 55,
 'input_ids': [101,
  146,
  112,
  182,
  1136,
  1303,
  119,
  1188,
  2762,
  112,
  189,
  5664,
  119,
  1192,
  1169,
  112,
  189,
  2255,
  1240,
  1236,
  1149,
  1104,
  1142,
  119,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

# Load Pretrained model

In [257]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=9)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [258]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="/content/wandb/sentiment_transfer_learning_transformer/",
    logging_dir='/content/wandb/sentiment_transfer_learning_transformer/logs',
    logging_strategy='epoch',
    logging_steps=100,
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    seed=2025,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)



# Eval Metric


> F1 Macro



In [259]:
pip install evaluate



In [260]:
# Model performance evaluation
import evaluate

In [261]:
# Number of evaluation modules
print(f'There are {len(evaluate.list_evaluation_modules())} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluate.list_evaluation_modules()

There are 201 evaluation models in Hugging Face.



['codeparrot/apps_metric',
 'lvwerra/test',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'hack/test_metric',
 'yzha/ctc_eval',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsedykh/codebleu',
 'idsedykh/megaglue',
 'Vertaix/vendiscore',
 'GMFTBY/dailydialogevaluate',
 'GMFTBY/dailydialog_evaluate',
 'jzm-mailchimp/joshs_second_test_metric',
 'ola13/precision_at_k',
 'yulong-me/yl_metric',
 'abidlabs/mean_iou',
 'abidlabs/mean_iou2',
 'KevinSpaghetti/accuracyk',
 'NimaBoscarino/weat',
 'ronaldahmed/nwentfaithfulness',
 'Viona/infolm',
 'kyokote/my_metric2',
 'kashif/mape',
 'Ochiroo/rouge_mn',
 'leslyarun/fbeta_score',
 'anz2/iliauniiccocrevaluation',
 'zbeloki/m2',
 'xu1998hz/sescore',
 'dvitel/codebleu',
 'NCSOFT/harim_plus',
 'JP-SystemsX/nDCG',
 'sportlosos/sescore',
 'Dru

In [262]:
import numpy as np
def compute_metrics(eval_pred):
    metric = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels, average="macro")


# Train model using transformer trainer

In [263]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,2.1532,1.97004,0.118182
2,1.7967,1.847153,0.27793
3,1.3081,1.801724,0.195833
4,0.9712,1.592734,0.361269
5,0.5729,1.510015,0.619792
6,0.3369,1.542427,0.578125
7,0.1956,1.557777,0.604167
8,0.106,1.935738,0.486111
9,0.0683,1.625772,0.583333
10,0.0537,1.766984,0.622619


TrainOutput(global_step=315, training_loss=0.5168305058327932, metrics={'train_runtime': 180.1151, 'train_samples_per_second': 23.318, 'train_steps_per_second': 5.83, 'total_flos': 20721297864960.0, 'train_loss': 0.5168305058327932, 'epoch': 15.0})

# Test

In [264]:
# Path to your JSON file
json_path = "/content/drive/MyDrive/SuperAI/test.json"

# Load JSON data into a Python list of dicts
with open(json_path, "r", encoding="utf-8") as f:
    test = json.load(f)

In [265]:
records = []
for entry in test:
    # Get all quotes from the "quotes" list
    quotes_text = []
    for q in entry["quotes"]:
        quotes_text.append(q["quote"])

    # Combine into one string (separated by space or newline)
    combined_quotes = " ".join(quotes_text)

    # Add a record to our list
    records.append({
        "text": combined_quotes,

    })
test_df = pd.DataFrame(records)
test_df

Unnamed: 0,text
0,That's the problem. I don't know how. I can't ...
1,"I wish I could explain it, Frank. I just lose ..."
2,I am mistaken I am not sure I can teach you. A...
3,"John, I need you to tell me what you need. I c..."
4,"I'm not hungry, Tiffany. You should eat someth..."
5,I can't fail. Do you understand? This is all ...
6,I want his eyelids! I guessed I kept them wait...
7,"I said we’d lose, you said we’d “do that toget..."
8,I can't focus on work. You're distracted. Mayb...
9,I can't get married - I'm a 30-year-old boy. W...


In [266]:
# test_df["label"] = test_df["label"].map(mapping_dict)

In [267]:
test_df.head()

Unnamed: 0,text
0,That's the problem. I don't know how. I can't ...
1,"I wish I could explain it, Frank. I just lose ..."
2,I am mistaken I am not sure I can teach you. A...
3,"John, I need you to tell me what you need. I c..."
4,"I'm not hungry, Tiffany. You should eat someth..."


In [268]:
hg_test_data = Dataset.from_pandas(test_df)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [269]:
# Predictions
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions
y_test_predict[0]

array([[ 3.5499275e-01,  1.2230365e+00, -2.9269210e-01, -9.7007197e-01,
        -1.0558863e+00,  1.8499060e+00, -4.2934847e-01, -1.7643950e+00,
         2.7415106e-01],
       [ 1.8950680e-01, -1.2656978e+00,  2.5041161e-02, -8.3362490e-02,
         2.3099928e+00, -7.0387763e-01,  1.0396714e+00, -1.0115691e+00,
         1.3581443e-01],
       [-5.8377653e-01,  6.0341269e-01, -8.2731068e-01, -1.4328243e+00,
        -5.8213824e-01,  3.7265015e+00, -1.0319794e+00, -1.3956780e+00,
        -4.3595150e-02],
       [-1.6274382e-01,  3.3981624e-01, -5.9848398e-01, -1.2163571e+00,
        -4.5290199e-01,  3.1197684e+00, -5.1644975e-01, -1.5083919e+00,
         3.1491289e-01],
       [-3.1394327e-01, -1.3559934e+00, -2.3289624e-01, -3.4998506e-02,
         4.1285710e+00, -3.8424057e-01,  3.3060738e-01, -2.8590775e-01,
        -3.8328078e-01],
       [-6.6586632e-01,  7.4987382e-01, -6.8590194e-01, -1.5245447e+00,
        -8.0952120e-01,  3.7372794e+00, -9.5089209e-01, -1.3835448e+00,
         3.

In [270]:
# Predicted logits
y_test_logits = y_test_predict.predictions

# First 5 predicted probabilities
y_test_logits[:5]

array([[ 0.35499275,  1.2230365 , -0.2926921 , -0.970072  , -1.0558863 ,
         1.849906  , -0.42934847, -1.764395  ,  0.27415106],
       [ 0.1895068 , -1.2656978 ,  0.02504116, -0.08336249,  2.3099928 ,
        -0.7038776 ,  1.0396714 , -1.0115691 ,  0.13581443],
       [-0.58377653,  0.6034127 , -0.8273107 , -1.4328243 , -0.58213824,
         3.7265015 , -1.0319794 , -1.395678  , -0.04359515],
       [-0.16274382,  0.33981624, -0.598484  , -1.2163571 , -0.452902  ,
         3.1197684 , -0.51644975, -1.5083919 ,  0.3149129 ],
       [-0.31394327, -1.3559934 , -0.23289624, -0.03499851,  4.128571  ,
        -0.38424057,  0.33060738, -0.28590775, -0.38328078]],
      dtype=float32)

In [271]:
import tensorflow as tf
# Predicted probabilities
y_test_probabilities = tf.nn.softmax(y_test_logits)

# First 5 predicted logits
y_test_probabilities.shape

TensorShape([46, 9])

In [272]:
# Predicted labels
y_test_pred_labels = np.argmax(y_test_probabilities, axis=1)

# First 5 predicted probabilities
y_test_pred_labels.shape

(46,)

In [273]:
y_test_pred_labels

array([5, 4, 5, 5, 4, 5, 1, 5, 6, 1, 1, 1, 5, 8, 5, 5, 1, 2, 5, 1, 5, 6,
       5, 5, 5, 5, 1, 5, 2, 2, 6, 5, 5, 3, 1, 4, 5, 1, 5, 1, 4, 1, 1, 5,
       2, 5])

In [274]:
# Actual labels
y_test_actual_labels = y_test_predict.label_ids

# First 5 predicted probabilities
y_test_actual_labels

In [275]:
sub = '/content/drive/MyDrive/SuperAI/sample_submission.csv'

sub_file = pd.read_csv(sub)
sub_file.head()

Unnamed: 0,ID,Answer
0,1,2.0
1,2,6.0
2,3,7.0
3,4,
4,5,


In [276]:
ans = y_test_pred_labels.tolist()

In [None]:
# Map กลับ to original label
ans_final = [e+1 for e in ans]

In [278]:
df = pd.DataFrame({'ID': range(1,47), 'predictions': ans_final})

# Save file to submission

In [279]:
df.to_csv('submission.csv', index=False)