<a href="https://colab.research.google.com/github/Anujchobe/Anujkchobe/blob/main/LLM_Classification_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import json
import os
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split


In [5]:
class CFG:
    """Notebook-wide configuration, read as ``CFG.<name>`` without instantiation."""

    # One entry per ensemble member. Each value is used as the ``random_state``
    # of that member's train/validation split and as part of its model file name.
    seeds = [
        42,
        119,
        2020,
        2024,
        2028,
    ]


In [6]:
# Load the training table (prompt / response_a / response_b plus the three
# one-hot winner columns) and preview its first rows as the cell output.
train_df = pd.read_csv('/content/train.csv')
train_df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [7]:
# Build one (conversation_a, conversation_b) text pair and one integer label
# per training row.
#   conversation_x = every prompt turn followed by model x's reply, each
#                    terminated by a newline.
#   label encoding  = 0: tie, 1: model A wins, 2: model B wins.
prompt_list = []
targets = []

# Iterate the needed columns directly instead of calling train_df.iloc[i]
# six times per row (each .iloc[i] materialises a whole row Series).
rows = zip(
    train_df["prompt"],
    train_df["response_a"],
    train_df["response_b"],
    train_df["winner_tie"],
    train_df["winner_model_a"],
    train_df["winner_model_b"],
)
for i, (prompt_json, response_a_json, response_b_json, is_tie, a_wins, b_wins) in enumerate(
    tqdm(rows, total=len(train_df))
):
    # Each of the three text columns holds a JSON-encoded list, one entry per turn.
    prompts = json.loads(prompt_json)
    response_a = json.loads(response_a_json)
    response_b = json.loads(response_b_json)

    parts_a = []
    parts_b = []
    for j, prompt in enumerate(prompts):
        # A JSON null reply becomes the literal text "None" so it can be concatenated.
        reply_a = "None" if response_a[j] is None else response_a[j]
        reply_b = "None" if response_b[j] is None else response_b[j]
        parts_a.append(prompt + "\n" + reply_a + "\n")
        parts_b.append(prompt + "\n" + reply_b + "\n")
    prompt_list.append(("".join(parts_a), "".join(parts_b)))

    # Exactly one winner flag must be set. Appending from three independent
    # `if`s would add zero or several labels for a malformed row and silently
    # shift every later label relative to prompt_list.
    winner_flags = [int(is_tie == 1), int(a_wins == 1), int(b_wins == 1)]
    if sum(winner_flags) != 1:
        raise ValueError(
            f"Row {i}: expected exactly one winner flag to be 1, got "
            f"winner_tie={is_tie}, winner_model_a={a_wins}, winner_model_b={b_wins}"
        )
    # Position in winner_flags is the class id: 0 = tie, 1 = A, 2 = B.
    targets.append(winner_flags.index(1))
len(prompt_list)

  0%|          | 0/57477 [00:00<?, ?it/s]

57477

In [8]:
# Step 2: Configure the shared TextVectorization layer and fit its vocabulary
vocab_size = 20000  # Upper bound on the vocabulary size (tune this as needed)
max_length = 1024   # Token length every text is vectorized to (tune this as needed)
text_vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=max_length,
)
# The vocabulary is learned from both sides of every pair:
# all first-side texts, followed by all second-side texts.
adapt_corpus = [pair[side] for side in (0, 1) for pair in prompt_list]
text_vectorizer.adapt(adapt_corpus)

In [9]:
def get_dataset(prompt_list, targets, shuffle=True, batch_size=128):
    """Wrap text pairs and their labels in a batched, prefetching ``tf.data.Dataset``.

    Args:
        prompt_list: Sequence of items whose ``[0]`` and ``[1]`` entries are the
            two texts of a pair.
        targets: Labels, one per item of ``prompt_list``.
        shuffle: When true, shuffle the elements (buffer of 2048) before batching.
        batch_size: Number of elements per batch.

    Returns:
        A dataset whose elements are ``((first_texts, second_texts), labels)`` batches.
    """
    first_texts, second_texts = [], []
    for pair in prompt_list:
        first_texts.append(pair[0])
        second_texts.append(pair[1])

    features = (first_texts, second_texts)
    pipeline = tf.data.Dataset.from_tensor_slices((features, targets))
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size=2048)
    batched = pipeline.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

In [10]:
def get_base_model(inputs, embedding):
    """Vectorize a text input with the module-level ``text_vectorizer`` and embed it.

    Args:
        inputs: Tensor passed to ``text_vectorizer``.
        embedding: Callable (an embedding layer in ``get_model``) applied to the
            vectorizer's output.

    Returns:
        The result of ``embedding`` applied to the vectorized ``inputs``.
    """
    token_ids = text_vectorizer(inputs)
    return embedding(token_ids)
def get_model():
    """Build and compile the two-input CNN classifier.

    Both string inputs go through ``get_base_model`` with one shared embedding
    layer; the two embedded sequences are concatenated and fed through two
    convolutional stages, global average pooling and a dense head that ends in
    a 3-way softmax.

    Returns:
        A compiled ``tf.keras.Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers

    text_a = tf.keras.Input(shape=(1,), dtype=tf.string)
    text_b = tf.keras.Input(shape=(1,), dtype=tf.string)

    # A single embedding instance is applied to both inputs, so its weights are shared.
    shared_embedding = layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True)
    embedded_a = get_base_model(text_a, shared_embedding)
    embedded_b = get_base_model(text_b, shared_embedding)
    features = layers.Concatenate()([embedded_a, embedded_b])

    # Two identical stages that differ only in filter count:
    # Conv -> Conv -> SpatialDropout -> MaxPool.
    for n_filters in (32, 64):
        features = layers.Conv1D(n_filters, 3, activation="relu")(features)
        features = layers.Conv1D(n_filters, 3, activation="relu")(features)
        features = layers.SpatialDropout1D(0.2)(features)
        features = layers.MaxPooling1D()(features)

    features = layers.GlobalAveragePooling1D()(features)
    features = layers.Dropout(0.3)(features)
    features = layers.Dense(128, activation="swish")(features)
    probabilities = layers.Dense(3, activation="softmax")(features)

    classifier = tf.keras.Model(inputs=[text_a, text_b], outputs=probabilities)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [11]:
# Train (or load) one model per seed; the collected models are kept in `models`.
models = []
for seed in CFG.seeds:
    # Seed Python, NumPy and TensorFlow for this run. Without this, `seed` only
    # controlled the data split below, while weight initialisation, dropout and
    # dataset shuffling stayed unseeded and differed on every execution.
    # NOTE(review): some GPU kernels may still be non-deterministic — confirm if
    # bit-exact reproducibility is required.
    tf.keras.utils.set_random_seed(seed)

    model_name = f"model_{seed}.keras"
    # Step 1: Split texts and labels into train and validation sets
    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        prompt_list, targets, test_size=0.2, random_state=seed
    )
    valid_ds = get_dataset(valid_texts, valid_labels, shuffle=False)
    # Location of a previously trained model for this seed, if one is available.
    model_name_path = f"/kaggle/input/llm-classification-finetuning-with-cnn-model/{model_name}"
    if not os.path.exists(model_name_path):
        # No saved model found: train from scratch.
        train_ds = get_dataset(train_texts, train_labels)
        model = get_model()
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=model_name,  # Filepath to save the best model
            monitor='val_loss',        # Metric to monitor
            mode="min",
            save_best_only=True,       # Save only the best model
            verbose=1
        )
        early_stopping_callback = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',        # Metric to monitor
            patience=5,                # Number of epochs with no improvement to wait before stopping
            verbose=1,
            restore_best_weights=True  # Restore weights from the best epoch
        )
        model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint_callback, early_stopping_callback])
        # Reload the checkpoint written at the best val_loss epoch.
        model.load_weights(model_name)
    else:
        # Reuse the saved model and copy it next to the notebook under the same name.
        model = tf.keras.models.load_model(model_name_path)
        model.save(model_name)
    loss, acc = model.evaluate(valid_ds, verbose=0)
    print(f"Validation Loss: {loss: .4f} Validation Accuracy: {acc * 100: .4f}%")
    models.append(model)

Epoch 1/30




[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.3858 - loss: 1.0853
Epoch 1: val_loss improved from inf to 1.05596, saving model to model_42.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 58ms/step - accuracy: 0.3859 - loss: 1.0853 - val_accuracy: 0.4591 - val_loss: 1.0560
Epoch 2/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 66ms/step - accuracy: 0.4543 - loss: 1.0537
Epoch 2: val_loss improved from 1.05596 to 1.04132, saving model to model_42.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 75ms/step - accuracy: 0.4544 - loss: 1.0536 - val_accuracy: 0.4699 - val_loss: 1.0413
Epoch 3/30
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4978 - loss: 1.0095
Epoch 3: val_loss did not improve from 1.04132
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 58ms/step - accuracy: 0.4978 - loss: 1.0094 - val_accuracy: 0.4540



[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3884 - loss: 1.0838
Epoch 1: val_loss improved from inf to 1.05951, saving model to model_119.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 59ms/step - accuracy: 0.3885 - loss: 1.0838 - val_accuracy: 0.4543 - val_loss: 1.0595
Epoch 2/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 54ms/step - accuracy: 0.4540 - loss: 1.0593
Epoch 2: val_loss improved from 1.05951 to 1.05248, saving model to model_119.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 61ms/step - accuracy: 0.4540 - loss: 1.0593 - val_accuracy: 0.4673 - val_loss: 1.0525
Epoch 3/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.4880 - loss: 1.0216
Epoch 3: val_loss did not improve from 1.05248
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.4881 - loss: 1.0215 - val_accuracy: 0.46



[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.3852 - loss: 1.0850
Epoch 1: val_loss improved from inf to 1.06000, saving model to model_2020.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 59ms/step - accuracy: 0.3853 - loss: 1.0850 - val_accuracy: 0.4446 - val_loss: 1.0600
Epoch 2/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 52ms/step - accuracy: 0.4556 - loss: 1.0537
Epoch 2: val_loss did not improve from 1.06000
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 58ms/step - accuracy: 0.4556 - loss: 1.0537 - val_accuracy: 0.4588 - val_loss: 1.0633
Epoch 3/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 51ms/step - accuracy: 0.5105 - loss: 0.9955
Epoch 3: val_loss did not improve from 1.06000
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 59ms/step - accuracy: 0.5106 - loss: 0.9953 - val_accuracy: 0.4406 - val_loss: 1.1041
Epoch 4/30
[1



[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.4019 - loss: 1.0802
Epoch 1: val_loss improved from inf to 1.06075, saving model to model_2024.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 61ms/step - accuracy: 0.4020 - loss: 1.0802 - val_accuracy: 0.4436 - val_loss: 1.0608
Epoch 2/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 53ms/step - accuracy: 0.4678 - loss: 1.0471
Epoch 2: val_loss improved from 1.06075 to 1.05185, saving model to model_2024.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 60ms/step - accuracy: 0.4678 - loss: 1.0470 - val_accuracy: 0.4583 - val_loss: 1.0519
Epoch 3/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 57ms/step - accuracy: 0.5234 - loss: 0.9839
Epoch 3: val_loss did not improve from 1.05185
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 62ms/step - accuracy: 0.5235 - loss: 0.9838 - val_accuracy: 0.



Epoch 1/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 52ms/step - accuracy: 0.3918 - loss: 1.0851
Epoch 1: val_loss improved from inf to 1.05935, saving model to model_2028.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 60ms/step - accuracy: 0.3920 - loss: 1.0851 - val_accuracy: 0.4486 - val_loss: 1.0593
Epoch 2/30
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.4552 - loss: 1.0555
Epoch 2: val_loss improved from 1.05935 to 1.04888, saving model to model_2028.keras
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 63ms/step - accuracy: 0.4552 - loss: 1.0554 - val_accuracy: 0.4601 - val_loss: 1.0489
Epoch 3/30
[1m359/360[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.5049 - loss: 1.0049
Epoch 3: val_loss did not improve from 1.04888
[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 55ms/step - accuracy: 0.5050 - loss: 1.0047 - val_a

In [12]:
# Load the test table that the ensemble will score.
test_df = pd.read_csv('/content/test.csv')

In [13]:
# Build the (conversation_a, conversation_b) text pair for every test row,
# using the same layout as for training: each prompt turn followed by the
# model's reply, both newline-terminated.
test_prompt_list = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Each text column holds a JSON-encoded list with one entry per turn.
    turns = json.loads(row["prompt"])
    answers_a = json.loads(row["response_a"])
    answers_b = json.loads(row["response_b"])

    pieces_a = []
    pieces_b = []
    for turn_idx, turn in enumerate(turns):
        # A JSON null reply becomes the literal text "None".
        reply_a = "None" if answers_a[turn_idx] is None else answers_a[turn_idx]
        reply_b = "None" if answers_b[turn_idx] is None else answers_b[turn_idx]
        pieces_a.append(turn + "\n" + reply_a + "\n")
        pieces_b.append(turn + "\n" + reply_b + "\n")

    test_prompt_list.append(("".join(pieces_a), "".join(pieces_b)))
len(test_prompt_list)

  0%|          | 0/3 [00:00<?, ?it/s]

3

In [14]:
def get_test_dataset(prompt_list, batch_size=128):
    """Wrap unlabeled text pairs in a batched, prefetching ``tf.data.Dataset``.

    The dataset keeps the ``((first_texts, second_texts), labels)`` element
    structure by attaching an all-zero placeholder label to every pair.

    Args:
        prompt_list: Sequence of items whose ``[0]`` and ``[1]`` entries are the
            two texts of a pair.
        batch_size: Number of elements per batch.

    Returns:
        The batched dataset, in the original order (no shuffling is applied).
    """
    sides = ([], [])
    for pair in prompt_list:
        sides[0].append(pair[0])
        sides[1].append(pair[1])

    placeholder_labels = [0] * len(prompt_list)
    pipeline = tf.data.Dataset.from_tensor_slices((sides, placeholder_labels))
    batched = pipeline.batch(batch_size)
    return batched.prefetch(tf.data.AUTOTUNE)

In [16]:
# Batch the test conversation pairs for inference.
test_ds = get_test_dataset(prompt_list=test_prompt_list)

In [17]:
# Ensemble: average the predictions of all models element-wise (over the model axis).
per_model_probs = [member.predict(test_ds, verbose=0) for member in models]
result = np.mean(per_model_probs, axis=0)



In [19]:
# Fill the sample submission with the averaged predictions and write it out.
submission = pd.read_csv("/content/sample_submission.csv")
# Column i of `result` goes to the i-th name below (0: tie, 1: model A, 2: model B).
for class_idx, column_name in enumerate(("winner_tie", "winner_model_a", "winner_model_b")):
    submission[column_name] = result[:, class_idx]
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.299958,0.309517,0.390525
1,211333,0.434367,0.25903,0.306603
2,1233961,0.393473,0.363816,0.24271
