In [10]:
import os

# Select the Keras backend; this is set ahead of the keras / keras_nlp imports below.
os.environ["KERAS_BACKEND"] = "torch"  # alternatives: "tensorflow" or "jax"

import keras_nlp
import keras
import tensorflow as tf

import numpy as np
import pandas as pd
from tqdm import tqdm
import json

import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px

ModuleNotFoundError: No module named 'keras_nlp'

In [2]:
# Report the versions of the core libraries used by this notebook.
for label, module in (("TensorFlow:", tf), ("Keras:", keras), ("KerasNLP:", keras_nlp)):
    print(label, module.__version__)

NameError: name 'tf' is not defined

In [8]:
class CFG:
    """Central configuration: random seed, model preset, training sizes and label maps.

    The seed is 42, the number made famous by the science-fiction novel
    "The Hitchhiker's Guide to the Galaxy", where it is described as the
    "ultimate answer to life, the universe, and everything". It has since
    become a cultural in-joke, especially in tech and programming circles.
    """
    # Reproducibility
    seed = 42

    # Model and tokenisation
    preset = "deberta_v3_extra_small_en"  # model preset name
    sequence_length = 512  # input sequence length

    # Training
    epochs = 3
    batch_size = 16
    scheduler = 'cosine'  # learning-rate scheduler

    # Label encoding: integer class id <-> winner column name
    label2name = dict(enumerate(('winner_model_a', 'winner_model_b', 'winner_tie')))
    name2label = dict(zip(label2name.values(), label2name.keys()))
    class_labels = [*label2name]
    class_names = [*label2name.values()]

In [None]:
# Seed the random number generators through Keras with the configured seed.
keras.utils.set_random_seed(CFG.seed)

In [10]:
# Make "mixed_float16" the global Keras dtype policy.
keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Directory that holds the competition CSV files; the read_csv calls below build paths from it.
BASE_PATH = './lmsys-chatbot-arena'

In [None]:
# Load the raw training data.
df = pd.read_csv(f'{BASE_PATH}/train.csv')

model_indicators = ['gpt-4', 'gpt-3.5', 'mistral', 'llama', 'koala']

def contains_model_name(response):
    """Return True when ``response`` mentions any entry of ``model_indicators``.

    The check is a case-insensitive substring match.
    """
    haystack = response.lower()
    return any(needle.lower() in haystack for needle in model_indicators)

# Keep only the rows where neither response mentions a model name.
mentions_a = df['response_a'].apply(contains_model_name)
mentions_b = df['response_b'].apply(contains_model_name)
df_filtered = df[~mentions_a & ~mentions_b]

# Save the cleaned data under BASE_PATH: the loading cell reads it back from
# f'{BASE_PATH}/cleaned_train.csv', so writing it to the working directory
# would leave that read pointing at a missing (or stale) file.
df_filtered.to_csv(f'{BASE_PATH}/cleaned_train.csv', index=False)

# Preview the filtered data (last expression of the cell, so it is rendered).
df_filtered.head()

In [17]:
df = pd.read_csv(f'{BASE_PATH}/cleaned_train.csv')

# Keep the first prompt and its responses. The text columns are assumed to hold
# JSON-encoded lists (the previous code patched JSON `null` before calling eval).
# json.loads is used instead of eval(x.replace("null", "''")) because it does
# not execute file contents as code, does not rewrite the word "null" inside
# the text itself, and decodes \uXXXX surrogate-pair escapes into real characters.
df["prompt"] = df.prompt.map(lambda raw: json.loads(raw)[0] or "")
# A JSON null entry is parsed as None; `or ""` turns it into an empty string.
df["response_a"] = df.response_a.map(lambda raw: json.loads(raw)[0] or "")
df["response_b"] = df.response_b.map(lambda raw: json.loads(raw)[0] or "")

# class_name is the name of the winner column holding the largest value.
df["class_name"] = df[["winner_model_a", "winner_model_b", "winner_tie"]].idxmax(axis=1)
# Encode the class name as the integer label 0, 1 or 2.
df["class_label"] = df.class_name.map(CFG.name2label)

df.head()

In [None]:
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')

# Keep the first prompt and its responses. The text columns are assumed to hold
# JSON-encoded lists; json.loads replaces eval(x.replace("null", "''")) so file
# contents are never executed as code and the word "null" inside the text is
# left untouched. A JSON null entry is parsed as None and mapped to "".
test_df["prompt"] = test_df.prompt.map(lambda raw: json.loads(raw)[0] or "")
test_df["response_a"] = test_df.response_a.map(lambda raw: json.loads(raw)[0] or "")
test_df["response_b"] = test_df.response_b.map(lambda raw: json.loads(raw)[0] or "")

test_df.head()

In [None]:
def _utf8_or_blank(row, field):
    """Return ``(text, False)`` if ``row.<field>`` round-trips through UTF-8, else ``("", True)``."""
    try:
        return getattr(row, field).encode("utf-8").decode("utf-8"), False
    except Exception:
        # Best-effort on purpose (e.g. UnicodeEncodeError for lone surrogates,
        # AttributeError for non-string values). Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return "", True


def make_pairs(row):
    """Build the two "prompt + response" option texts for one row.

    Args:
        row: object exposing ``prompt``, ``response_a`` and ``response_b`` as
            attributes and supporting item assignment (a DataFrame row when
            used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The same ``row`` with two entries added:
        ``encode_fail`` - True if any of the three texts could not be
        encoded/decoded as UTF-8 (that text is replaced by ""), and
        ``options`` - a two-element list, one text per response.
    """
    prompt, prompt_failed = _utf8_or_blank(row, "prompt")
    response_a, a_failed = _utf8_or_blank(row, "response_a")
    response_b, b_failed = _utf8_or_blank(row, "response_b")

    row["encode_fail"] = prompt_failed or a_failed or b_failed
    row['options'] = [
        f"Prompt: {prompt}\n\nResponse: {response_a}",
        f"Prompt: {prompt}\n\nResponse: {response_b}",
    ]
    return row

In [None]:
# Add the "options" / "encode_fail" columns to the training frame and preview two rows.
df = df.apply(make_pairs, axis=1)  # run make_pairs on every row of df
display(df.head(2))

# Same transformation for the test frame.
test_df = test_df.apply(make_pairs, axis=1)
display(test_df.head(2))

In [None]:
# Count how often each LLM appears on either side (model_a or model_b).
model_df = pd.concat([df.model_a, df.model_b])
counts = model_df.value_counts().reset_index()
counts.columns = ['LLM', 'Count']

# Bar chart of the counts, coloured by count, with slanted x tick labels.
fig = px.bar(
    counts,
    x='LLM',
    y='Count',
    title='Distribution of LLMs',
    color='Count',
    color_continuous_scale='viridis',
).update_layout(xaxis_tickangle=-45)

fig.show()

In [None]:
# Count how many training rows fall into each winner class.
counts = df['class_name'].value_counts().reset_index()
counts.columns = ['Winner', 'Win Count']

# Bar chart of the class counts with explicit axis titles.
fig = px.bar(
    counts,
    x='Winner',
    y='Win Count',
    title='Winner distribution for Train Data',
    labels={'Winner': 'Winner', 'Win Count': 'Win Count'},
    color='Winner',
    color_continuous_scale='viridis',
).update_layout(xaxis_title="Winner", yaxis_title="Win Count")

fig.show()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/validation split on the class label. random_state is
# pinned to the configured seed so the split does not depend on whatever the
# global random state happens to be when this cell runs.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["class_label"],
    random_state=CFG.seed,
)

In [None]:
# Create the DeBERTa-v3 preprocessor from the configured preset.
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset,  # model preset name
    sequence_length=CFG.sequence_length,  # maximum sequence length; shorter inputs are padded
)

In [None]:
# Run the preprocessor on the options of the first row.
first_options = df.options.iloc[0]
outs = preprocessor(first_options)

# Print the shape of every entry in the preprocessed output.
for name, tensor in outs.items():
    print(name, ":", tensor.shape)

In [None]:
def preprocess_fn(text, label=None):
    """Run ``preprocessor`` on ``text``; pair the result with ``label`` when one is given.

    Returns:
        The preprocessed text alone if ``label`` is None, otherwise the tuple
        ``(preprocessed_text, label)``.
    """
    features = preprocessor(text)
    if label is None:
        return features
    return features, label

In [None]:
def build_dataset(texts, labels=None, batch_size=32,
                  cache=True, shuffle=1024):
    """Build a batched, prefetched ``tf.data.Dataset`` from texts and optional labels.

    Args:
        texts: inputs handed to ``tf.data.Dataset.from_tensor_slices``.
        labels: optional integer class ids; one-hot encoded over 3 classes.
            When None the dataset yields preprocessed texts only.
        batch_size: number of elements per batch (the last partial batch is kept).
        cache: cache the raw slices before they are preprocessed.
        shuffle: shuffle buffer size. A falsy value disables shuffling;
            ``True`` selects the default buffer size of 1024.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    AUTO = tf.data.AUTOTUNE  # let tf.data tune parallelism / prefetch depth
    if labels is None:
        slices = (texts,)
    else:
        slices = (texts, keras.utils.to_categorical(labels, num_classes=3))
    ds = tf.data.Dataset.from_tensor_slices(slices)  # build the dataset from the slices
    if cache:
        ds = ds.cache()
    ds = ds.map(preprocess_fn, num_parallel_calls=AUTO)
    opt = tf.data.Options()  # dataset options
    if shuffle:
        # Callers pass shuffle=True; forwarding that bool as the buffer size
        # means a buffer of 1 (True == 1), i.e. no shuffling at all.
        buffer_size = 1024 if shuffle is True else shuffle
        ds = ds.shuffle(buffer_size, seed=CFG.seed)
        opt.experimental_deterministic = False
    ds = ds.with_options(opt)  # apply the dataset options
    ds = ds.batch(batch_size, drop_remainder=False)
    ds = ds.prefetch(AUTO)
    return ds

In [None]:
train_texts = train_df.options.tolist()  # training texts
train_labels = train_df.class_label.tolist()  # training labels

# Show a small sample only; printing the full lists floods the cell output.
print(train_texts[:2])
print(train_labels[:10])

# shuffle is a buffer size: passing True would be forwarded as a buffer of 1
# (True == 1) and leave the data effectively unshuffled.
train_ds = build_dataset(train_texts, train_labels,
                         batch_size=CFG.batch_size,
                         shuffle=1024)

valid_texts = valid_df.options.tolist()  # validation texts
valid_labels = valid_df.class_label.tolist()  # validation labels
valid_ds = build_dataset(valid_texts, valid_labels,
                         batch_size=CFG.batch_size,
                         shuffle=False)

In [None]:
import math


def get_lr_callback(batch_size=8, mode='cos', epochs=10, plot=False):
    """Create a per-epoch learning-rate schedule wrapped in a Keras callback.

    The schedule ramps linearly from ``lr_start`` to ``lr_max`` over
    ``lr_ramp_ep`` epochs, optionally holds ``lr_max`` for ``lr_sus_ep``
    epochs, then decays according to ``mode``.

    Args:
        batch_size: scales the peak rate (``lr_max = 0.6e-6 * batch_size``).
        mode: decay shape after the ramp: 'exp', 'step' or 'cos'.
        epochs: number of epochs used for the cosine decay length and the plot.
        plot: if True, draw the schedule for ``epochs`` epochs with matplotlib.

    Returns:
        ``keras.callbacks.LearningRateScheduler`` driven by the schedule.

    Raises:
        ValueError: if ``mode`` is not one of the supported decay shapes.
            (Previously such a mode left ``lr`` unassigned and failed with an
            UnboundLocalError once the decay phase was reached.)
    """
    if mode not in ('exp', 'step', 'cos'):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'exp', 'step' or 'cos'")

    lr_start, lr_max, lr_min = 1.0e-6, 0.6e-6 * batch_size, 1e-6
    lr_ramp_ep, lr_sus_ep, lr_decay = 2, 0, 0.8

    def lrfn(epoch):  # learning rate for the given epoch index
        if epoch < lr_ramp_ep:
            # linear warm-up from lr_start to lr_max
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
        elif mode == 'exp':
            lr = (lr_max - lr_min) * lr_decay ** (epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        elif mode == 'step':
            # multiply by lr_decay once every 2 epochs
            lr = lr_max * lr_decay ** ((epoch - lr_ramp_ep - lr_sus_ep) // 2)
        elif mode == 'cos':
            # the half-cosine spans 3 epochs more than the remaining epochs
            decay_total_epochs, decay_epoch_index = epochs - lr_ramp_ep - lr_sus_ep + 3, epoch - lr_ramp_ep - lr_sus_ep
            phase = math.pi * decay_epoch_index / decay_total_epochs
            lr = (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min
        return lr

    if plot:  # draw the learning-rate curve
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch')
        plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)  # wrap the schedule in a callback

In [None]:
# Build the learning-rate callback for the configured batch size and plot the schedule.
# NOTE(review): epochs is left at the function default (10) while CFG.epochs is 3 — confirm this is intended.
lr_cb = get_lr_callback(CFG.batch_size, plot=True)

In [31]:
# Checkpoint callback: save weights only, and only when val_log_loss reaches a new minimum.
ckpt_cb = keras.callbacks.ModelCheckpoint(
    filepath='best_model.weights.h5',
    monitor='val_log_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [32]:
# Categorical cross-entropy tracked as a metric named "log_loss".
log_loss = keras.metrics.CategoricalCrossentropy(name="log_loss")

In [33]:
# Input layers: every sample carries two sequences (index 0 and 1 on axis 1),
# one per response option.
inputs = {
    "token_ids": keras.Input(shape=(2, None), dtype=tf.int32, name="token_ids"),
    "padding_mask": keras.Input(shape=(2, None), dtype=tf.int32, name="padding_mask"),
}
backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
    CFG.preset,
)

# Embed the first option (prompt + response A) with the backbone.
response_a = {k: v[:, 0, :] for k, v in inputs.items()}
embed_a = backbone(response_a)

# Embed the second option (prompt + response B) with the same backbone.
response_b = {k: v[:, 1, :] for k, v in inputs.items()}
embed_b = backbone(response_b)

# Combine both embeddings and classify into the three outcomes.
embeds = keras.layers.Concatenate(axis=-1)([embed_a, embed_b])
embeds = keras.layers.GlobalAveragePooling1D()(embeds)
# The notebook sets the global "mixed_float16" policy; the softmax output layer
# is pinned to float32 so the probabilities and the loss are not computed in
# half precision.
outputs = keras.layers.Dense(3, activation="softmax", dtype="float32", name="classifier")(embeds)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer=keras.optimizers.Adam(5e-6),
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.02),
    metrics=[
        log_loss,
        keras.metrics.CategoricalAccuracy(name="accuracy"),
    ],
)

In [34]:
# Print the model summary.
model.summary()

In [None]:

# Start training: fit on train_ds, validate on valid_ds, with the LR schedule
# and best-weights checkpoint callbacks attached.
history = model.fit(
    train_ds,
    epochs=CFG.epochs,
    validation_data=valid_ds,
    callbacks=[lr_cb, ckpt_cb]
)

In [None]:

# Load the checkpoint from the same relative path the ModelCheckpoint callback
# writes to ('best_model.weights.h5'); the hard-coded '/kaggle/working/' prefix
# only resolved when the working directory happened to be that folder.
model.load_weights('best_model.weights.h5')

In [None]:

# Build the test dataset (no labels, no shuffling); the batch size is capped at
# the number of test rows.
test_texts = test_df.options.tolist()
test_ds = build_dataset(test_texts,
                        batch_size=min(len(test_df), CFG.batch_size),
                        shuffle=False)

In [None]:

# Run the model on the test dataset.
test_preds = model.predict(test_ds, verbose=1)

In [None]:
# Assemble the submission: the id column plus one prediction column per class
# name, written to submission.csv.
sub_df = test_df[["id"]].copy()
sub_df[CFG.class_names] = test_preds.tolist()
sub_df.to_csv("submission.csv", index=False)
sub_df.head()