In [45]:
import json
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

# Change the working directory to the root of the GitHub repository
notebook_dir = os.getcwd()
# Step up one level only when the kernel was started inside a folder that is
# literally named "ranking". A substring test ("ranking" in path) would also
# fire for unrelated paths that merely contain that text (for example a parent
# folder called "ranking_dataset") and would climb one level further on every
# re-run of this cell.
if os.path.basename(os.path.normpath(notebook_dir)) == "ranking":
    os.chdir(os.path.dirname(os.path.abspath(notebook_dir)))
    print(f"Changed working directory to: {os.getcwd()}")

# Load JSONL files
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (typically a trailing newline at end of file) hold no
        # record; skip them rather than letting json.loads fail on them.
        return [json.loads(line) for line in file if line.strip()]

# Dataset locations (relative to the current working directory)
train_file_path = 'ranking_dataset/mistral-base-train-1108935.jsonl'
test_file_path = 'ranking_dataset/mistral-base-test-1096727.jsonl'

# Parse both JSONL files into lists of records
train_data = load_jsonl(train_file_path)
test_data = load_jsonl(test_file_path)

# Hold out 20% of the training records as a validation set; the fixed
# random_state keeps the split reproducible between runs
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Report how many records ended up in each split
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

# Show the schema of the first record, and of its first candidate, for the
# training split and then for the test split
for split in (train_data, test_data):
    first_record = split[0]
    print("Keys in data:", first_record.keys())
    print("keys in candidates:", first_record['candidates'][0].keys())

# Align the test schema with the training schema: any test record that still
# carries the key 'input' gets it renamed to 'input_token' (in place)
for record in test_data:
    if 'input' not in record:
        continue
    record['input_token'] = record.pop('input')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training data size: 5856
Validation data size: 1464
Test data size: 1319
Keys in data: dict_keys(['candidates', 'input_token', 'attention_mask', 'answer_scores', 'answer', 'is_correct'])
keys in candidates: dict_keys(['text', 'answer', 'answer_span', 'score', 'output_tokens'])
Keys in data: dict_keys(['candidates', 'input_token', 'attention_mask', 'answer_scores', 'answer', 'is_correct'])
keys in candidates: dict_keys(['text', 'answer', 'answer_span', 'score', 'output_tokens'])


In [46]:
def format_dataset(data):
    """Convert raw ranking records into a three-column DataFrame.

    Every record contributes one row:

    * ``input_tokens``: the record's ``'input_token'`` value, or its
      ``'input'`` value when the record still uses that key name.
    * ``output_tokens``: the ``'output_tokens'`` of each candidate, in
      candidate order.
    * ``scores``: one integer per candidate, 1 when the candidate's
      ``'answer'`` equals the record's ``'answer'`` and 0 otherwise.

    Args:
        data: Iterable of dicts holding ``'candidates'`` (a list of dicts with
            ``'output_tokens'`` and ``'answer'``), ``'answer'``, and either
            ``'input_token'`` or ``'input'``.

    Returns:
        pd.DataFrame: Columns ``input_tokens``, ``output_tokens`` and
        ``scores`` with one row per record (no rows when ``data`` is empty).

    Raises:
        KeyError: If a record lacks a required key, including the case where
            it has neither ``'input_token'`` nor ``'input'``.
    """
    formatted_dataset = {
        "input_tokens": [],
        "output_tokens": [],
        "scores": []
    }
    for entry in data:
        # Accept both spellings of the input key so this function does not
        # rely on the caller having renamed 'input' in place beforehand.
        if 'input_token' in entry:
            input_tokens = entry['input_token']
        elif 'input' in entry:
            input_tokens = entry['input']
        else:
            raise KeyError("record has neither 'input_token' nor 'input'")
        candidates = entry['candidates']
        answer = entry['answer']

        output_tokens_list = [candidate['output_tokens'] for candidate in candidates]
        # Binary relevance label: does this candidate reach the reference answer?
        scores_list = [1 if candidate['answer'] == answer else 0 for candidate in candidates]

        formatted_dataset["input_tokens"].append(input_tokens)
        formatted_dataset["output_tokens"].append(output_tokens_list)
        formatted_dataset["scores"].append(scores_list)

    return pd.DataFrame(formatted_dataset)

# Build one formatted DataFrame per split (train, validation, test)
df_train, df_validation, df_test = (
    format_dataset(split) for split in (train_data, val_data, test_data)
)

# Preview the first formatted training row and report the number of rows
print(f"Example formatted dataset entry: {df_train.iloc[0]}")
print(f"Total formatted dataset size: {len(df_train)}")



Example formatted dataset entry: input_tokens     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
output_tokens    [[28705, 28770, 28734, 28734, 13, 13, 28824, 2...
scores           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
Name: 0, dtype: object
Total formatted dataset size: 5856


In [None]:
from ranking.xgboost_ranker import XGBoostRanker, RankingMetrics

# Validation split, packed as the (inputs, outputs, scores) triple that is
# passed to train() as its fourth argument
validation_split = (
    df_validation['input_tokens'].tolist(),
    df_validation['output_tokens'].tolist(),
    df_validation['scores'].tolist(),
)

# Create a fresh ranker and fit it on the training split
ranker = XGBoostRanker()
ranker.train(
    df_train['input_tokens'].tolist(),
    df_train['output_tokens'].tolist(),
    df_train['scores'].tolist(),
    validation_split,
)

# Run the fitted ranker over the test split
test_inputs = df_test['input_tokens'].tolist()
test_outputs = df_test['output_tokens'].tolist()
predictions = ranker.predict(test_inputs, test_outputs)

# Flatten the per-record groups of labels and predictions into two flat lists
# before handing them to the metrics helper
true_scores_flat = []
for group in df_test['scores']:
    true_scores_flat.extend(group)
predictions_flat = []
for group in predictions:
    predictions_flat.extend(group)

metrics = RankingMetrics.calculate_metrics(true_scores_flat, predictions_flat)

print("Metrics:", metrics)


ImportError: cannot import name 'hyperparameter_search' from 'ranking.xgboost_ranker' (/Users/bowensong/Github/CoT-Rec/ranking/xgboost_ranker.py)

In [None]:
from cot_dataset.cot_decoding.task import GSMTask
# NOTE(review): `task` is not referenced anywhere else in this cell - confirm
# whether it is still needed.
task = GSMTask(encode_format='qa')

# zip() stops at the shorter of its inputs, so a stale or partial
# `predictions` would silently produce an accuracy over a misaligned subset.
# Fail loudly instead.
assert len(predictions) == len(test_data), (
    f"predictions ({len(predictions)}) and test_data ({len(test_data)}) differ in length"
)

correct = []   # per record: does the highest-scored candidate match the reference answer?
original = []  # per record: does the first candidate (index 0) match the reference answer?
for entry, pred in zip(test_data, predictions):
    candidates = entry['candidates']
    # np.argmax selects the candidate with the largest predicted value
    correct.append(candidates[np.argmax(pred)]['answer'] == entry['answer'])
    original.append(candidates[0]['answer'] == entry['answer'])

# First number: accuracy when picking the ranker's top candidate.
# Second number: accuracy when always picking candidate 0.
print(sum(correct) / len(correct))
print(sum(original) / len(original))

0.3078089461713419
0.23351023502653526


In [None]:
# Candidate values for each hyperparameter handed to the search below
param_grid = dict(
    eta=[0.1, 0.3],
    max_depth=[3, 5, 10],
    min_child_weight=[1, 3],
    subsample=[0.8, 1.0],
    objective=["rank:pairwise", "rank:ndcg", "rank:map"],
    eval_metric=["map@1", "ndcg@1"],
)

# Pack each split as an (input_tokens, output_tokens, scores) triple of lists
split_columns = ("input_tokens", "output_tokens", "scores")
train_split = tuple(df_train[column].tolist() for column in split_columns)
validation_split = tuple(df_validation[column].tolist() for column in split_columns)

# Run the search on a fresh ranker: training triple, validation triple, grid
ranker = XGBoostRanker()
ranker.hyperparameter_search(train_split, validation_split, param_grid)

Using CPU for training.

Trying params: {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8, 'objective': 'rank:pairwise', 'eval_metric': 'map@1'}
[0]	validation-map@1:0.28279
[1]	validation-map@1:0.30533
[2]	validation-map@1:0.30601
[3]	validation-map@1:0.31079
[4]	validation-map@1:0.30396
[5]	validation-map@1:0.31626
[6]	validation-map@1:0.32240
[7]	validation-map@1:0.32240
[8]	validation-map@1:0.31148
[9]	validation-map@1:0.31626
Validation NDCG: 0.8695

Trying params: {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8, 'objective': 'rank:pairwise', 'eval_metric': 'ndcg@1'}
[0]	validation-ndcg@1:0.28279
[1]	validation-ndcg@1:0.30533
[2]	validation-ndcg@1:0.30601
[3]	validation-ndcg@1:0.31079
[4]	validation-ndcg@1:0.30396
[5]	validation-ndcg@1:0.31626
[6]	validation-ndcg@1:0.32240
[7]	validation-ndcg@1:0.32240
[8]	validation-ndcg@1:0.31148
[9]	validation-ndcg@1:0.31626
Validation NDCG: 0.8695

Trying params: {'eta': 0.1, 'max_depth': 3, 'min_child_w

In [None]:
from cot_dataset.cot_decoding.task import GSMTask

# Evaluate the model
predictions = ranker.predict(
    df_test['input_tokens'].tolist(),
    df_test['output_tokens'].tolist()
)

# Flatten the per-record groups of labels and predictions before computing metrics
true_scores_flat = [score for group in df_test['scores'] for score in group]
predictions_flat = [pred for group in predictions for pred in group]
metrics = RankingMetrics.calculate_metrics(true_scores_flat, predictions_flat)

print("Metrics:", metrics)

# NOTE(review): `task` is not referenced anywhere else in this cell - confirm
# whether it is still needed.
task = GSMTask(encode_format='qa')

# zip() stops at the shorter of its inputs, so a mismatch between
# `predictions` and `test_data` would silently produce an accuracy over a
# misaligned subset. Fail loudly instead.
assert len(predictions) == len(test_data), (
    f"predictions ({len(predictions)}) and test_data ({len(test_data)}) differ in length"
)

correct = []   # per record: does the highest-scored candidate match the reference answer?
original = []  # per record: does the first candidate (index 0) match the reference answer?
for entry, pred in zip(test_data, predictions):
    candidates = entry['candidates']
    # np.argmax selects the candidate with the largest predicted value
    correct.append(candidates[np.argmax(pred)]['answer'] == entry['answer'])
    original.append(candidates[0]['answer'] == entry['answer'])

# First number: accuracy when picking the ranker's top candidate.
# Second number: accuracy when always picking candidate 0.
print(sum(correct) / len(correct))
print(sum(original) / len(original))