In [2]:
import json
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

# Change the working directory to the root of the GitHub repository
notebook_dir = os.getcwd()
if "ranking" in notebook_dir:
    os.chdir(os.path.dirname(os.path.abspath(notebook_dir)))
    print(f"Changed working directory to: {os.getcwd()}")

# Load JSONL files
def load_jsonl(file_path):
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]

# File paths
train_file_path = 'ranking_dataset/mistral-base-train-1108935.jsonl'
test_file_path = 'ranking_dataset/mistral-base-test-1096727.jsonl'

# Load data
train_data = load_jsonl(train_file_path)
test_data = load_jsonl(test_file_path)

# Prepare train, validation, and test splits
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Print dataset sizes
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")
# Display all the keys in the dataset
print("Keys in data:", train_data[0].keys())
print("keys in candidates:", train_data[0]['candidates'][0].keys())

print("Keys in data:", test_data[0].keys())
print("keys in candidates:", test_data[0]['candidates'][0].keys())

### change a key.
# Update the key 'input' to 'input_token' in test_data
for entry in test_data:
    if 'input' in entry:
        entry['input_token'] = entry.pop('input')


Changed working directory to: /Users/bowensong/Github/CoT-Rec
Training data size: 5856
Validation data size: 1464
Test data size: 1319
Keys in data: dict_keys(['candidates', 'input_token', 'attention_mask', 'answer_scores', 'answer', 'is_correct'])
keys in candidates: dict_keys(['text', 'answer', 'answer_span', 'score', 'output_tokens'])
Keys in data: dict_keys(['candidates', 'input_token', 'attention_mask', 'answer_scores', 'answer', 'is_correct'])
keys in candidates: dict_keys(['text', 'answer', 'answer_span', 'score', 'output_tokens'])


In [17]:
def format_dataset(data):
    formatted_dataset = {
        "input_tokens": [],
        "output_tokens": [],
        "scores": []
    }
    for entry in data:
        input_tokens = entry['input_token']
        candidates = entry['candidates']
        answer = entry['answer']

        output_tokens_list = [candidate['output_tokens'] for candidate in candidates]
        scores_list = [1 if candidate['answer'] == answer else 0 for candidate in candidates]

        formatted_dataset["input_tokens"].append(input_tokens)
        formatted_dataset["output_tokens"].append(output_tokens_list)
        formatted_dataset["scores"].append(scores_list)

    return pd.DataFrame(formatted_dataset)

df_train = format_dataset(train_data)
df_validation = format_dataset(val_data)
df_test = format_dataset(test_data)

# Example output
print(f"Example formatted dataset entry: {df_train.iloc[0]}")
print(f"Total formatted dataset size: {len(df_train)}")

print(df_train['scores'].tolist())

Example formatted dataset entry: input_tokens     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
output_tokens    [[28705, 28770, 28734, 28734, 13, 13, 28824, 2...
scores           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
Name: 0, dtype: object
Total formatted dataset size: 5856
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [36]:
from ranking.xgboost_ranker import XGBoostRanker,RankingMetrics

# Initialize the XGBoost ranker
ranker = XGBoostRanker(objective='rank:map',eval_metric='map@1')
# Train the model
ranker.train(
    df_train['input_tokens'].tolist(),
    df_train['output_tokens'].tolist(),
    df_train['scores'].tolist(),
    (df_validation['input_tokens'].tolist(),
    df_validation['output_tokens'].tolist(),
    df_validation['scores'].tolist())
)
# Evaluate the model
predictions = ranker.predict(
    df_test['input_tokens'].tolist(),
    df_test['output_tokens'].tolist()
)

true_scores_flat = [score for group in df_test['scores'] for score in group]
predictions_flat = [pred for group in predictions for pred in group]
metrics = RankingMetrics.calculate_metrics(true_scores_flat, predictions_flat)

print("Metrics:", metrics)
acc = [entry['candidates'][np.argmax(pred)]['answer'] == entry['answer'] for pred, entry in zip(predictions, test_data)]
print("Accuracy:", np.mean(acc))

Using CPU for training.
[0]	validation-map@1:0.30874
[1]	validation-map@1:0.32514
[2]	validation-map@1:0.32514
[3]	validation-map@1:0.33402
[4]	validation-map@1:0.33607
[5]	validation-map@1:0.34085
[6]	validation-map@1:0.34426
[7]	validation-map@1:0.34016
[8]	validation-map@1:0.34221
[9]	validation-map@1:0.34563
[10]	validation-map@1:0.34904
[11]	validation-map@1:0.33675
[12]	validation-map@1:0.34563
[13]	validation-map@1:0.34563
[14]	validation-map@1:0.34631
[15]	validation-map@1:0.34973
[16]	validation-map@1:0.34904
[17]	validation-map@1:0.34973
[18]	validation-map@1:0.34085
[19]	validation-map@1:0.33607
[20]	validation-map@1:0.33402
[21]	validation-map@1:0.33333
[22]	validation-map@1:0.33060
[23]	validation-map@1:0.33607
[24]	validation-map@1:0.33197
[25]	validation-map@1:0.33333
[26]	validation-map@1:0.33402
[27]	validation-map@1:0.33675
[28]	validation-map@1:0.33811
[29]	validation-map@1:0.34153
[30]	validation-map@1:0.34085
[31]	validation-map@1:0.33811
[32]	validation-map@1:0.34

In [34]:
param_grid = {
    "eta": [0.01, 0.1],
    "max_depth": [3, 6, 10],
    "min_child_weight": [1, 3],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "lambda": [1, 5],
    "alpha": [0, 1],
    "objective": ["rank:pairwise", "rank:map"],
    "eval_metric": ["map@1"]
}



ranker = XGBoostRanker()
ranker,_ = ranker.hyperparameter_search(
        (df_train['input_tokens'].tolist(),
    df_train['output_tokens'].tolist(),
    df_train['scores'].tolist())
    , (df_validation['input_tokens'].tolist(),
    df_validation['output_tokens'].tolist(),
    df_validation['scores'].tolist())
    , param_grid)

Using CPU for training.

Trying params: {'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'lambda': 1, 'alpha': 0, 'objective': 'rank:pairwise', 'eval_metric': 'map@1'}
[0]	validation-map@1:0.28210
[1]	validation-map@1:0.29850
[2]	validation-map@1:0.30055
[3]	validation-map@1:0.31284
[4]	validation-map@1:0.31831
[5]	validation-map@1:0.31831
[6]	validation-map@1:0.32445
[7]	validation-map@1:0.31762
[8]	validation-map@1:0.31831
[9]	validation-map@1:0.32445
[10]	validation-map@1:0.32855
[11]	validation-map@1:0.32923
[12]	validation-map@1:0.32992
[13]	validation-map@1:0.33197
[14]	validation-map@1:0.33470
[15]	validation-map@1:0.33060
[16]	validation-map@1:0.33128
[17]	validation-map@1:0.33128
[18]	validation-map@1:0.33128
[19]	validation-map@1:0.33060
[20]	validation-map@1:0.32992
[21]	validation-map@1:0.33402
[22]	validation-map@1:0.33333
[23]	validation-map@1:0.33675
[24]	validation-map@1:0.33675
[25]	validation-map@1:0.33675
[26]	validation

TypeError: 'int' object is not subscriptable

In [37]:
from cot_dataset.cot_decoding.task import GSMTask
from collections import Counter

task = GSMTask(encode_format='qa')
max_prob = []
max_vote = []
max_probsum = []
top_recomm = []

for top_k in [1, 5, 10, 20, 50]:
    predictions = ranker.predict(
        df_test['input_tokens'].tolist(),
        [entry[:top_k] for entry in df_test['output_tokens']],
    )

    for entry, pred in zip(test_data,predictions):
        scores = []
        for can in entry['candidates'][:top_k]:
            scores.append((can['answer'],can['score']))
        max_prob.append(sorted(scores, key=lambda x: x[1], reverse=True)[0][0] == entry['answer'])

        # Count the occurrences of each answer, if there is a tie, take the one with the highest score
        votes = Counter([can['answer'] for can in entry['candidates'][:top_k]])
        most_common_vote = votes.most_common(1)[0]
        if most_common_vote[1] == 1:
            max_vote.append(sorted(scores, key=lambda x: x[1], reverse=True)[0][0] == entry['answer'])
        else:
            max_vote.append(most_common_vote[0] == entry['answer'])

        # Sum the scores for each answer
        score_sums = {}
        for can in entry['candidates'][:top_k]:
            score_sums[can['answer']] = score_sums[can['answer']]+can['score'] if can['answer'] in score_sums else can['score']
        max_probsum.append(max(score_sums, key=score_sums.get) == entry['answer'])

        top_recomm.append(entry['candidates'][np.argmax(pred)]['answer'] == entry['answer'])

    print(f"Top {top_k} recommendations:")
    # Calculate the accuracy of the top recommendation
    accuracy = sum(top_recomm) / len(top_recomm)
    print(f"Accuracy of the top recommendation: {accuracy:.2f}")
    # Calculate the accuracy of the max probability
    accuracy_max_prob = sum(max_prob) / len(max_prob)
    print(f"Accuracy of the max probability: {accuracy_max_prob:.2f}")
    # Calculate the accuracy of the max vote
    accuracy_max_vote = sum(max_vote) / len(max_vote)
    print(f"Accuracy of the max vote: {accuracy_max_vote:.2f}") 
    # Calculate the accuracy of the max probability sum
    accuracy_max_probsum = sum(max_probsum) / len(max_probsum)
    print(f"Accuracy of the max probability sum: {accuracy_max_probsum:.2f}")

Top 1 recommendations:
Accuracy of the top recommendation: 0.23
Accuracy of the max probability: 0.23
Accuracy of the max vote: 0.23
Accuracy of the max probability sum: 0.23
Top 5 recommendations:
Accuracy of the top recommendation: 0.26
Accuracy of the max probability: 0.26
Accuracy of the max vote: 0.28
Accuracy of the max probability sum: 0.30
Top 10 recommendations:
Accuracy of the top recommendation: 0.26
Accuracy of the max probability: 0.27
Accuracy of the max vote: 0.30
Accuracy of the max probability sum: 0.36
Top 20 recommendations:
Accuracy of the top recommendation: 0.27
Accuracy of the max probability: 0.28
Accuracy of the max vote: 0.34
Accuracy of the max probability sum: 0.43
Top 50 recommendations:
Accuracy of the top recommendation: 0.29
Accuracy of the max probability: 0.27
Accuracy of the max vote: 0.38
Accuracy of the max probability sum: 0.55
