In [1]:
import pandas as pd
import numpy as np


In [2]:
from copy import deepcopy
def sanitize_df_new(df):
    """Drop rows that cannot be used as five-option multiple-choice questions.

    A row is kept only when ``prompt`` and every option column ``A``-``E`` are
    non-null and ``answer`` is exactly one of the letters ``A``-``E``.
    The shape of the frame is printed before and after filtering.

    Parameters:
    df (pd.DataFrame): Frame containing at least the columns ``prompt``,
        ``A``, ``B``, ``C``, ``D``, ``E`` and ``answer``. It is not modified.

    Returns:
    pd.DataFrame: New frame holding the surviving rows, with the same columns
        and a fresh 0..n-1 index.

    Raises:
    KeyError: If one of the required columns is missing.
    """
    required_columns = ['prompt', 'A', 'B', 'C', 'D', 'E']
    valid_answers = ['A', 'B', 'C', 'D', 'E']

    print(f"df shape before sanitize: {df.shape}")

    # dropna returns a new frame, so the caller's DataFrame stays untouched
    # without needing an up-front deep copy.
    df = df.dropna(subset=required_columns)

    # isin yields a boolean mask regardless of how many rows are left, so the
    # filter also works when the frame is empty.
    df = df[df['answer'].isin(valid_answers)].reset_index(drop=True)

    print(f"df shape after sanitize: {df.shape}")

    return df

# shuffle_answer_key randomly permutes the option order of every question
# during preprocessing while keeping the answer label correct, so that a model
# cannot learn to rely on a fixed option position.
def shuffle_answer_key(df, seed=None):
    """Shuffle options A-E within each row and remap ``answer`` accordingly.

    For every row the five option texts are permuted at random and written
    back into columns ``A``-``E``; ``answer`` is updated to the letter under
    which the originally correct option now sits. The answer-letter
    distribution is printed before and after shuffling.

    Parameters:
    df (pd.DataFrame): Frame containing the columns ``A``-``E`` and
        ``answer`` (each answer must be one of the letters A-E). It is not
        modified.
    seed (int | None): If given, a private ``random.Random(seed)`` is used so
        the shuffle is reproducible. If None (default), the global ``random``
        generator is used, so a ``random.seed(...)`` set by the caller is
        still honored.

    Returns:
    pd.DataFrame: New frame with the same columns as ``df`` and a fresh
        0..n-1 index.

    Raises:
    KeyError: If a required column is missing or an answer is not in A-E.
    """
    # Imported here so the function does not depend on another notebook cell
    # having imported ``random`` first.
    import random

    option_keys = list("ABCDE")
    key2idx = {key: idx for idx, key in enumerate(option_keys)}
    n_options = len(option_keys)

    rng = random if seed is None else random.Random(seed)

    shuffled_df = df.copy()
    print(f"Answer Key Distribution Before Shuffling: {shuffled_df.answer.value_counts().sort_index()}")

    option_rows = shuffled_df[option_keys].to_numpy(dtype=object)
    old_answers = shuffled_df["answer"].tolist()

    new_rows = []
    new_answers = []
    for row, answer in zip(option_rows, old_answers):
        # order[new_pos] is the original position of the option that ends up
        # at new_pos.
        order = rng.sample(range(n_options), n_options)
        new_rows.append([row[src] for src in order])
        # Track the correct option by position rather than by comparing text,
        # so duplicate or NaN option texts cannot break the remapping.
        new_answers.append(option_keys[order.index(key2idx[answer])])

    for pos, key in enumerate(option_keys):
        shuffled_df[key] = [row[pos] for row in new_rows]
    shuffled_df["answer"] = new_answers

    # Keep the caller's column order and hand back a clean positional index.
    shuffled_df = shuffled_df[df.columns].reset_index(drop=True)

    print(f"Answer Key Distribution After Shuffling: {shuffled_df.answer.value_counts().sort_index()}")
    return shuffled_df


In [4]:
import pandas as pd
import json
from typing import Dict
import random

def _attach_support(df, support_path, split_name):
    """Add a ``support`` column by looking up each row's ``id`` in a JSON file.

    ``df`` is modified in place and also returned. ``split_name`` is only used
    in the assertion message.
    """
    with open(support_path, 'r', encoding='utf-8') as f:
        support_dict = json.load(f)

    # JSON object keys are always strings, so ids are cast to str before the
    # lookup; otherwise numeric ids would never match.
    df['id'] = df['id'].astype(str)
    df['support'] = df['id'].map(support_dict)
    assert df['support'].isna().sum() == 0, f"Support is missing/invalid in {split_name} data."
    return df


def load_and_prepare_data(cfg: Dict):
    """
    Load training and validation datasets and their support information,
    then prepare them for fine-tuning a language model.

    The training set is sanitized with ``sanitize_df_new`` and its options are
    shuffled with ``shuffle_answer_key``; the validation set is only reduced
    to the expected columns. Both sets get a ``support`` column looked up by
    ``id`` from a JSON file, and ``id`` is renamed to ``question_id``.

    Parameters:
    cfg (Dict): Configuration dictionary with the keys ``train_dataset_path``,
        ``train_support_path``, ``valid_dataset_path`` and
        ``valid_support_path``.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: Prepared training and validation DataFrames.

    Raises:
    AssertionError: If any row has no entry in its support file.
    """
    # ------- Load and prepare training data ---------------------------------------------#
    train_df = pd.read_csv(cfg['train_dataset_path'])
    print(f"shape of train data: {train_df.shape}")

    train_df = sanitize_df_new(train_df)
    train_df = shuffle_answer_key(train_df)
    train_df = _attach_support(train_df, cfg['train_support_path'], "training")

    # ------- Load and prepare validation data -------------------------------------------#
    valid_df = pd.read_csv(cfg['valid_dataset_path'])
    valid_df = valid_df[['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']].copy()
    valid_df = _attach_support(valid_df, cfg['valid_support_path'], "validation")

    print(f"shape of train data: {train_df.shape}")
    print(f"shape of valid data: {valid_df.shape}")

    train_df = train_df.rename(columns={"id": "question_id"})
    valid_df = valid_df.rename(columns={"id": "question_id"})

    return train_df, valid_df

# Configuration dictionary with the dataset and support-file locations.
# These are relative paths, resolved against the current working directory.

cfg = {
    "train_dataset_path": "../../data/mcq_mix_v6/train_mix_mcq.csv",
    "train_support_path": "../../data/mcq_mix_v6/id2context_k2_train.json",
    "valid_dataset_path": "../../data/mcq_mix_v6/valid_mix_mcq.csv",
    "valid_support_path": "../../data/mcq_mix_v6/id2context_k2_valid.json",
}


# Build the prepared training and validation frames from the paths in cfg.
train_df_mix6, valid_df_mix6 = load_and_prepare_data(cfg)


shape of train data: (15982, 8)
df shape before sanitize: (15982, 8)
df shape after sanitize: (15982, 8)
Answer Key Distribution Before Shuffling: A    5563
B    3723
C    3373
D    1936
E    1387
Name: answer, dtype: int64
Answer Key Distribution After Shuffling: A    3223
B    3123
C    3250
D    3224
E    3162
Name: answer, dtype: int64
shape of train data: (15982, 9)
shape of valid data: (300, 9)


In [5]:
# Save the prepared train/valid frames as CSV (without the index column):
# mix_6_train_total.csv and mix_6_valid_total.csv.
train_df_mix6.to_csv('../../data/mix_6_train_total.csv', index=False)
valid_df_mix6.to_csv('../../data/mix_6_valid_total.csv', index=False)

In [6]:
# Reload the prepared frames from the CSV files.
# NOTE(review): read_csv re-infers dtypes, so columns such as question_id may
# not come back as str — confirm nothing downstream relies on that.
train6 = pd.read_csv('../../data/mix_6_train_total.csv')
valid6 = pd.read_csv('../../data/mix_6_valid_total.csv')

In [7]:
import json

# def convert_to_xtuner_format(data):
#     conversations = []
#     for _, row in data.iterrows():
#         question_id = row['question_id']
#         question = row['prompt']
#         support = row['support']
#         options = {
#             "A": row['A'],
#             "B": row['B'],
#             "C": row['C'],
#             "D": row['D'],
#             "E": row['E'],
#         }
#         answer = row['answer']
        
#         system_prompt = f"You are an AI assistant that helps users answer multiple-choice questions based on the provided context and options."
        
#         prompt_suffix = "".join([f"{letter}: {options[letter]}\n\n" for letter in "ABCDE"])
#         input_prompt = f"Context: {support}\n\nQuestion: {question}\n\nOptions:\n{prompt_suffix}\nAnswer:"
        
#         output_prompt = f"{answer}"
        
#         conversation = {
#             "system": system_prompt,
#             "input": input_prompt,
#             "output": output_prompt
#         }
#         conversations.append({"conversation": [conversation]})
    
#     return conversations
import json

def convert_to_xtuner_format(data):
    """Convert multiple-choice rows into single-turn xtuner conversations.

    Each row becomes ``{"conversation": [{"system", "input", "output"}]}``:
    ``system`` is a fixed system prompt, ``input`` is a fixed instruction
    header followed by the row's context, question and options A-E, and
    ``output`` is the row's answer rendered as a string.

    Parameters:
    data (pd.DataFrame): Frame containing the columns ``prompt``, ``support``,
        ``A``, ``B``, ``C``, ``D``, ``E`` and ``answer``.

    Returns:
    list[dict]: One conversation dict per row, in row order (empty list for an
        empty frame).

    Raises:
    KeyError: If one of the required columns is missing.
    """
    # These strings are identical for every row, so they are built once.
    system_prompt = "You are an AI assistant that helps users answer multiple-choice questions based on the provided context and options. The context are to support your decision-making process. The options are the possible answers to the question. Select only one option with only one letter from (A, B, C, D, or E) as your answer."

    instruction = "Carefully analyze the question and options below. Choose the most appropriate option based on the provided context. Respond with only the letter (A, B, C, D, or E) corresponding to your answer choice."

    input_header = f"Below is an instruction that describes a task, paired with an input that provides further context that helps you select one of the most correct option from the given options for the given question. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n"

    conversations = []
    for _, row in data.iterrows():
        prompt_suffix = "".join(f"{letter}: {row[letter]}\n\n" for letter in "ABCDE")
        input_prompt = f"Context: {row['support']}\n\nQuestion: {row['prompt']}\n\nOptions:\n{prompt_suffix}\nAnswer:"

        conversation = {
            "system": system_prompt,
            "input": input_header + input_prompt,
            "output": f"{row['answer']}",
        }
        conversations.append({"conversation": [conversation]})

    return conversations

# Example usage (assumes your DataFrame is named `train3`):
# xtuner_data = convert_to_xtuner_format(train3)

# with open("xtuner_data.json", "w") as f:
#     json.dump(xtuner_data, f, indent=2)

In [8]:
# Convert the reloaded training frame to xtuner conversations and write them
# to disk as indented JSON.
xtuner_data_train6 = convert_to_xtuner_format(train6)

with open("../../data/xtuner_data_train6.json", "w") as f:
    json.dump(xtuner_data_train6, f, indent=2)
    
# Same conversion and export for the validation frame.
xtuner_data_valid6 = convert_to_xtuner_format(valid6)

with open("../../data/xtuner_data_valid6.json", "w") as f:
    json.dump(xtuner_data_valid6, f, indent=2)

In [18]:
# xtuner_data

In [9]:
# Sanity check: print only the "input" field of the first training conversation.
# json.dumps renders the string as a JSON literal, so newlines show up as \n escapes.
print(json.dumps(xtuner_data_train6[0]['conversation'][0]['input'], indent=2))
# Alternative: dump the whole first conversation record.
# print(json.dumps(xtuner_data_train6[0], indent=2))

"Below is an instruction that describes a task, paired with an input that provides further context that helps you select one of the most correct option from the given options for the given question. Write a response that appropriately completes the request.\n\n### Instruction:\nCarefully analyze the question and options below. Choose the most appropriate option based on the provided context. Respond with only the letter (A, B, C, D, or E) corresponding to your answer choice.\n\n### Input:\nContext: Newtonian gravitational theory | Modern priority controversy | Since the time of Newton and Hooke, scholarly discussion has also touched on the question of whether Hooke's 1679 mention of 'compounding the motions' provided Newton with something new and valuable, even though that was not a claim actually voiced by Hooke at the time. As described above, Newton's manuscripts of the 1660s do show him actually combining tangential motion with the effects of radially directed force or endeavour, f