In [12]:
import datasets
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import random
import json
import os

In [3]:
sciq = load_dataset("allenai/sciq")

In [4]:
def sciq_add_choices_to_question(sample:dict):
    answer_options = ["A", "B", "C", "D"]
    question_options = [sample[f"distractor{i}"] for i in range(1, 4)] + [sample["correct_answer"]]
    random.shuffle(question_options)
    answer = answer_options[question_options.index(sample["correct_answer"])]
    question_options = [f"{a}. " + q for a, q in zip(answer_options, question_options)]
    sample["question"] = "Question: " + sample["question"] +"\n\nOptions:\n" + "\n".join(question_options) +"\n\nAnswer:"
    sample["answer"] = answer
    sample["subject"] = None
    for useless_key in [f"distractor{i}" for i in range(1, 4)] + ["correct_answer", "support"]:
        sample.pop(useless_key)
    return sample

for split in ["train", "validation", "test"]:
    sciq[split] = sciq[split].map(sciq_add_choices_to_question)
    sciq[split].to_json(os.path.join("project-code-2024", "datasets", f"mcqa_sciq_{split}.jsonl"))

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
def mmlu_add_choices_to_question(sample:dict)->dict:
    answer_options = ["A", "B", "C", "D"]
    question_options = [f"{a}. " + q for a, q in zip(answer_options, sample["choices"])]
    sample["question"] = "Question: " + sample["question"] +"\n\nOptions:\n" + "\n".join(question_options) +"\n\nAnswer:"
    sample['answer'] = answer_options[sample['answer']]
    for useless_key in ["choices"]:
        sample.pop(useless_key)
    return sample

relevant_subjects = ['abstract_algebra', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning']
all_subjects_data = {
    "test":[],
    "validation":[],
    "dev":[]
}
for subject in relevant_subjects:
    mmlu_subject = load_dataset("cais/mmlu", subject)
    for split in ["test", "validation", "dev"]:
        # print(f"{mmlu_subject[split]['answer'][0]=}")
        # mmlu_subject[split] = mmlu_subject[split].map(mmlu_add_choices_to_question)
        # print(f"{mmlu_subject[split]['answer'][0]=}")
        all_subjects_data[split].append(mmlu_subject[split].map(mmlu_add_choices_to_question))

for split in ["test", "validation", "dev"]:
    concatenate_datasets(all_subjects_data[split]).to_json(os.path.join("project-code-2024", "datasets", f"mcqa_mmlu_{split}.jsonl"))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
ai2_arc = load_dataset('allenai/ai2_arc', 'ARC-Challenge')

In [17]:
def sciq_add_choices_to_question(sample:dict):
    question_options = [f"{a}. " + q for a, q in zip(sample['choices']['label'], sample['choices']['text'])]
    sample["question"] = "Question: " + sample["question"] +"\n\nOptions:\n" + "\n".join(question_options) +"\n\nAnswer:"
    sample["answer"] = sample["answerKey"]
    sample["subject"] = None
    for useless_key in ["answerKey", "choices", "id"]:
        sample.pop(useless_key)
    return sample

for split in ["train", "validation", "test"]:
    ai2_arc[split] = ai2_arc[split].map(sciq_add_choices_to_question)
    ai2_arc[split].to_json(os.path.join("project-code-2024", "datasets", f"mcqa_ai2_arc_{split}.jsonl"))

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/1172 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [21]:
# !git clone git@github.com:idrori/stemQ.git

In [12]:
stemQ_files = []
for root, dirs, files in os.walk("/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data"):
    for file in files:
        if file.endswith('.json'):
            print(os.path.join(root, file))
            stemQ_files.append(os.path.join(root, file))

/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_08.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_04.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_12.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_13.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_05.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_09.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_02.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_14.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_18.json
/Users/wesleymonteith/code/project-m2-2024-chatbots-r-us/stemQ/data/8.04/8.04_Question_22.json
/Users/wesleymonteith/code/project-m2-2024-chatbot

In [14]:
stemQ = load_dataset("json", data_files=stemQ_files)

Resolving data files:   0%|          | 0/667 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [22]:
relevant_courses = ['Calculus II', 'Calculus for Wharton Students',
'Computer Architecture',
'Fundamentals of Physics I', 'Hydrodynamics',
'Intermediate Calculus', 'Introduction to Machine Learning',
'Introduction to Probability',
'Mathematics for Materials Scientists and Engineers',
'Nonlinear Dynamics I: Chaos',
'Probability',
'Probability and Random Variables', 'Quantum Physics',
'Signals and Systems', 'Statistical Thinking and Data Analysis',
'Theory of Numbers',
'Unified Engineering 1 and 2', 'Unified Engineering 3 and 4']

stemQ["train"] = stemQ["train"].filter(lambda sample: sample["Course"] in relevant_courses)

In [28]:
def stemQ_add_choices_to_question(sample:dict):
    sample["question"] = sample["Original question"]
    sample["answer"] = sample["Solution"]
    sample["subject"] = sample["Course"]
    for useless_key in ['Course', 'Topic', 'Original question', 'Solution']:
        sample.pop(useless_key)
    return sample

stemQ["train"] = stemQ["train"].map(stemQ_add_choices_to_question)

Map:   0%|          | 0/441 [00:00<?, ? examples/s]

In [30]:
stemQ["train"].to_json(os.path.join("project-code-2024", "datasets", "sft_stemQ_train.jsonl"))

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

160591