In [2]:
from datasets import load_dataset, concatenate_datasets
from config import OPENAI_API_KEY
import os
import openai
from tqdm import tqdm
import csv
import re

In [6]:
# load dataset: every split of both ARC difficulty levels
ARC_easy_train, ARC_easy_test, ARC_easy_validation = (
    load_dataset('ai2_arc', 'ARC-Easy', split=split_name)
    for split_name in ('train', 'test', 'validation')
)

ARC_Challenge_train, ARC_Challenge_test, ARC_Challenge_validation = (
    load_dataset('ai2_arc', 'ARC-Challenge', split=split_name)
    for split_name in ('train', 'test', 'validation')
)

# We only want a subset of the data: 600 examples for training and 200 each
# for test and validation. Every subset takes half of its rows from ARC-Easy
# and half from ARC-Challenge.
# Combining the halves with `+` failed with "unsupported operand type(s) for +:
# 'Dataset' and 'Dataset'", hence concatenate_datasets.
random_seed = 42


def _half_and_half(easy_split, challenge_split, per_source):
    """Shuffle both splits with `random_seed`, keep the first `per_source`
    rows of each and concatenate them, ARC-Easy rows first."""
    return concatenate_datasets([
        source.shuffle(seed=random_seed).select(range(per_source))
        for source in (easy_split, challenge_split)
    ])


train_set = _half_and_half(ARC_easy_train, ARC_Challenge_train, 300)
test_set = _half_and_half(ARC_easy_test, ARC_Challenge_test, 100)
validation_set = _half_and_half(ARC_easy_validation, ARC_Challenge_validation, 100)

In [28]:
# Quick look at the sampled training subset and at its first example.
print(train_set)
first_example = train_set[0]
print(first_example['choices'])
print(first_example['question'])

# get the first idx whose answerKey is one of '1', '2', '3', '4'
def get_first_idx_with_answerKey_in_1234(dataset):
    """Return the index of the first example with a numeric answerKey.

    Args:
        dataset: indexable sequence of mappings that each have an 'answerKey'.

    Returns:
        The smallest index whose 'answerKey' is '1', '2', '3' or '4',
        or -1 when no example qualifies.
    """
    numeric_keys = ('1', '2', '3', '4')
    return next(
        (position for position in range(len(dataset))
         if dataset[position]['answerKey'] in numeric_keys),
        -1,
    )

# Find an example in the training subset that uses numeric labels; the bare
# expression makes the returned index the cell's displayed result.
get_first_idx_with_answerKey_in_1234(train_set)

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 600
})
{'text': ['aluminum', 'copper', 'glass', 'wood'], 'label': ['A', 'B', 'C', 'D']}
Which of the following materials would best slow the transfer of heat?


41

In [13]:
# Collect and print the distinct answerKey values that occur in the training subset
answerKey_set = {example['answerKey'] for example in train_set}
print(answerKey_set)

{'B', '1', 'C', '3', 'A', '2', 'D', '4'}


In [14]:
# Global variables for prompt
# PREFIX is the instruction header that process_dataset puts in front of every
# question. Its "Explanation:" and "Answer and Confidence (0-100):" lines
# describe the reply layout that process_dataset's regular expressions search
# for, so the two must be kept in sync when either is edited.
PREFIX = '''
###### Instructions ######
Read the following multiple-choice question, analyze step by step, select the correct option, and give the option letter (e.g., A or B) as your answer.
Use the following format to provide your answer and confidence level:
Explanation: [insert step-by-step analysis here]
Answer and Confidence (0-100): [Your answer, e.g., B, 1], [Your confidence level, e.g., 80]%
Note: The confidence level indicates how certain you are about your answer, expressed as a percentage.
'''
# The key is imported from the local `config` module instead of being written
# into the notebook.
openai.api_key = OPENAI_API_KEY

In [15]:
def get_last_processed_idx(checkpoint_file):
    """Read the resume position stored in a checkpoint file.

    Args:
        checkpoint_file: path of the text file written by set_checkpoint_idx.

    Returns:
        The integer on the file's first line, or 0 when the file does not
        exist or is empty.

    Raises:
        ValueError: if the first line is present but is not an integer.
    """
    if not os.path.exists(checkpoint_file):
        return 0

    with open(checkpoint_file, 'r') as handle:
        first_line = handle.readline()

    if not first_line:
        return 0
    return int(first_line.strip())
    
def set_checkpoint_idx(checkpoint_file, idx):
    """Overwrite the checkpoint file so that it contains only `idx`.

    Args:
        checkpoint_file: path of the text file to (re)write.
        idx: value to store; it is written as str(idx).
    """
    text = str(idx)
    with open(checkpoint_file, 'w') as handle:
        handle.write(text)

In [32]:
# Numeric labels that some ARC questions use instead of the letters A-D.
_NUMERIC_LABELS = ('1', '2', '3', '4')


def _as_letter(label):
    """Map a 1-based numeric label such as '3' to its letter ('C')."""
    return chr(ord('A') + int(label) - 1)


def _letter_labels(labels):
    """Return the choice labels as letters without touching the input list.

    A list whose first label is '1'-'4' is converted label by label; any
    other list is copied unchanged.
    """
    if labels and labels[0] in _NUMERIC_LABELS:
        return [_as_letter(label) for label in labels]
    return list(labels)


def _parse_model_output(output):
    """Split a model reply into (explanation, predicted_answer, confidence_level).

    The explanation is everything between "Explanation:" and the
    "Answer and Confidence" line (or the end of the reply), so multi-line
    explanations are kept whole. The answer line is accepted with or without
    square brackets around the letter and the number, and with or without the
    trailing percent sign. Parts that cannot be found are replaced by the
    placeholder strings "No explanation found.", "No answer found." and
    "No confidence level found.".
    """
    explanation_match = re.search(
        r'Explanation:\s*(.*?)(?=\n\s*Answer and Confidence|\Z)',
        output,
        re.DOTALL,
    )
    if explanation_match:
        explanation = explanation_match.group(1).strip()
    else:
        explanation = "No explanation found."

    answer_match = re.search(
        r'Answer and Confidence \(0-100\):\s*\[?([A-D])\]?\s*,\s*\[?(\d+)',
        output,
    )
    if answer_match:
        predicted_answer = answer_match.group(1)
        confidence_level = int(answer_match.group(2))
    else:
        predicted_answer = "No answer found."
        confidence_level = "No confidence level found."

    return explanation, predicted_answer, confidence_level


def process_dataset(dataset, csv_file_path, checkpoint_file):
    """Ask GPT-4 every question in `dataset` and append the results to a CSV.

    Processing starts at the index stored in `checkpoint_file` (0 if there is
    none). After each row is written the checkpoint is advanced, so a rerun
    continues where the previous one stopped. The first exception is printed
    and ends the loop; nothing is raised to the caller.

    Args:
        dataset: indexable collection of rows with 'question', 'choices'
            ({'text': [...], 'label': [...]}) and 'answerKey'. Labels and
            answer keys given as '1'-'4' are converted to 'A'-'D'.
        csv_file_path: CSV file to append to; the header row is written when
            the file is empty.
        checkpoint_file: text file holding the next index to process.
    """
    start_idx = get_last_processed_idx(checkpoint_file)
    print(f"Starting from index {start_idx}")
    for idx in tqdm(range(start_idx, len(dataset))):
        try:
            row = dataset[idx]
            question = row['question']
            answer_key = row['answerKey']

            # Work on a copy so that the caller's row is never modified; the
            # key order ('text', 'label') is preserved for the CSV column.
            choices = dict(row['choices'])
            choices['label'] = _letter_labels(choices['label'])

            if answer_key in _NUMERIC_LABELS:
                answer_key = _as_letter(answer_key)

            formatted_options = [f"{label}. {option}" for label, option in zip(choices['label'], choices['text'])]

            question_input = f"###### Question ######\n{question}\n" + "\n".join(formatted_options)
            prompt = PREFIX + f"{question_input}"

            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a chatbot trained to answer multiple-choice questions."},
                    {"role": "user", "content": prompt},
                ]
            )

            output = response['choices'][0]['message']['content'].strip()
            explanation, predicted_answer, confidence_level = _parse_model_output(output)

            with open(csv_file_path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                # A brand-new (empty) file gets the header first.
                if os.path.getsize(csv_file_path) == 0:
                    writer.writerow(['id', 'question', 'choices', 'answerKey', 'predicted_answer', 'confidence_level', 'explanation'])
                writer.writerow([idx, question, choices, answer_key, predicted_answer, confidence_level, explanation])

            # Only advance once the row is safely on disk.
            set_checkpoint_idx(checkpoint_file, idx + 1)

        except Exception as e:
            # Stop at the first failure; the checkpoint lets a rerun resume here.
            print(f"An error occurred at index {idx}: {e}")
            break

In [3]:
# Per-split output locations: the CSV that collects model answers and the
# checkpoint file that remembers how far processing got.
train_csv_output_path = 'train.csv'
train_checkpoint_file = 'train_checkpoint.txt'

validation_csv_output_path = 'validation.csv'
validation_checkpoint_file = 'validation_checkpoint.txt'

In [34]:
# Query the model for every example of the training subset; progress is
# checkpointed in train_checkpoint_file, so rerunning this cell resumes
# instead of starting over.
process_dataset(train_set, train_csv_output_path, train_checkpoint_file)

Starting from index 0


100%|██████████| 600/600 [1:17:37<00:00,  7.76s/it]


In [35]:
# Same pass over the validation subset, with its own CSV and checkpoint file.
process_dataset(validation_set, validation_csv_output_path, validation_checkpoint_file)

Starting from index 0


100%|██████████| 200/200 [25:48<00:00,  7.74s/it]


In [4]:
# put data to jsonl format
import jsonlines
import json

# convert csv to jsonl
def convert_to_jsonl(csv_file_path, jsonl_file_path):
    """Rewrite a CSV file with a header row as a JSON Lines file.

    Every CSV row becomes one JSON object keyed by the header names. All
    values are written as strings, exactly as csv.DictReader yields them.

    Args:
        csv_file_path: UTF-8 CSV file to read.
        jsonl_file_path: JSON Lines file to create or overwrite.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted fields
    # (e.g. multi-line explanations) are rewritten while reading.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        with jsonlines.open(jsonl_file_path, mode='w') as writer:
            for row in csv.DictReader(csv_file):
                writer.write(row)

# save the train and validation results as jsonl next to their csv files
train_jsonl_output_path = 'train.jsonl'
validation_jsonl_output_path = 'validation.jsonl'

for csv_path, jsonl_path in (
    (train_csv_output_path, train_jsonl_output_path),
    (validation_csv_output_path, validation_jsonl_output_path),
):
    convert_to_jsonl(csv_path, jsonl_path)

In [19]:
# save test set to jsonl
# It gets the same field names as the train and validation files, with
# 'predicted_answer', 'confidence_level' and 'explanation' left as None.
test_jsonl_output_path = 'test.jsonl'
numeric_labels = ['1', '2', '3', '4']
with jsonlines.open(test_jsonl_output_path, mode='w') as writer:
    for idx in range(len(test_set)):
        try:
            # A row has 'id', 'question', 'choices' and 'answerKey'; 'choices'
            # is {'text': [...], 'label': [...]} with labels either
            # 'A'-'D' or '1'-'4'.
            example = test_set[idx]
            question = example['question']
            choices = example['choices']
            answer_key = example['answerKey']

            # Numeric choice labels are turned into letters (1 -> A, 2 -> B, ...).
            labels = choices['label']
            if labels[0] in numeric_labels:
                labels = [chr(ord('A') + int(label) - 1) for label in labels]

            # The answer key gets the same treatment.
            if answer_key in numeric_labels:
                answer_key = chr(ord('A') + int(answer_key) - 1)

            option_lines = [f"{label}. {option}" for label, option in zip(labels, choices['text'])]

            record = {
                'id': str(idx),
                'question': question,
                'choices': "\n".join(option_lines),
                'answerKey': answer_key,
                'predicted_answer': None,
                'confidence_level': None,
                'explanation': None
            }
            writer.write(record)
        except Exception as e:
            print(f"An error occurred at index {idx}: {e}")
            break

In [20]:
# print the schema of the train, validation, and test jsonl files
import json

def print_jsonl_schema(jsonl_file_path):
    """Print the field names and value types of a JSON Lines file's first record.

    Only the first line is read. The result is printed as indented JSON that
    maps each field name to the name of its Python type after json decoding
    (for example "str" or "NoneType").

    Args:
        jsonl_file_path: UTF-8 JSON Lines file whose first line is a JSON object.
    """
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        first_record = json.loads(jsonl_file.readline())

    field_types = {}
    for field, value in first_record.items():
        field_types[field] = type(value).__name__
    print(json.dumps(field_types, indent=2))

# Show the three schemas one after another so that they can be compared.
for jsonl_path in (
    train_jsonl_output_path,
    validation_jsonl_output_path,
    test_jsonl_output_path,
):
    print_jsonl_schema(jsonl_path)

{
  "id": "str",
  "question": "str",
  "choices": "str",
  "answerKey": "str",
  "predicted_answer": "str",
  "confidence_level": "str",
  "explanation": "str"
}
{
  "id": "str",
  "question": "str",
  "choices": "str",
  "answerKey": "str",
  "predicted_answer": "str",
  "confidence_level": "str",
  "explanation": "str"
}
{
  "id": "str",
  "question": "str",
  "choices": "str",
  "answerKey": "str",
  "predicted_answer": "NoneType",
  "confidence_level": "NoneType",
  "explanation": "NoneType"
}


In [21]:
# try to load the uploaded dataset from huggingface BENBENBENb/ARC1000COT
from datasets import load_dataset
# The second positional argument of load_dataset is the configuration name,
# not the split, so passing 'train' there returned all three splits each time.
# Select the split explicitly so that each variable holds only its own split.
train_huggingface_dataset = load_dataset('BENBENBENb/ARC1000COT', split='train')
validation_huggingface_dataset = load_dataset('BENBENBENb/ARC1000COT', split='validation')
test_huggingface_dataset = load_dataset('BENBENBENb/ARC1000COT', split='test')

Downloading readme:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/527k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]