In [1]:
from datasets import load_dataset
from config import OPENAI_API_KEY
import os
import openai
from tqdm import tqdm
import csv
import re

In [2]:
# load race middle
train_set = load_dataset('race', 'middle', split='train')
test_set = load_dataset('race', 'middle', split='validation')
validation_set = load_dataset('race', 'middle', split='test')

In [3]:
# filter out rows with article length > 650
max_len = 650
train_set = train_set.filter(lambda example: len(example['article']) < max_len)
test_set = test_set.filter(lambda example: len(example['article']) < max_len)
validation_set = validation_set.filter(lambda example: len(example['article']) < max_len)
print(len(train_set), len(test_set), len(validation_set))

4668 307 245


In [4]:
# only sample 600 examples from train set, and proportionally sample 200 examples from validation set and test set
random_seed = 42
train_set = train_set.shuffle(seed=random_seed).select(range(600))
test_set = test_set.shuffle(seed=random_seed).select(range(200))
validation_set = validation_set.shuffle(seed=random_seed).select(range(200))

print(len(train_set), len(test_set), len(validation_set))

600 200 200


In [5]:
#take a look at the a sample of the dataset
print(train_set)
print(train_set[0]['options'])

Dataset({
    features: ['example_id', 'article', 'answer', 'question', 'options'],
    num_rows: 600
})
['In a factory.', 'In a school.', 'In a hospital.', 'In a park.']


In [6]:
# Global variables for prompt
PREFIX = '''
###### Instructions ######
Read the following article and the multiple-choice question, analyze step by step, select the correct option, and give the option letter (e.g., A or B) as your answer.
Use the following format to provide your answer and confidence level:
Explanation: [insert step-by-step analysis here]
Answer and Confidence (0-100): [Your answer, e.g., B], [Your confidence level, e.g., 80]%
Note: The confidence level indicates how certain you are about your answer, expressed as a percentage.
'''
openai.api_key = OPENAI_API_KEY

In [7]:
def get_last_processed_idx(checkpoint_file):
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as file:
            last_idx = file.readline()
            return int(last_idx.strip()) if last_idx else 0
    else:
        return 0
    
def set_checkpoint_idx(checkpoint_file, idx):
    with open(checkpoint_file, 'w') as file:
        file.write(str(idx))

In [8]:
def process_dataset(dataset, csv_file_path, checkpoint_file):
    start_idx = get_last_processed_idx(checkpoint_file)
    print(f"Starting from index {start_idx}")
    for idx in tqdm(range(start_idx, len(dataset))):
        try:
            # rows are in form of 'example_id', 'article', 'answer', 'question', 'options'
            # options are in form of ['America', 'England', 'Canana', "We don't know."]
            question = dataset['question'][idx]
            article = dataset['article'][idx]
            answer = dataset['answer'][idx]
            options = dataset['options'][idx]

            formatted_options = [f"{chr(ord('A') + i)}. {option}" for i, option in enumerate(options)]
            question_input = f"###### article ######\n{article}\n\n###### Question ######\n{question}\n" + "\n".join(formatted_options)
            prompt = PREFIX + f"{question_input}"

            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a chatbot trained to answer multiple-choice questions."},
                    {"role": "user", "content": prompt},
                ]
            )

            output = response['choices'][0]['message']['content'].strip()

            explanation_match = re.search(r'Explanation: (.*)\n', output)
            explanation = explanation_match.group(1) if explanation_match else "No explanation found."

            answer_confidence_match = re.search(r'Answer and Confidence \((0-100)\): ([A-D]), (\d+)%', output)
            predicted_answer = answer_confidence_match.group(2).strip() if answer_confidence_match else "No answer found."
            confidence_level = int(answer_confidence_match.group(3)) if answer_confidence_match else "No confidence level found."

            with open(csv_file_path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                if os.path.getsize(csv_file_path) == 0:
                    writer.writerow(['example_id', 'question', 'article', 'options', 'predicted_answer', 'answer', 'confidence_level', 'explanation'])
                writer.writerow([idx, question, article, "\n".join(formatted_options), predicted_answer, answer, confidence_level, explanation])

            set_checkpoint_idx(checkpoint_file, idx + 1)

        except Exception as e:
            print(f"An error occurred at index {idx}: {e}")
            break

In [9]:
train_csv_output_path = 'train.csv'
validation_csv_output_path = 'validation.csv'
train_checkpoint_file = 'train_checkpoint.txt'
validation_checkpoint_file = 'validation_checkpoint.txt'

In [19]:
process_dataset(train_set, train_csv_output_path, train_checkpoint_file)

Starting from index 392


100%|██████████| 208/208 [16:36<00:00,  4.79s/it]


In [11]:
process_dataset(validation_set, validation_csv_output_path, validation_checkpoint_file)

Starting from index 0


  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [17:08<00:00,  5.14s/it]


In [13]:
# put data to jsonl format
import jsonlines
import json

# convert csv to jsonl
def convert_to_jsonl(csv_file_path, jsonl_file_path):
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with jsonlines.open(jsonl_file_path, mode='w') as writer:
            for row in csv_reader:
                writer.write(row)

# save train and validation set to jsonl
train_jsonl_output_path = 'train.jsonl'
validation_jsonl_output_path = 'validation.jsonl'

convert_to_jsonl(train_csv_output_path, train_jsonl_output_path)
convert_to_jsonl(validation_csv_output_path, validation_jsonl_output_path)

In [16]:
# save test set to jsonl
# ensure it has same schema as train and validation set, with 'predicted_answer', 'answer', 'confidence_level', 'explanation' left to be None
test_jsonl_output_path = 'test.jsonl'
with jsonlines.open(test_jsonl_output_path, mode='w') as writer:
    for idx in range(len(test_set)):
        try:
            # rows are in form of 'example_id', 'article', 'answer', 'question', 'options'
            # options are in form of ['America', 'England', 'Canana', "We don't know."]
            question = test_set['question'][idx]
            article = test_set['article'][idx]
            answer = test_set['answer'][idx]
            options = test_set['options'][idx]

            formatted_options = [f"{chr(ord('A') + i)}. {option}" for i, option in enumerate(options)]
            question_input = f"###### article ######\n{article}\n\n###### Question ######\n{question}\n" + "\n".join(formatted_options)
            prompt = PREFIX + f"{question_input}"

            writer.write({
                'example_id': idx,
                'question': question,
                'article': article,
                'options': "\n".join(formatted_options),
                'predicted_answer': None,
                'answer': answer,
                'confidence_level': None,
                'explanation': None
            })

        except Exception as e:
            print(f"An error occurred at index {idx}: {e}")


In [17]:
# try to load the uploaded dataset from huggingface (BENBENBENb/RACE1000COT)
from datasets import load_dataset
train = load_dataset('BENBENBENb/RACE1000COT', 'train')
validation = load_dataset('BENBENBENb/RACE1000COT', 'validation')
test = load_dataset('BENBENBENb/RACE1000COT', 'test')

Downloading readme:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/619k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/209k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]