In [1]:
from datasets import load_dataset
from config import OPENAI_API_KEY
import os
import openai
from tqdm import tqdm
import csv
import re

In [2]:
# Load the train, test and validation splits of the 'mc160' configuration
# of sagnikrayc/mctest, in that order.
train_set, test_set, validation_set = (
    load_dataset('sagnikrayc/mctest', 'mc160', split=split_name)
    for split_name in ('train', 'test', 'validation')
)

In [3]:
# Take a look at the dataset summary and at the answer options of the first row.
print(
    train_set,
    '---answer options---',
    train_set[0]['answer_options'],
    sep='\n',
)

Dataset({
    features: ['idx', 'question', 'story', 'properties', 'answer_options', 'answer', 'question_is_multiple'],
    num_rows: 280
})
---answer options---
{'A': 'Door', 'B': 'House', 'C': 'Window', 'D': 'Toilet'}


In [4]:
# Global variables for prompt
# PREFIX is the instruction header that process_dataset puts in front of every
# story/question prompt. It asks the model to reply with an "Explanation:" line
# followed by an "Answer and Confidence (0-100):" line; the regular expressions
# in process_dataset look for exactly those labels, so keep the two in sync
# when editing this text.
PREFIX = '''
###### Instructions ######
Read the following story and the multiple-choice question, analyze step by step, select the correct option, and give the option letter (e.g., A or B) as your answer.
Use the following format to provide your answer and confidence level:
Explanation: [insert step-by-step analysis here]
Answer and Confidence (0-100): [Your answer, e.g., B], [Your confidence level, e.g., 80]%
Note: The confidence level indicates how certain you are about your answer, expressed as a percentage.
'''
# Hand the key imported from `config` to the openai module so that the
# ChatCompletion calls made later are authenticated.
openai.api_key = OPENAI_API_KEY

In [5]:
# This function reads the last processed index from a checkpoint file.
# If the checkpoint file exists and its first line contains a number, it returns that number as an integer.
# If the checkpoint file is empty, blank, or does not exist, it returns 0.
def get_last_processed_idx(checkpoint_file):
    """Return the index stored on the first line of ``checkpoint_file``.

    Args:
        checkpoint_file: path to the checkpoint file.

    Returns:
        The stored index as an int, or 0 when the file does not exist or its
        first line is empty or contains only whitespace.

    Raises:
        ValueError: if the first line holds text that is not an integer.
            This is deliberately not swallowed: silently restarting from 0
            would append duplicate rows to the results file.
    """
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, 'r') as file:
        # Strip BEFORE the emptiness test: a file holding just "\n" is a
        # non-empty string, and int('') would raise instead of yielding 0.
        last_idx = file.readline().strip()
    return int(last_idx) if last_idx else 0
    
# Persist progress: overwrite the checkpoint file with the given index so that
# an interrupted run can resume from that index instead of starting over.
def set_checkpoint_idx(checkpoint_file, idx):
    """Replace the contents of ``checkpoint_file`` with ``str(idx)``.

    Args:
        checkpoint_file: path of the checkpoint file (created if missing).
        idx: the index to store; written as its string form with no newline.
    """
    with open(checkpoint_file, mode='w') as checkpoint_handle:
        checkpoint_handle.write(str(idx))

In [6]:
# Splits a model reply into (predicted answer, confidence level, explanation).
def _parse_model_output(output):
    '''
    output: the text of the model reply

    Returns a tuple (predicted_answer, confidence_level, explanation).
    predicted_answer is a letter A-D and confidence_level an int when the
    "Answer and Confidence (0-100):" line is found; otherwise they are the
    strings "No answer found." and "No confidence level found.".
    explanation is "No explanation found." when no explanation text is found.
    '''
    # DOTALL + lazy match: keep the whole (possibly multi-line) explanation, up to
    # the answer line or the end of the reply. The explanation may also start on
    # the line after "Explanation:" and need not be followed by a newline.
    explanation_match = re.search(
        r'Explanation:\s*(.*?)\s*(?=Answer and Confidence|\Z)', output, re.DOTALL
    )
    explanation = explanation_match.group(1) if explanation_match else ""
    if not explanation:
        explanation = "No explanation found."

    # Tolerate replies that echo the template's square brackets, vary the
    # spacing, or drop the trailing percent sign.
    answer_confidence_match = re.search(
        r'Answer and Confidence\s*\(0-100\):\s*\[?\s*([A-D])\s*\]?\s*,\s*\[?\s*(\d+)\s*\]?\s*%?',
        output,
    )
    if answer_confidence_match:
        predicted_answer = answer_confidence_match.group(1)
        confidence_level = int(answer_confidence_match.group(2))
    else:
        predicted_answer = "No answer found."
        confidence_level = "No confidence level found."

    return predicted_answer, confidence_level, explanation


# This function processes a dataset of multiple-choice questions by iterating from the last processed index.
# It formats each question and its associated options, sends them to a GPT model for completion,
# then records the model's predicted answer, confidence level, and explanation in a CSV file.
# It updates the checkpoint after each entry is processed to ensure resumability of the task.
def process_dataset(dataset, csv_file_path, checkpoint_file):
    '''
    dataset: the dataset to process
    csv_file_path: the path to the CSV file to write the results to
    checkpoint_file: the path to the checkpoint file to save the last processed index to

    Rows are appended to the CSV file; the header row is written only when the
    file is empty. On any error the index and message are printed and the loop
    stops, leaving the checkpoint at the failed index so a later call retries it.
    '''
    start_idx = get_last_processed_idx(checkpoint_file)
    print(f"Starting from index {start_idx}")
    # The progress bar covers only the rows that are still to be processed.
    for idx in tqdm(range(start_idx, len(dataset))):
        try:
            question = dataset['question'][idx]
            story = dataset['story'][idx]
            options = dataset['answer_options'][idx]
            answer = dataset['answer'][idx]

            formatted_options = [f"{key}: {value}" for key, value in options.items()]
            question_input = f"###### Story ######\n{story}\n\n###### Question ######\n{question}\n" + "\n".join(formatted_options)
            prompt = PREFIX + f"{question_input}"

            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a chatbot trained to answer multiple-choice questions."},
                    {"role": "user", "content": prompt},
                ]
            )

            output = response['choices'][0]['message']['content'].strip()
            predicted_answer, confidence_level, explanation = _parse_model_output(output)

            with open(csv_file_path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                # A brand-new (empty) file gets the header row first.
                if os.path.getsize(csv_file_path) == 0:
                    writer.writerow(["IDX", "Question", "Story", "Options", "Predicted Answer", "Actual Answer", "Confidence Level", "Explanation"])
                writer.writerow([idx, question, story, "\n".join(formatted_options), predicted_answer, answer, confidence_level, explanation])

            # Advance the checkpoint only after the row has been written.
            set_checkpoint_idx(checkpoint_file, idx + 1)

        except Exception as e:
            print(f"An error occurred at index {idx}: {e}")
            # If you want to stop on error use 'break', to continue to the next record use 'continue'
            break

In [1]:
# Results CSV and resume checkpoint for each split, declared pairwise.
train_csv_output_path, train_checkpoint_file = 'train.csv', 'train_checkpoint.txt'
test_csv_output_path, test_checkpoint_file = 'test.csv', 'test_checkpoint.txt'
validation_csv_output_path, validation_checkpoint_file = 'validation.csv', 'validation_checkpoint.txt'

In [8]:
# Query the model for the train split, resuming from its checkpoint file.
process_dataset(
    dataset=train_set,
    csv_file_path=train_csv_output_path,
    checkpoint_file=train_checkpoint_file,
)

100%|██████████| 41/41 [04:59<00:00,  7.30s/it]


In [9]:
# Query the model for the test split, resuming from its checkpoint file.
process_dataset(
    dataset=test_set,
    csv_file_path=test_csv_output_path,
    checkpoint_file=test_checkpoint_file,
)

100%|██████████| 240/240 [29:19<00:00,  7.33s/it]


In [10]:
# Query the model for the validation split, resuming from its checkpoint file.
process_dataset(
    dataset=validation_set,
    csv_file_path=validation_csv_output_path,
    checkpoint_file=validation_checkpoint_file,
)

100%|██████████| 120/120 [15:06<00:00,  7.56s/it]


# Prepare for Hugging Face Dataset

In [6]:
# Now we convert each CSV file to JSONL (JSON Lines) so it can be hosted on platforms like Hugging Face.
import jsonlines
import json

def convert_to_jsonl(csv_file_path, jsonl_file_path):
    """Rewrite a CSV file with a header row as a JSON Lines file.

    Each CSV row becomes one JSON object keyed by the header names. Values are
    written as the strings that ``csv.DictReader`` yields; no type conversion
    is applied.

    Args:
        csv_file_path: path of the UTF-8 CSV file to read.
        jsonl_file_path: path of the JSONL file to write (opened in 'w' mode).
    """
    # newline='' lets the csv module do its own newline handling, which the csv
    # documentation requires for quoted fields that contain line breaks.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with jsonlines.open(jsonl_file_path, mode='w') as writer:
            for row in csv_reader:
                writer.write(row)

# JSONL destination for each split.
train_jsonl_output_path, test_jsonl_output_path, validation_jsonl_output_path = (
    'train.jsonl',
    'test.jsonl',
    'validation.jsonl',
)

# Convert the three result CSVs, in train / test / validation order.
convert_to_jsonl(csv_file_path=train_csv_output_path, jsonl_file_path=train_jsonl_output_path)
convert_to_jsonl(csv_file_path=test_csv_output_path, jsonl_file_path=test_jsonl_output_path)
convert_to_jsonl(csv_file_path=validation_csv_output_path, jsonl_file_path=validation_jsonl_output_path)

# Test importing from Hugging Face

In [7]:
# Try loading the uploaded dataset BENBENBENb/McTest640COT back from Hugging Face,
# one split at a time (train, test, validation).
mc160_train, mc160_test, mc160_validation = (
    load_dataset('BENBENBENb/McTest640COT', split=split_name)
    for split_name in ('train', 'test', 'validation')
)

Downloading readme:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/394k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]