In [1]:
from datasets import load_dataset, concatenate_datasets
from config import OPENAI_API_KEY
import os
import openai
from tqdm import tqdm
import csv
import re

In [2]:
# Load each CommonsenseQA split, shuffle it with a fixed seed so the sample is
# reproducible, and keep a fixed-size subset:
# 600 training, 200 validation and 200 test examples.
train = load_dataset("commonsense_qa", split="train").shuffle(seed=42).select(range(600))
val = load_dataset("commonsense_qa", split="validation").shuffle(seed=42).select(range(200))
test = load_dataset("commonsense_qa", split="test").shuffle(seed=42).select(range(200))

In [3]:
# Take a look at the sampled training set and at the answer choices of its first example
print(train, train[0]['choices'], sep="\n")

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
    num_rows: 600
})
{'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['table', 'post office', "neighbor's house", 'railway station', 'fridge']}


In [4]:
# Global variables for prompt
# PREFIX is the instruction block that process_dataset prepends to every formatted
# question. It asks the model to reply with an "Explanation: ..." section followed by
# an "Answer and Confidence (0-100): <letter>, <number>%" line, which is the layout the
# regular expressions in process_dataset search for.
# The string intentionally starts and ends with a newline.
PREFIX = '''
###### Instructions ######
Read the following multiple-choice question, analyze step by step, select the correct option, and give the option letter (e.g., A or B) as your answer.
Use the following format to provide your answer and confidence level:
Explanation: [insert step-by-step analysis here]
Answer and Confidence (0-100): [Your answer, e.g., B], [Your confidence level, e.g., 80]%
Note: The confidence level indicates how certain you are about your answer, expressed as a percentage.
'''
# Configure the openai client with the key imported from the `config` module.
openai.api_key = OPENAI_API_KEY

In [5]:
def get_last_processed_idx(checkpoint_file):
    """Return the index stored in a checkpoint file, or 0 when there is none.

    Only the first line of the file is read. Surrounding whitespace is ignored,
    so a file that is empty or holds nothing but a newline or spaces counts as
    "no checkpoint" and yields 0.

    Args:
        checkpoint_file: path of the checkpoint file.

    Returns:
        The saved index as an int, or 0 if the file does not exist or its
        first line is blank.

    Raises:
        ValueError: if the first line is non-blank but not an integer. This is
            deliberately not masked: silently restarting from 0 would re-process
            rows that were already written.
    """
    try:
        with open(checkpoint_file, 'r') as file:
            # Strip before testing for emptiness: a bare "\n" is truthy but
            # would make int() fail.
            saved_idx = file.readline().strip()
    except FileNotFoundError:
        # No checkpoint yet: start from the beginning.
        return 0
    return int(saved_idx) if saved_idx else 0
    
def set_checkpoint_idx(checkpoint_file, idx):
    """Overwrite the checkpoint file with the given index.

    Saving the current progress lets an interrupted run resume from the last
    saved index instead of starting over.

    Args:
        checkpoint_file: path of the checkpoint file (created or truncated).
        idx: the index to store; it is written as ``str(idx)``.
    """
    with open(checkpoint_file, 'w') as checkpoint:
        checkpoint.write(str(idx))

In [6]:
def _format_prompt(question, choices):
    """Build the prompt: the instructions, the question, and one 'LABEL. text' line per option."""
    formatted_options = [f"{label}. {text}" for label, text in zip(choices['label'], choices['text'])]
    question_input = f"###### Question ######\n{question}\n" + "\n".join(formatted_options)
    return PREFIX + question_input


def _parse_model_output(output, labels):
    """Extract (predicted_answer, confidence_level, explanation) from the model's reply.

    Each missing piece is replaced by its "No ... found." placeholder string.
    """
    # The explanation is a step-by-step analysis and may span several lines, so
    # capture everything up to the answer line (or the end of the reply).
    explanation_match = re.search(
        r'Explanation:\s*(.*?)(?=\n\s*Answer and Confidence|\Z)', output, re.DOTALL
    )
    explanation = explanation_match.group(1).strip() if explanation_match else "No explanation found."

    # Accept every label offered for this question (A-E for CommonsenseQA) and
    # tolerate the square brackets shown in the prompt's answer template.
    label_pattern = "|".join(re.escape(label) for label in labels)
    answer_match = re.search(
        rf'Answer and Confidence \(0-100\):\s*\[?({label_pattern})\]?\s*,\s*\[?(\d+)\]?\s*%',
        output,
    )
    if answer_match:
        predicted_answer = answer_match.group(1)
        confidence_level = int(answer_match.group(2))
    else:
        predicted_answer = "No answer found."
        confidence_level = "No confidence level found."
    return predicted_answer, confidence_level, explanation


def process_dataset(dataset, csv_file_path, checkpoint_file):
    '''
    Query the GPT model for every question in the dataset and append the results to a CSV file.

    Processing starts at the index stored in the checkpoint file. For each row the question
    and its options are formatted into a prompt and sent to the model; the predicted answer,
    confidence level and explanation are then parsed from the reply and written as one CSV row.
    The checkpoint is updated after every row so an interrupted run can be resumed.
    On the first error the message is printed and processing stops, leaving the checkpoint
    pointing at the row that failed.

    dataset: the dataset to process; rows have the fields
        'id', 'question', 'question_concept', 'choices', 'answerKey', where 'choices' looks like
        {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['table', 'post office', ...]}
    csv_file_path: the path to the CSV file to write the results to (a header row is written
        when the file is empty)
    checkpoint_file: the path to the checkpoint file to save the last processed index to
    '''
    start_idx = get_last_processed_idx(checkpoint_file)
    print(f"Starting from index {start_idx}")
    for idx in tqdm(range(start_idx, len(dataset))):
        try:
            # Fetch the row once instead of reading four whole columns per iteration.
            row = dataset[idx]
            question = row['question']
            answerKey = row['answerKey']
            question_concept = row['question_concept']
            choices = row['choices']

            prompt = _format_prompt(question, choices)

            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a chatbot trained to answer multiple-choice questions."},
                    {"role": "user", "content": prompt},
                ]
            )

            output = response['choices'][0]['message']['content'].strip()
            predicted_answer, confidence_level, explanation = _parse_model_output(output, choices['label'])

            with open(csv_file_path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                # Nothing has been written through this handle yet, so a size of 0
                # means the file is new and still needs its header row.
                if os.path.getsize(csv_file_path) == 0:
                    writer.writerow(["id", "question", "question_concept", "choices", "predicted_answer", "answerKey", "confidence_level", "explanation"])
                # 'id' is the row's position in the dataset; 'choices' is stored as the
                # string form of the choices dict.
                writer.writerow([idx, question, question_concept, choices, predicted_answer, answerKey, confidence_level, explanation])

            # Only advance the checkpoint once the row is safely on disk.
            set_checkpoint_idx(checkpoint_file, idx + 1)

        except Exception as e:
            print(f"An error occurred at index {idx}: {e}")
            break

In [8]:
# Output locations: one results CSV and one resume checkpoint per split
train_csv_output_path, train_checkpoint_file = 'train.csv', 'train_checkpoint.txt'
validation_csv_output_path, validation_checkpoint_file = 'validation.csv', 'validation_checkpoint.txt'

In [12]:
# Query the model for the training sample, resuming from the saved checkpoint
process_dataset(
    dataset=train,
    csv_file_path=train_csv_output_path,
    checkpoint_file=train_checkpoint_file,
)

Starting from index 593


100%|██████████| 7/7 [00:47<00:00,  6.78s/it]


In [10]:
# Query the model for the validation sample, resuming from the saved checkpoint
process_dataset(
    dataset=val,
    csv_file_path=validation_csv_output_path,
    checkpoint_file=validation_checkpoint_file,
)

Starting from index 16


100%|██████████| 184/184 [28:52<00:00,  9.42s/it]


In [11]:
# Now we convert the CSV files to JSONL (JSON Lines) so they can be hosted on platforms like Hugging Face.
import jsonlines
import json

def convert_to_jsonl(csv_file_path, jsonl_file_path):
    """Convert a CSV file with a header row into a JSON Lines file.

    Each CSV row becomes one JSON object keyed by the header names.
    csv.DictReader yields every value as a string, so numeric columns are
    written as JSON strings too.

    Args:
        csv_file_path: path of the CSV file to read (UTF-8).
        jsonl_file_path: path of the JSONL file to write (overwritten).
    """
    # newline='' lets the csv module do its own line-ending handling, which is
    # what keeps quoted fields containing line breaks intact.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with jsonlines.open(jsonl_file_path, mode='w') as writer:
            for row in csv_reader:
                writer.write(row)

# Write the train and validation results out as JSONL files
train_jsonl_output_path, validation_jsonl_output_path = 'train.jsonl', 'validation.jsonl'

convert_to_jsonl(csv_file_path=train_csv_output_path, jsonl_file_path=train_jsonl_output_path)
convert_to_jsonl(csv_file_path=validation_csv_output_path, jsonl_file_path=validation_jsonl_output_path)

In [13]:
# Save the test set to JSONL.
# Every record has the same keys as the train and validation files; the model-derived
# fields 'predicted_answer', 'confidence_level' and 'explanation' are left as None.
# NOTE(review): 'choices' is written here as "LABEL. text" lines, whereas process_dataset
# stores the string form of the choices dict in the CSVs - confirm this difference is intended.
test_jsonl_output_path = 'test.jsonl'
with jsonlines.open(test_jsonl_output_path, mode='w') as writer:
    for idx in range(len(test)):
        try:
            # rows are in form of ['id', 'question', 'question_concept', 'choices', 'answerKey']
            # Fetch the row once instead of reading four whole columns per iteration.
            row = test[idx]
            question = row['question']
            answerKey = row['answerKey']
            choices = row['choices']
            concept = row['question_concept']

            formatted_options = [f"{choices['label'][i]}. {choices['text'][i]}" for i in range(len(choices['label']))]

            writer.write({
                # As in the CSV files, the id is the row's position in the sampled split.
                'id': str(idx),
                "question": question,
                "question_concept": concept,
                "choices": "\n".join(formatted_options),
                "predicted_answer": None,
                "answerKey": answerKey,
                "confidence_level": None,
                "explanation": None
            })

        except Exception as e:
            # Best effort: report the failing row and carry on with the next one.
            print(f"An error occurred at index {idx}: {e}")

In [15]:
# Print the schema of the train, validation, and test JSONL files
import json

def print_jsonl_schema(jsonl_file_path):
    """Print the schema of a JSONL file, inferred from its first record.

    The first line is parsed as a JSON object and every key is mapped to the
    name of the Python type of its value (e.g. "str", "int", "NoneType").
    The mapping is printed as JSON indented by two spaces.

    Args:
        jsonl_file_path: path of the JSONL file to inspect (UTF-8).
    """
    with open(jsonl_file_path, 'r', encoding='utf-8') as handle:
        record = json.loads(handle.readline())

        field_types = {}
        for field, value in record.items():
            field_types[field] = type(value).__name__
        print(json.dumps(field_types, indent=2))

# Compare the schema of the train file with that of the test file
print_jsonl_schema(jsonl_file_path=train_jsonl_output_path)
# print_jsonl_schema(validation_jsonl_output_path)
print_jsonl_schema(jsonl_file_path=test_jsonl_output_path)

{
  "id": "str",
  "question": "str",
  "question_concept": "str",
  "choices": "str",
  "predicted_answer": "str",
  "answerKey": "str",
  "confidence_level": "str",
  "explanation": "str"
}
{
  "id": "str",
  "question": "str",
  "question_concept": "str",
  "choices": "str",
  "predicted_answer": "NoneType",
  "answerKey": "str",
  "confidence_level": "NoneType",
  "explanation": "NoneType"
}


In [16]:
# Try to load the uploaded dataset from the Hugging Face Hub: BENBENBENb/CommonsenseQA1000COT
from datasets import load_dataset
# Select each split with the `split` keyword. The second positional argument of
# load_dataset is the configuration name, not the split, so passing 'train' there
# returned all three splits for every call.
train_huggingface_dataset = load_dataset('BENBENBENb/CommonsenseQA1000COT', split='train')
validation_huggingface_dataset = load_dataset('BENBENBENb/CommonsenseQA1000COT', split='validation')
test_huggingface_dataset = load_dataset('BENBENBENb/CommonsenseQA1000COT', split='test')

Downloading readme:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/514k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]