In [1]:
import datasets
from datasets import load_dataset, concatenate_datasets, DatasetDict
import pandas as pd
import random
import json
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string

In [2]:
m1_data = load_dataset("json", data_files="../model/datasets/dpo_M1_preference_data_15052024.json")['train']

In [3]:
characteristic_mcqa_pattern = '\n\nOptions:\nA. '

In [4]:
m1_data = m1_data.filter(lambda sample: characteristic_mcqa_pattern in sample['question_complete'])

In [5]:
m1_data

Dataset({
    features: ['question_complete', 'question_id', 'preference', 'course_id'],
    num_rows: 792
})

In [6]:
question_options = ['A. FIFO (First In, First Out)', 'B. SJF (Shortest Job First)', 'C. STCF (Shortest Time to Completion First)', 'D. RR (Round Robin)']
answer = m1_data['preference'][0][1]['A']

vectorizer = TfidfVectorizer().fit_transform(question_options + [answer])
vectors = vectorizer.toarray()

cosine_sim = cosine_similarity(vectors)
print(f"Cosine similarity: {cosine_sim[-1, :-1].argmax()}")

Cosine similarity: 1


In [8]:
def extract_options(input_string):
    # Split the string to separate the options part
    _, options_part = input_string.split('Options:\n')

    # Use regex to find all options
    options = re.findall(r'([A-D]\. .+?)(?=\n[A-D]\.|$)', options_part, re.DOTALL)

    return options

def get_correct_index_from_sample(sample:dict)->dict:
    question_options = extract_options(sample['question_complete'])
    # print(f"{question_options=}")
    answers_accumulator = np.zeros_like(question_options, dtype=int)
    # print(f"{answers_accumulator=}")
    nb_errors = 0
    total_trials = 0
    
    for preference_pair in sample['preference']:
        correct_answer_keys = preference_pair['criteria']['correctness'] # either 'A', 'B', 'AB' or ''
        # print(f"{correct_answer_keys=}")
        total_trials += len(correct_answer_keys)
        for correct_answer_key in correct_answer_keys: 
            try:
                correct_answer = preference_pair[correct_answer_key]
                # print(f"{correct_answer=}")
                vectorizer = TfidfVectorizer().fit_transform(question_options + [correct_answer])
                vectors = vectorizer.toarray()
                cosine_sim = cosine_similarity(vectors)
                answers_accumulator[cosine_sim[-1, :-1].argmax()] += 1
                # print(f"{answers_accumulator=}")
            except:
                nb_errors += 1
                continue
    if nb_errors > 0:
        print(f"There was an error {nb_errors} / {total_trials} times i.e. {100*nb_errors/total_trials:.2f} %")
    sample['answer'] = string.ascii_uppercase[answers_accumulator.argmax()]
    sample['question'] = sample['question_complete']
    for useless_key in ['question_complete', 'question_id', 'preference', 'course_id']:
        sample.pop(useless_key)
    return sample

In [9]:
m1_data = m1_data.map(get_correct_index_from_sample)

In [10]:
np.unique(m1_data['answer'], return_counts=True)

(array(['A', 'B', 'C', 'D'], dtype='<U1'), array([302, 188, 144, 158]))

In [12]:
m1_data['answer']

['C',
 'A',
 'B',
 'B',
 'A',
 'D',
 'A',
 'B',
 'D',
 'B',
 'C',
 'B',
 'B',
 'D',
 'B',
 'C',
 'B',
 'B',
 'D',
 'D',
 'D',
 'A',
 'A',
 'B',
 'A',
 'B',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'D',
 'A',
 'D',
 'B',
 'C',
 'D',
 'A',
 'D',
 'D',
 'D',
 'A',
 'C',
 'A',
 'D',
 'D',
 'D',
 'A',
 'A',
 'D',
 'B',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'D',
 'B',
 'C',
 'C',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'A',
 'D',
 'A',
 'A',
 'B',
 'B',
 'A',
 'A',
 'D',
 'B',
 'A',
 'A',
 'A',
 'B',
 'D',
 'B',
 'D',
 'A',
 'A',
 'B',
 'A',
 'A',
 'B',
 'A',
 'D',
 'C',
 'B',
 'B',
 'A',
 'B',
 'C',
 'B',
 'A',
 'B',
 'D',
 'D',
 'C',
 'A',
 'D',
 'A',
 'D',
 'C',
 'A',
 'B',
 'B',
 'B',
 'D',
 'C',
 'B',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'C',
 'A',
 'B',
 'C',
 'C',
 'B',
 'B',
 'A',
 'A',
 'D',
 'A',
 'B',
 'B',
 'D',
 'A',
 'B',
 'B',
 'D',
 'B',
 'A',
 'B',
 'B',
 'C',
 'C',
 'D',
 'A',
 'B',
 'A',
 'A',
 'B',
 'B',
 'A',
 'B',
 'B',
 'C',
 'A',
 'A',
 'B',
 'A',
 'A'

In [13]:
m1_data = m1_data.class_encode_column("answer")
m1_data['answer']

[2,
 0,
 1,
 1,
 0,
 3,
 0,
 1,
 3,
 1,
 2,
 1,
 1,
 3,
 1,
 2,
 1,
 1,
 3,
 3,
 3,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 1,
 2,
 3,
 0,
 3,
 3,
 3,
 0,
 2,
 0,
 3,
 3,
 3,
 0,
 0,
 3,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 3,
 1,
 2,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 3,
 0,
 0,
 1,
 1,
 0,
 0,
 3,
 1,
 0,
 0,
 0,
 1,
 3,
 1,
 3,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 3,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 1,
 3,
 3,
 2,
 0,
 3,
 0,
 3,
 2,
 0,
 1,
 1,
 1,
 3,
 2,
 1,
 0,
 3,
 0,
 3,
 0,
 0,
 2,
 0,
 1,
 2,
 2,
 1,
 1,
 0,
 0,
 3,
 0,
 1,
 1,
 3,
 0,
 1,
 1,
 3,
 1,
 0,
 1,
 1,
 2,
 2,
 3,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 2,
 0,
 0,
 1,
 3,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 2,
 0,
 1,
 1,
 3,
 1,
 3,
 1,
 1,
 0,
 3,
 2,
 0,
 3,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 3,
 0,
 3,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 3,
 0,
 2,
 0,
 1,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 1,
 3,
 3,
 0,
 2,
 3,
 1,
 2,
 0,


In [11]:
# m1_data = m1_data.class_encode_column("answer")
# 80% train, 10% test + 10% validation
train_testvalid = m1_data.train_test_split(test_size=0.2, stratify_by_column='answer')
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, stratify_by_column='answer')
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'evaluation': test_valid['train']})

ValueError: Stratifying by column is only supported for ClassLabel column, and column answer is Value.

In [47]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question'],
        num_rows: 633
    })
    test: Dataset({
        features: ['answer', 'question'],
        num_rows: 80
    })
    evaluation: Dataset({
        features: ['answer', 'question'],
        num_rows: 79
    })
})

In [45]:
633 + 80 + 79

792

In [49]:
for split in train_test_valid_dataset.keys():
    train_test_valid_dataset[split].to_json(os.path.join("..","model", "datasets", "mcqa", split, f"mcqa_M1_{split}.jsonl"))

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]