In [38]:
#Code origin
#Author: Alexander Valentini
#I remove entries with more than 1024 tokens and add the chat template to the mcqa dataset

from datasets import load_dataset,DatasetDict,Dataset
from utils import read_jsonl
from evaluator import DPOModelEvaluator
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from typing import Any, List, Literal, Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling
import numpy as np
import os


#project_dir = os.path.dirname(os.path.abspath(os.getcwd()))
train_dataset_path="datasets/mcqa/mcqa_train_dataset.jsonl"
test_dataset_path="datasets/mcqa/mcqa_test_dataset_halved.jsonl"
validation_dataset_path="datasets/mcqa/mcqa_validation_dataset_halved.jsonl"

mcqa_dataset=load_dataset("json", data_files={"train":train_dataset_path, "validation": validation_dataset_path, "test":test_dataset_path})


In [30]:
def preprocess_function(examples):
    # print(examples)
    question=examples['question']
    answer=examples['answer']
    system=examples['subject']
    final={"message":[
        {"role": "system", "content": f"Write the letter corresponding to the correct answer"},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]}
    return final


new_dataset=DatasetDict()
for set in mcqa_dataset:
    data=mcqa_dataset[set]
    new_dataset[set]=data.map(preprocess_function)

Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/183 [00:00<?, ? examples/s]

In [31]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['subject', 'question', 'answer', 'message'],
        num_rows: 1458
    })
    validation: Dataset({
        features: ['subject', 'question', 'answer', 'message'],
        num_rows: 182
    })
    test: Dataset({
        features: ['subject', 'question', 'answer', 'message'],
        num_rows: 183
    })
})

In [32]:
train_dataset_mcqa = new_dataset["train"]
validation_dataset_mcqa = new_dataset["validation"]
test_dataset_mcqa = new_dataset["test"]

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name='AlexVal/dpo_model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="sdpa"
).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
train_dataset_mcqa = train_dataset_mcqa.remove_columns(['subject', 'question', 'answer'])
validation_dataset_mcqa = validation_dataset_mcqa.remove_columns(['subject', 'question', 'answer'])

In [34]:
#train_data
def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation"],
):
    if task in ["sft", "generation"]:
        messages = example["message"]
        example["text"] = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            #Alexander: Removing \n from the samples. Keeping the end of text token. 
            add_generation_prompt=True if task == "generation" else False,
        )[:-2]
    return example

train_data_mcqa_withtext_field = train_dataset_mcqa.map(
    apply_chat_template,
    fn_kwargs={
        "tokenizer": tokenizer,
        "task": "sft",
    },
    desc="Applying chat template",
)

validation_data_mcqa_withtext_field = validation_dataset_mcqa.map(
    apply_chat_template,
    fn_kwargs={
        "tokenizer": tokenizer,
        "task": "sft",
    },
    desc="Applying chat template",
)


Applying chat template:   0%|          | 0/1458 [00:00<?, ? examples/s]

Applying chat template:   0%|          | 0/182 [00:00<?, ? examples/s]

In [35]:
train_data_mcqa_withtext_field['text'][0]

'<|system|>\nWrite the letter corresponding to the correct answer<|endoftext|>\n<|user|>\nQuestion: Wendy bought 30 packs of gum. Each pack had 5 pieces. She multiplied 30 × 5 to find the number of pieces of gum she bought. How many pieces of gum did Wendy buy?\n\nOptions:\nA. 15\nB. 35\nC. 150\nD. 305\n\nAnswer:<|endoftext|>\n<|assistant|>\nC<|endoftext|'

In [36]:
validation_data_mcqa_withtext_field['message']

[[{'content': 'Write the letter corresponding to the correct answer',
   'role': 'system'},
  {'content': "Question: Which of the following is a correct statement?\n\nOptions:\nA. The probability of a Type II error does not depend on the probability of a Type I error.\nB. In conducting a hypothesis test, it is possible to simultaneously make both a Type I and a Type II error.\nC. A Type II error will result if one incorrectly assumes the data are normally distributed.\nD. When you choose a significance level α, you're setting the probability of a Type I error to exactly α.\n\nAnswer:",
   'role': 'user'},
  {'content': 'D', 'role': 'assistant'}],
 [{'content': 'Write the letter corresponding to the correct answer',
   'role': 'system'},
  {'content': 'Question: How many three-digit positive integers are there?\n\nOptions:\nA. 899\nB. 900\nC. 1000\nD. 999\n\nAnswer:',
   'role': 'user'},
  {'content': 'B', 'role': 'assistant'}],
 [{'content': 'Write the letter corresponding to the corre

In [37]:
train_data_mcqa_withtext_field.to_json('mcqa_train_dataset_chattemplate_mcqa.jsonl', orient="records")
validation_data_mcqa_withtext_field.to_json('mcqa_validation_dataset_chattemplate_mcqa.jsonl', orient="records")


Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

169012

In [41]:
def get_tokenized_length(example):
    lengths = len(tokenizer(example['text'])['input_ids'])
    #print(lengths)
    return {'length': lengths}


train_data_mcqa_withtext_field = train_data_mcqa_withtext_field.map(get_tokenized_length, batched=False)
validation_data_mcqa_withtext_field = validation_data_mcqa_withtext_field.map(get_tokenized_length, batched=False)


Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

In [42]:
print(np.max(train_data_mcqa_withtext_field['length']))
print(np.max(validation_data_mcqa_withtext_field['length']))

503
314


In [43]:
def count_values_greater_than_2048(values: List[int]) -> int:
    return sum(1 for value in values if value > 1024)
count_values_greater_than_2048(train_data_mcqa_withtext_field['length'])

0

In [9]:
mcqa_dataset['train'][0]

{'subject': 'elementary_mathematics',
 'question': 'Question: Wendy bought 30 packs of gum. Each pack had 5 pieces. She multiplied 30 × 5 to find the number of pieces of gum she bought. How many pieces of gum did Wendy buy?\n\nOptions:\nA. 15\nB. 35\nC. 150\nD. 305\n\nAnswer:',
 'answer': 'C',
 'message': [{'content': 'You are a helpful elementary_mathematics assistant.',
   'role': 'system'},
  {'content': 'Question: Wendy bought 30 packs of gum. Each pack had 5 pieces. She multiplied 30 × 5 to find the number of pieces of gum she bought. How many pieces of gum did Wendy buy?\n\nOptions:\nA. 15\nB. 35\nC. 150\nD. 305\n\nAnswer:',
   'role': 'user'},
  {'content': 'C', 'role': 'assistant'}]}

In [7]:
mcqa_dataset['train']['message']

[[{'content': 'You are a helpful elementary_mathematics assistant.',
   'role': 'system'},
  {'content': 'Question: Wendy bought 30 packs of gum. Each pack had 5 pieces. She multiplied 30 × 5 to find the number of pieces of gum she bought. How many pieces of gum did Wendy buy?\n\nOptions:\nA. 15\nB. 35\nC. 150\nD. 305\n\nAnswer:',
   'role': 'user'},
  {'content': 'C', 'role': 'assistant'}],
 [{'content': 'You are a helpful formal_logic assistant.', 'role': 'system'},
  {'content': 'Question:  Select the best translation into predicate logic.Some robbers steal money from a bank. (Bx: x is a bank; Mx: x is money; Rx: x is a robber; Sxyz: x steals y from z)\n\nOptions:\nA. (∃x){Bx • (∃y)[Ry • (∀z)(Mz ⊃ Syzx)]}\nB. (∃x)(∃y)(∃z)[(Rx • By) • (Mz • Syxz)]\nC. (∃x){Rx ⊃ (∃y)[My ⊃ (∃z)(Bz ⊃ Sxyz)]}\nD. (∃x){Rx • (∃y)[My • (∃z)(Bz • Sxyz)]}\n\nAnswer:',
   'role': 'user'},
  {'content': 'D', 'role': 'assistant'}],
 [{'content': 'You are a helpful college_computer_science assistant.',
   'role':

**Splitting test into val and test**

In [4]:
print("datasets/mcqa/mcqa_test_dataset.jsonl")

datasets/mcqa/mcqa_test_dataset.jsonl


In [6]:
data = read_jsonl("datasets/mcqa/mcqa_test_dataset.jsonl")

In [7]:
data

[{'subject': 'elementary_mathematics',
  'question': 'Question: What value of y makes y + 2.9 = 11 true?\n\nOptions:\nA. 8.1\nB. 8.9\nC. 9.1\nD. 13.9\n\nAnswer:',
  'answer': 'A'},
 {'subject': 'machine_learning',
  'question': 'Question: Statement 1| Since the VC dimension for an SVM with a Radial Base Kernel is infinite, such an SVM must be worse than an SVM with polynomial kernel which has a finite VC dimension. Statement 2| A two layer neural network with linear activation functions is essentially a weighted combination of linear separators, trained on a given dataset; the boosting algorithm built on linear separators also finds a combination of linear separators, therefore these two algorithms will give the same result.\n\nOptions:\nA. True, True\nB. False, False\nC. True, False\nD. False, True\n\nAnswer:',
  'answer': 'B'},
 {'subject': 'college_mathematics',
  'question': 'Question: How many real roots does the polynomial 2x^5 + 8x - 7 have?\n\nOptions:\nA. None\nB. One\nC. Two\

In [8]:
task_type = "causal_lm"
test_dataloader = DataLoader(data, batch_size=3)
evaluator = DPOModelEvaluator(
    task_type=task_type,
    policy_model_path="AlexVal/dpo_model",
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
#Alexander: Each batch is a dict of lists instead of list of dicts.
for batch in test_dataloader:
    print("BATCH")
    print(batch)
    print(len(batch))    
#    for example in batch:
#        print(example)

BATCH
{'subject': ['machine_learning', 'machine_learning', 'machine_learning'], 'question': ['Question: Statement 1| Linear regression estimator has the smallest variance among all unbiased estimators. Statement 2| The coefficients α assigned to the classifiers assembled by AdaBoost are always non-negative.\n\nOptions:\nA. True, True\nB. False, False\nC. True, False\nD. False, True\n\nAnswer:', 'Question: Statement 1| RoBERTa pretrains on a corpus that is approximate 10x larger than the corpus BERT pretrained on. Statement 2| ResNeXts in 2018 usually used tanh activation functions.\n\nOptions:\nA. True, True\nB. False, False\nC. True, False\nD. False, True\n\nAnswer:', 'Question: Statement 1| Support vector machines, like logistic regression models, give a probability distribution over the possible labels given an input example. Statement 2| We would expect the support vectors to remain the same in general as we move from a linear kernel to higher order polynomial kernels.\n\nOptions:\

In [8]:
#Read in test dataset as list of dicts and splitting into test and val data
test_dataset = read_jsonl("datasets/mcqa/mcqa_test_dataset.jsonl")
test_dataset

[{'subject': 'elementary_mathematics',
  'question': 'Question: What value of y makes y + 2.9 = 11 true?\n\nOptions:\nA. 8.1\nB. 8.9\nC. 9.1\nD. 13.9\n\nAnswer:',
  'answer': 'A'},
 {'subject': 'machine_learning',
  'question': 'Question: Statement 1| Since the VC dimension for an SVM with a Radial Base Kernel is infinite, such an SVM must be worse than an SVM with polynomial kernel which has a finite VC dimension. Statement 2| A two layer neural network with linear activation functions is essentially a weighted combination of linear separators, trained on a given dataset; the boosting algorithm built on linear separators also finds a combination of linear separators, therefore these two algorithms will give the same result.\n\nOptions:\nA. True, True\nB. False, False\nC. True, False\nD. False, True\n\nAnswer:',
  'answer': 'B'},
 {'subject': 'college_mathematics',
  'question': 'Question: How many real roots does the polynomial 2x^5 + 8x - 7 have?\n\nOptions:\nA. None\nB. One\nC. Two\

In [9]:
eval_data, test_data = train_test_split(test_dataset, test_size=0.5, random_state=42)

In [10]:
eval_data

[{'subject': 'high_school_statistics',
  'question': "Question: Which of the following is a correct statement?\n\nOptions:\nA. The probability of a Type II error does not depend on the probability of a Type I error.\nB. In conducting a hypothesis test, it is possible to simultaneously make both a Type I and a Type II error.\nC. A Type II error will result if one incorrectly assumes the data are normally distributed.\nD. When you choose a significance level α, you're setting the probability of a Type I error to exactly α.\n\nAnswer:",
  'answer': 'D'},
 {'subject': 'high_school_mathematics',
  'question': 'Question: How many three-digit positive integers are there?\n\nOptions:\nA. 899\nB. 900\nC. 1000\nD. 999\n\nAnswer:',
  'answer': 'B'},
 {'subject': 'computer_security',
  'question': 'Question: What are the types of scanning?\n\nOptions:\nA. Port, network, and services\nB. Network, vulnerability, and port \nC. Passive, active, and interactive\nD. Server, client, and network\n\nAnswer

In [11]:
#Converting back into huggingface format
eval_dataset = Dataset.from_list(eval_data)
test_dataset = Dataset.from_list(test_data)

test_dataset.to_json('mcqa_test_dataset_halved.jsonl', orient="records")
eval_dataset.to_json('mcqa_validation_dataset_halved.jsonl', orient="records")


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

68263