In [1]:
from preprocess import *
import os

In [2]:
# Full catalogue of dataset names handled by this notebook. The subset that is
# actually turned into the UnifiedQA splits is UNIFIED_DATASETS below; whatever
# is left over becomes EVAL_DATA.
DATASETS = [
    "ai2_science_middle",
    "ai2_science_elementary",
    "arc_hard",
    "arc_easy",
    "mctest",
    "mctest_corrected_the_separator",
    "natural_questions",
    "quoref",
    "squad1_1",
    "squad2",
    "boolq",
    "multirc",
    "newsqa",
    "race_string",
    "ropes",
    "drop",
    "narrativeqa",
    "openbookqa",
    "qasc",
    "boolq_np",
    "contrast_sets_boolq",
    "contrast_sets_drop",
    "contrast_sets_quoref",
    "contrast_sets_ropes",
    "commonsenseqa",
    "qasc_with_ir",
    "openbookqa_with_ir",
    "arc_easy_with_ir",
    "arc_hard_with_ir",
    "ambigqa",
    "natural_questions_direct_ans",
    "natural_questions_with_dpr_para",
    "winogrande_xs",
    "winogrande_s",
    "winogrande_m",
    "winogrande_l",
    "winogrande_xl",
    "social_iqa",
    "physical_iqa",
]

# Datasets passed to make_unified_qa_dataset when the train/dev/test splits are built.
UNIFIED_DATASETS = [
    "narrativeqa",
    "ai2_science_middle",
    "ai2_science_elementary",
    "arc_hard",
    "arc_easy",
    "mctest_corrected_the_separator",
    "squad1_1",
    "squad2",
    "boolq",
    "race_string",
    "openbookqa",
]

# Unseen data: every catalogue entry that is not in UNIFIED_DATASETS.
# Filtering DATASETS (instead of subtracting sets) keeps the catalogue order, so
# the list is identical on every kernel restart; a set difference of strings is
# ordered by randomized hashes and would shuffle between runs.
EVAL_DATA = [name for name in DATASETS if name not in UNIFIED_DATASETS]


In [3]:
# Build the raw UnifiedQA train / dev / test collections for UNIFIED_DATASETS.
# Both directories are identical for every split, so they are named once here.
# NOTE(review): these are absolute paths on the author's machine; the meaning of
# the positional `True` flag is defined in preprocess.make_unified_qa_dataset.
unified_qa_data_dir = r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\data"
unified_qa_unprocessed_dir = r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\unprocessed"

print("Making UnifiedQA Train Dataset")
unified_qa_train = make_unified_qa_dataset(
    UNIFIED_DATASETS, unified_qa_data_dir, "train", True, file_path=unified_qa_unprocessed_dir
)

print("Making UnifiedQA Dev Dataset")
unified_qa_dev = make_unified_qa_dataset(
    UNIFIED_DATASETS, unified_qa_data_dir, "dev", True, file_path=unified_qa_unprocessed_dir
)

print("Making UnifiedQA Test Dataset")
unified_qa_test = make_unified_qa_dataset(
    UNIFIED_DATASETS, unified_qa_data_dir, "test", True, file_path=unified_qa_unprocessed_dir
)




Making UnifiedQA Train Dataset
Making UnifiedQA Dev Dataset
Making UnifiedQA Test Dataset


In [4]:
# Report how many NarrativeQA questions were read for each split. The same
# three-line report (banner, count, rule) is printed per split, in train/dev/test order.
reading_check_rule = '---------------------------------------------------------------'
for split_label, split_collection in (
    ('TRAIN', unified_qa_train),
    ('DEV', unified_qa_dev),
    ('TEST', unified_qa_test),
):
    print('SANITY CHECK FOR READING ' + split_label + ' DATASET')
    print(len(split_collection['narrativeqa']['question']))
    print(reading_check_rule)

SANITY CHECK FOR READING TRAIN DATASET
65494
---------------------------------------------------------------
SANITY CHECK FOR READING DEV DATASET
6922
---------------------------------------------------------------
SANITY CHECK FOR READING TEST DATASET
21114
---------------------------------------------------------------


In [5]:
# Produce the chat-formatted train / dev / test splits for each model family.
# Within a family every call uses the same formatting flags and output
# directory; only the input split and the file name change, so the shared
# keyword arguments are collected once per family.
# NOTE(review): judging by the recorded sample outputs, append_bos yields a
# leading "<bos>" and append_s a leading "<s>" — confirm in preprocess.py.
gemma_preprocess_options = dict(
    append_instruction_gemma=True,
    append_instruction_llama=False,
    append_instruction_mistral=False,
    append_s=False,
    append_bos=True,
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Gemma",
    enable_load=True,
)
llama_preprocess_options = dict(
    append_instruction_gemma=False,
    append_instruction_llama=True,
    append_instruction_mistral=False,
    append_s=True,
    append_bos=False,
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Llama",
    enable_load=True,
)
mistral_preprocess_options = dict(
    append_instruction_gemma=False,
    append_instruction_llama=False,
    append_instruction_mistral=True,
    append_s=True,
    append_bos=False,
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Mistral",
    enable_load=True,
)

gemma_chat_preprocess_train = preprocess_unified_qa_dataset(datasets=unified_qa_train, file_name='train', **gemma_preprocess_options)
gemma_chat_preprocess_dev = preprocess_unified_qa_dataset(datasets=unified_qa_dev, file_name='dev', **gemma_preprocess_options)
gemma_chat_preprocess_test = preprocess_unified_qa_dataset(datasets=unified_qa_test, file_name='test', **gemma_preprocess_options)

llama_chat_preprocess_train = preprocess_unified_qa_dataset(datasets=unified_qa_train, file_name='train', **llama_preprocess_options)
llama_chat_preprocess_dev = preprocess_unified_qa_dataset(datasets=unified_qa_dev, file_name='dev', **llama_preprocess_options)
llama_chat_preprocess_test = preprocess_unified_qa_dataset(datasets=unified_qa_test, file_name='test', **llama_preprocess_options)

mistral_chat_preprocess_train = preprocess_unified_qa_dataset(datasets=unified_qa_train, file_name='train', **mistral_preprocess_options)
mistral_chat_preprocess_dev = preprocess_unified_qa_dataset(datasets=unified_qa_dev, file_name='dev', **mistral_preprocess_options)
mistral_chat_preprocess_test = preprocess_unified_qa_dataset(datasets=unified_qa_test, file_name='test', **mistral_preprocess_options)



C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Gemma\train.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Gemma\dev.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Gemma\test.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Llama\train.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Llama\dev.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Llama\test.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Mistral\train.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Mistral\dev.json
C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\preprocessed\Mistral\test.json


In [6]:
# Show the first Gemma-formatted NarrativeQA example, one field at a time.
# All three values come from the *train* split; the earlier banners claimed
# TRAIN / DEV / TEST, which did not match what was printed, so each banner now
# names the field being displayed instead.
gemma_check_rule = '---------------------------------------------------------------'
for gemma_field in ('question', 'answer', 'text'):
    print('SANITY CHECK FOR PREPROCESSING TRAIN DATASET: ' + gemma_field.upper())
    print(gemma_chat_preprocess_train['narrativeqa'][gemma_field][0])
    print(gemma_check_rule)

SANITY CHECK FOR PREPROCESSING TRAIN DATASET
<bos><start_of_turn>user
who is miss delmer? \n  at madeline hall, an old mansion-house near southampton belonging to the wealthy de versely family, lives an elderly spinster miss delmar, the aunt of the earl de versely and captain delmar. miss delmar invites arabella mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in the house. captain delmar is known to visit his aunt at madeline hall frequently, accompanied by his valet ben keene, who is also a private marine. captain delmar eventually suggests that ben should propose to arabella, and the two marry in secret, to the frustration of miss delmar and arabella's mother. the captain is able to smooth over the situation with his aunt, even after it is discovered that arabella was six months pregnant at the time of the marriage. she later gives birth to a boy, who takes the captain's christian name and ben's surname--the titular percival keene.the fam

In [7]:
# Show the first Llama-formatted NarrativeQA example, one field at a time.
# All three values come from the *train* split; the earlier banners claimed
# TRAIN / DEV / TEST, which did not match what was printed, so each banner now
# names the field being displayed instead.
llama_check_rule = '---------------------------------------------------------------'
for llama_field in ('question', 'answer', 'text'):
    print('SANITY CHECK FOR PREPROCESSING TRAIN DATASET: ' + llama_field.upper())
    print(llama_chat_preprocess_train['narrativeqa'][llama_field][0])
    print(llama_check_rule)

SANITY CHECK FOR PREPROCESSING TRAIN DATASET
<s>who is miss delmer? \n  at madeline hall, an old mansion-house near southampton belonging to the wealthy de versely family, lives an elderly spinster miss delmar, the aunt of the earl de versely and captain delmar. miss delmar invites arabella mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in the house. captain delmar is known to visit his aunt at madeline hall frequently, accompanied by his valet ben keene, who is also a private marine. captain delmar eventually suggests that ben should propose to arabella, and the two marry in secret, to the frustration of miss delmar and arabella's mother. the captain is able to smooth over the situation with his aunt, even after it is discovered that arabella was six months pregnant at the time of the marriage. she later gives birth to a boy, who takes the captain's christian name and ben's surname--the titular percival keene.the family moves to chatham, 

In [8]:
# Show the first Llama-formatted NarrativeQA example, one field at a time.
# All three values come from the *train* split; the earlier banners claimed
# TRAIN / DEV / TEST, which did not match what was printed, so each banner now
# names the field being displayed instead.
# NOTE(review): this cell inspects the same Llama variable as the cell directly
# above it and looks like an accidental duplicate — consider removing one.
llama_repeat_check_rule = '---------------------------------------------------------------'
for llama_repeat_field in ('question', 'answer', 'text'):
    print('SANITY CHECK FOR PREPROCESSING TRAIN DATASET: ' + llama_repeat_field.upper())
    print(llama_chat_preprocess_train['narrativeqa'][llama_repeat_field][0])
    print(llama_repeat_check_rule)

SANITY CHECK FOR PREPROCESSING TRAIN DATASET
<s>who is miss delmer? \n  at madeline hall, an old mansion-house near southampton belonging to the wealthy de versely family, lives an elderly spinster miss delmar, the aunt of the earl de versely and captain delmar. miss delmar invites arabella mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in the house. captain delmar is known to visit his aunt at madeline hall frequently, accompanied by his valet ben keene, who is also a private marine. captain delmar eventually suggests that ben should propose to arabella, and the two marry in secret, to the frustration of miss delmar and arabella's mother. the captain is able to smooth over the situation with his aunt, even after it is discovered that arabella was six months pregnant at the time of the marriage. she later gives birth to a boy, who takes the captain's christian name and ben's surname--the titular percival keene.the family moves to chatham, 

In [9]:
# Show the first Mistral-formatted NarrativeQA example, one field at a time.
# All three values come from the *train* split; the earlier banners claimed
# TRAIN / DEV / TEST, which did not match what was printed, so each banner now
# names the field being displayed instead.
mistral_check_rule = '---------------------------------------------------------------'
for mistral_field in ('question', 'answer', 'text'):
    print('SANITY CHECK FOR PREPROCESSING TRAIN DATASET: ' + mistral_field.upper())
    print(mistral_chat_preprocess_train['narrativeqa'][mistral_field][0])
    print(mistral_check_rule)

SANITY CHECK FOR PREPROCESSING TRAIN DATASET
<s>[INST] who is miss delmer? \n  at madeline hall, an old mansion-house near southampton belonging to the wealthy de versely family, lives an elderly spinster miss delmar, the aunt of the earl de versely and captain delmar. miss delmar invites arabella mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in the house. captain delmar is known to visit his aunt at madeline hall frequently, accompanied by his valet ben keene, who is also a private marine. captain delmar eventually suggests that ben should propose to arabella, and the two marry in secret, to the frustration of miss delmar and arabella's mother. the captain is able to smooth over the situation with his aunt, even after it is discovered that arabella was six months pregnant at the time of the marriage. she later gives birth to a boy, who takes the captain's christian name and ben's surname--the titular percival keene.the family moves to ch

In [10]:
# Log in to the Hugging Face Hub before the tokenization cells below, which name
# Hub checkpoints such as "google/gemma-7b" and "meta-llama/Llama-2-7b-hf".
# NOTE(review): presumably required because some of those checkpoints need an
# approved account — confirm. This import sits mid-notebook rather than in the
# first cell, so a fresh kernel must run this cell before the ones that follow.
from huggingface_hub import notebook_login
notebook_login() # renders a login widget; enter your access token there rather than writing it into the notebook

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:


# Tokenize the Gemma-formatted train / dev / test splits.
# "google/gemma-7b" is passed as the second positional argument (presumably the
# tokenizer checkpoint — confirm in preprocess.load_dataset). Arguments shared
# by all three calls are collected once; only the split and file name vary.
gemma_tokenize_options = dict(
    pad_token=False,
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\tokenized\Gemma",
    enable_load=True,
)
tokenized_gemma_train = load_dataset(gemma_chat_preprocess_train, "google/gemma-7b", file_name='train', **gemma_tokenize_options)
tokenized_gemma_dev = load_dataset(gemma_chat_preprocess_dev, "google/gemma-7b", file_name='dev', **gemma_tokenize_options)
tokenized_gemma_test = load_dataset(gemma_chat_preprocess_test, "google/gemma-7b", file_name='test', **gemma_tokenize_options)

In [1]:
# Tokenize the Mistral-formatted train / dev / test splits. Not yet tested.
# NOTE(review): the previous note here said "waiting for approval from meta",
# but the checkpoint is "mistralai/Mistral-7B-v0.1"; that wording looks copied
# from the Llama cell. The NameError recorded under this cell comes from running
# it in a kernel where load_dataset was not defined (its execution count is 1),
# so run the import and preprocessing cells first.
mistral_tokenize_options = dict(
    pad_token=True,
    pad_side='right',
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\tokenized\Mistral",
    enable_load=True,
)
tokenized_mistral_train = load_dataset(mistral_chat_preprocess_train, "mistralai/Mistral-7B-v0.1", file_name='train', **mistral_tokenize_options)
tokenized_mistral_dev = load_dataset(mistral_chat_preprocess_dev, "mistralai/Mistral-7B-v0.1", file_name='dev', **mistral_tokenize_options)
tokenized_mistral_test = load_dataset(mistral_chat_preprocess_test, "mistralai/Mistral-7B-v0.1", file_name='test', **mistral_tokenize_options)

NameError: name 'load_dataset' is not defined

In [11]:
# Tokenize the Llama-formatted train / dev / test splits.
# Status when written: not yet tested, waiting for approval from Meta
# (presumably access to the "meta-llama/Llama-2-7b-hf" checkpoint — confirm).
# Arguments shared by all three calls are collected once; only the split and
# file name vary.
llama_tokenize_options = dict(
    pad_token=True,
    pad_side='right',
    file_path=r"C:\Programming\DL Research\Efficient-LLM-Benchmark\UnifiedQA Data Curation\tokenized\Llama",
    enable_load=True,
)
tokenized_llama_train = load_dataset(llama_chat_preprocess_train, "meta-llama/Llama-2-7b-hf", file_name='train', **llama_tokenize_options)
tokenized_llama_dev = load_dataset(llama_chat_preprocess_dev, "meta-llama/Llama-2-7b-hf", file_name='dev', **llama_tokenize_options)
tokenized_llama_test = load_dataset(llama_chat_preprocess_test, "meta-llama/Llama-2-7b-hf", file_name='test', **llama_tokenize_options)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]