In [1]:
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
import pandas as pd
from transformers import ( 
                        Trainer,
                        TrainingArguments,
                        AutoTokenizer,
                        AutoConfig,
                        AutoModelForCausalLM,
                        AutoModelForMultipleChoice,
                        AutoModelForSeq2SeqLM,
                        default_data_collator,
                        get_linear_schedule_with_warmup)
  
# model_path = "vinai/PhoGPT-7B5-Instruct" 
mmlu_dataset = load_dataset("lukaemon/mmlu", "elementary_mathematics", cache_dir="./cache")
mathqa_dataset = load_dataset("math_qa", cache_dir="./cache")
merged_dataset = mmlu_dataset["train"]
for subset_name, subset_data in mmlu_dataset.items():
    if subset_name != 'train':
        merged_dataset = concatenate_datasets([merged_dataset, subset_data])

eng_dataset = load_from_disk("processed_train_eng.hf")
eng_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 1080
    })
    test: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 120
    })
})

In [2]:
mathqa_concat_dataset = mathqa_dataset["train"]
for subset_name, subset_data in mathqa_dataset.items():
    if subset_name != 'train':
        mathqa_concat_dataset = concatenate_datasets([mathqa_concat_dataset, subset_data])

In [3]:
mathqa_concat_dataset[5]

{'Problem': 'an empty fuel tank with a capacity of 218 gallons was filled partially with fuel a and then to capacity with fuel b . fuel a contains 12 % ethanol by volume and fuel b contains 16 % ethanol by volume . if the full fuel tank contains 30 gallons of ethanol , how many gallons of fuel a were added ?',
 'Rationale': '"say there are a gallons of fuel a in the tank , then there would be 218 - a gallons of fuel b . the amount of ethanol in a gallons of fuel a is 0.12 a ; the amount of ethanol in 218 - a gallons of fuel b is 0.16 ( 218 - a ) ; since the total amount of ethanol is 30 gallons then 0.12 a + 0.16 ( 218 - a ) = 30 - - > a = 122 . answer : a ."',
 'options': 'a ) 122 , b ) 150 , c ) 100 , d ) 80 , e ) 50',
 'correct': 'a',
 'annotated_formula': 'divide(subtract(multiply(218, divide(16, const_100)), 30), subtract(divide(16, const_100), divide(12, const_100)))',
 'linear_formula': 'divide(n2,const_100)|divide(n1,const_100)|multiply(n0,#0)|subtract(#0,#1)|subtract(#2,n3)|di

In [4]:
mathqa_concat_dataset = mathqa_concat_dataset.rename_columns({
    "Problem": "question",
    "Rationale": "explanation",
    "correct": "clean_answer",
    "options": "choices"
})

mathqa_concat_dataset = mathqa_concat_dataset.remove_columns(column_names=["annotated_formula", "linear_formula", "category"])
mathqa_concat_dataset

Dataset({
    features: ['question', 'explanation', 'choices', 'clean_answer'],
    num_rows: 37297
})

In [5]:
mathqa_concat_dataset[63]

{'question': 'a small table has a length of 12 inches and a breadth of b inches . cubes are placed on the surface of the table so as to cover the entire surface . the maximum side of such cubes is found to be 4 inches . also , a few such tables are arranged to form a square . the minimum length of side possible for such a square is 80 inches . find b .',
 'explanation': 'from the info that the maximum sides of the cubes is 4 , we know that the gcf of 12 ( = 2 ^ 2 * 3 ) andbis 4 ( = 2 ^ 2 ) , sob = 2 ^ x , where x > = 2 . from the second premise , we know that the lcm of 12 ( 2 ^ 2 * 3 ) andbis 80 ( 2 ^ 4 * 5 ) , sob = 2 ^ 4 or 2 ^ 4 * 5 ( 16 or 80 ) . combining 2 premises shows the answer is b ( 16 ) .',
 'choices': "['a ) 8', 'b ) 16', 'c ) 24', 'd ) 32', 'e ) 48']",
 'clean_answer': 'b'}

In [6]:
set(mathqa_concat_dataset["clean_answer"])

{'a', 'b', 'c', 'd', 'e'}

In [7]:
import re
import random

choices = {choice: i for i, choice in enumerate("ABCD")} 
idx2choices = {i: choice for i, choice in enumerate("ABCD")} 
pattern = re.compile("[abcde] \)")
def process_options(example):
    options = example["choices"].split(", ")
    example["choices"] = options
    if example["clean_answer"] == "e":
        idx = random.randint(0, 3)
        options[idx], options[-1] = options[-1], options[idx]
        example["clean_answer"] = idx2choices[idx]
    for choice, i in choices.items():
        example[choice] = re.sub(pattern, "", options[i]).replace("'", "").replace("]", "").replace("[", "").strip()
    
    example["clean_answer"] = example["clean_answer"].upper() 
    return example

mathqa_concat_dataset = mathqa_concat_dataset.map(process_options)
mathqa_concat_dataset

Dataset({
    features: ['question', 'explanation', 'choices', 'clean_answer', 'A', 'B', 'C', 'D'],
    num_rows: 37297
})

In [8]:
set(mathqa_concat_dataset["clean_answer"])

{'A', 'B', 'C', 'D'}

In [9]:
merged_dataset = merged_dataset.rename_columns({
    "input": "question",
    "target": "clean_answer",
})
merged_dataset = merged_dataset.add_column("explanation", [""] * len(merged_dataset))
merged_dataset

Dataset({
    features: ['question', 'A', 'B', 'C', 'D', 'clean_answer', 'explanation'],
    num_rows: 421
})

In [10]:
eng_dataset["train"] = concatenate_datasets([eng_dataset["train"], merged_dataset, mathqa_concat_dataset])
eng_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 38798
    })
    test: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 120
    })
})

In [11]:
device = "cuda:0"
model_path = "microsoft/deberta-v3-large"
model = AutoModelForMultipleChoice.from_pretrained(model_path, cache_dir="./cache")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, cache_dir="./cache")  
model.to(device)

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForMultipleChoice(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True

In [12]:
model.config.vocab_size

128100

In [13]:
old_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, cache_dir="./cache")  

def get_training_corpus():
    dataset = eng_dataset["train"]
    for start_idx in range(0, len(dataset), 200):
        samples = dataset[start_idx : start_idx + 200]
        yield " ".join(["".join([samples[col][i] for col in ["question", "explanation", "A", "B", "C", "D"]])
                        for i in range(len(samples) - 1)])

training_corpus = get_training_corpus()
my_tokenizer = tokenizer.train_new_from_iterator(training_corpus, 140000)





In [14]:
new_tokens = set(my_tokenizer.vocab.keys()) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(my_tokenizer.vocab))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))

Embedding(131442, 1024)

In [15]:
# LENGTH OF CONTEXT PLUS QUESTION ANSWER
MAX_INPUT = 256
option_to_index = {option: idx for idx, option in enumerate('ABCD')}
index_to_option = {v: k for k,v in option_to_index.items()}
def preprocess(example):
    # nums = len(example["choices"])
    nums = 4
    choices = "ABCD" if nums == 4 else "ABC"
    explain = "" if example['explanation'] is None else example['explanation']
    first_sentence = [ "[CLS] " + explain] * nums
    second_sentences = [" #### " + example['question'] + " [SEP] " + example[option] + " [SEP]" for option in choices]
    tokenized_example = tokenizer(first_sentence, second_sentences, 
                                  truncation='only_first' 
                                  if len(second_sentences[0]) < len(first_sentence[0]) else "only_second", 
                                  max_length=MAX_INPUT, add_special_tokens=False, padding="max_length")
    tokenized_example['label'] = option_to_index[example['clean_answer']]
    
    return tokenized_example

In [16]:
eng_dataset["train"][677]

{'question': 'Every grocery store offers two types of water, small bottles of 0.45 liters, large bottles of 0.75 liters. The store offers 20 small bottles and 15 large bottles. The store has provided the number of liters of water:',
 'choices': ['A. 20,25 lít', 'B. 19,75 lít', 'C. 21,75 lít', 'D. 22,15 lít'],
 'explanation': "It's not like we're going to have to do this.",
 'answer': 'A. 20,25 lít',
 'clean_choices': ['20,25 lít', '19,75 lít', '21,75 lít', '22,15 lít'],
 'clean_answer': 'A',
 'A': '20.25 liters',
 'B': '19.75 liters',
 'C': '21.75 liters',
 'D': '22.15 liters',
 'id': '890'}

In [17]:
tokenized_dataset = eng_dataset.map(preprocess, remove_columns=eng_dataset["train"].column_names, batch_size=16)

Map: 100%|██████████| 38798/38798 [02:10<00:00, 297.80 examples/s]
Map: 100%|██████████| 120/120 [00:00<00:00, 216.90 examples/s]


In [18]:
# from peft import   (get_peft_config, 
#                     get_peft_model, 
#                     PromptTuningInit, 
#                     PromptTuningConfig, 
#                     TaskType, 
#                     PeftType, 
#                     LoraConfig)

# peft_config = LoraConfig(
#     r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1, 
#     bias="none", inference_mode=False, 
#     target_modules=["query_proj", "value_proj"],
#     modules_to_save=['classifier','pooler'],
# )
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

# # DEBERTA LARGE HAS TOTAL OF 24 LAYERS
# FREEZE_LAYERS = 18
# # BOOLEAN TO FREEZE EMBEDDINGS
# FREEZE_EMBEDDINGS = True

# if FREEZE_EMBEDDINGS:
#     print('Freezing embeddings.')
#     for param in model.deberta.embeddings.parameters():
#         param.requires_grad = False
# if FREEZE_LAYERS>0:
#     print(f'Freezing {FREEZE_LAYERS} layers.')
#     for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
#         for param in layer.parameters():
#             param.requires_grad = False

In [19]:
len(model.deberta.encoder.layer)

24

In [20]:
import numpy as np
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions
    labels = p.label_ids
    return {
        "accuracy": sum(predictions.argmax(axis=1) == labels) / len(labels), 
        "map@3": map_at_3(predictions.tolist(), labels.tolist())}

In [21]:
VER = 5

training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    report_to='none',
    output_dir = f'./checkpoints/checkpoints_{VER}',
    overwrite_output_dir=True,
    # fp16=True,
    gradient_accumulation_steps=8,
    logging_steps=25,
    evaluation_strategy='steps',
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=2,
)

In [22]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


In [23]:
from transformers import EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

trainer.train()
trainer.save_model(f'./best_model/model_v{VER}')

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




In [24]:
trainer = Trainer(model=model)

In [25]:
import numpy as np
test_predictions = trainer.predict(tokenized_dataset["test"]).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]
# predictions_as_string = test_df['prediction'] = [
#     ' '.join(row) for row in predictions_as_answer_letters[:, :3]
# ]

sum([index_to_option[i] for i in tokenized_dataset["test"]["label"]] == predictions_as_answer_letters[:, :1].squeeze())



50

In [26]:
len(tokenized_dataset["test"]["label"])

120

In [27]:
import numpy as np
test_predictions = trainer.predict(tokenized_dataset["test"]).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]

sum([index_to_option[i] for i in tokenized_dataset["test"]["label"]] == predictions_as_answer_letters[:, :1].squeeze())

50

In [28]:
from transformers import ( 
                        AutoTokenizer,
                        AutoModelForSeq2SeqLM)
import torch
import re
import json 

test = json.loads(open('data/math_test.json').read())["data"]
test_df = pd.DataFrame(test)
test_df.set_index("id", inplace=True)

pattern = re.compile("[ABCD].")

def process(texts):
    return [pattern.split(text)[-1].strip() for text in texts]
test_df["clean_choices"] = test_df["choices"].apply(process)

choices = {choice: i for i, choice in enumerate("ABCD")} 

def make_choice(df, choice):
    idx = choices[choice]
    df[choice] = df["clean_choices"].apply(lambda x: x[idx] if idx < len(x) else "")
    return df
for choice in choices.keys():
    make_choice(test_df, choice)

test_dataset = Dataset.from_pandas(test_df)


tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", src_lang="vi_VN", cache_dir="./cache")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2", cache_dir="./cache")
device_vi2en = torch.device("cuda")
model_vi2en.to(device_vi2en)

def translate_vi2en(example) -> str:
    for col in ["question", "A", "B", "C", "D"]:
        input_ids = tokenizer_vi2en(example[col], padding=True, return_tensors="pt").to(device_vi2en)
        output_ids = model_vi2en.generate(
            **input_ids,
            decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
            num_return_sequences=1,
            num_beams=5,
            early_stopping=True
        )
        example[col] = tokenizer_vi2en.batch_decode(output_ids, skip_special_tokens=True)
        # Free GPU memory
        del input_ids
        del output_ids
        torch.cuda.empty_cache()
    return example

test_dataset = test_dataset.map(translate_vi2en, batched=True, batch_size=32)

Map: 100%|██████████| 189/189 [00:19<00:00,  9.72 examples/s]


In [29]:
def preprocess(example):
    # nums = len(example["choices"])
    nums = 4
    choices = "ABCD" if nums == 4 else "ABC"
    first_sentence = [example['question']] * nums
    second_sentences = [example[option] for option in choices]
    tokenized_example = tokenizer(first_sentence, second_sentences, 
                                  truncation=True,max_length=MAX_INPUT, padding="max_length") 
                                #   if len(second_sentences[0]) < len(first_sentence[0]) else "only_second", 
                                #   max_length=MAX_INPUT, add_special_tokens=False, padding="max_length")
    # tokenized_example['label'] = option_to_index[example['clean_answer']]
    
    return tokenized_example

tokenized_test_dataset = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

Map: 100%|██████████| 189/189 [00:00<00:00, 546.51 examples/s]


In [30]:
tokenized_test_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 189
})

In [31]:
import numpy as np
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]

In [32]:
# test_df.drop(columns=test_df.columns, inplace=True)
test_df["id_ans"] = predictions_as_ids.squeeze().tolist()
test_df

Unnamed: 0_level_0,question,choices,clean_choices,A,B,C,D,id_ans
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01-0203,Một cửa hàng đã bán 30% số hàng hiện có và thu...,"[A. 4 500 000 đồng, B. 45 000 000 đồng, C. 50 ...","[4 500 000 đồng, 45 000 000 đồng, 50 000 000 đ...",4 500 000 đồng,45 000 000 đồng,50 000 000 đồng,450 000 000 đồng,"[0, 3, 1, 2]"
01-0206,Một người đi xe đạp từ A lúc 7 giờ với vận tốc...,"[A. 24 phút, B. 1 giờ, C. 7 giờ 24 phút, D. 8 ...","[24 phút, 1 giờ, 7 giờ 24 phút, 8 giờ 24 phút]",24 phút,1 giờ,7 giờ 24 phút,8 giờ 24 phút,"[1, 3, 2, 0]"
01-0207,Cạnh của hình lập phương gấp lên 2 lần thì diệ...,"[A. 2 lần, B. 4 lần, C. 6 lần, D. 8 lần]","[2 lần, 4 lần, 6 lần, 8 lần]",2 lần,4 lần,6 lần,8 lần,"[1, 2, 3, 0]"
01-0209,"Một thửa ruộng hình thang có đáy bé dài 8m, đá...","[A. 125m^{2}, B. 20%, C. 25%, D. 50%]","[125m^{2}, 20%, 25%, 50%]",125m^{2},20%,25%,50%,"[2, 3, 1, 0]"
01-0210,Một xe lửa vượt qua cái cầu dài 450m hết 45 gi...,"[A. 3m, B. 200m, C. 200m, D. 225m]","[3m, 200m, 200m, 225m]",3m,200m,200m,225m,"[3, 1, 2, 0]"
...,...,...,...,...,...,...,...,...
01-0698,"Kết quả phép nhân 4,51 \times 10 là:","[A. 451, B. 4,51, C. 45,1, D. 45]","[451, 4,51, 45,1, 45]",451,451,451,45,"[3, 1, 2, 0]"
01-0703,"Lớp 5/2 có 32 học sinh, trong đó có 12 học sin...","[A. 375 %, B. 37,5 %, C. 3,75 %, D. 0,375 %]","[375 %, 37,5 %, 3,75 %, 0,375 %]",375 %,"37,5 %","3,75 %","0,375 %","[1, 2, 0, 3]"
01-0715,Số thập phân thích hợp để điền vào chỗ chấm: 4...,"[A. 45,62, B. 4,562, C. 456,2, D. 4562]","[45,62, 4,562, 456,2, 4562]",4562,4562,4562,4562,"[3, 2, 0, 1]"
01-0716,"Kết quả của X trong biểu thức: X \div 2,04 = ...","[A. 3,03, B. 3,04, C. 3,05, D. 3,06]","[3,03, 3,04, 3,05, 3,06]",303,304,305,306,"[2, 3, 1, 0]"


In [33]:
test_df["answer"] = test_df.apply(lambda x: [x["choices"][int(i)] for i in x["id_ans"] if int(i) < len(x["choices"])][0], axis=1)
test_df.drop(columns=["question", "choices", "clean_choices", "A", "B", "C", "D", "id_ans"], inplace=True)
test_df

Unnamed: 0_level_0,answer
id,Unnamed: 1_level_1
01-0203,A. 4 500 000 đồng
01-0206,B. 1 giờ
01-0207,B. 4 lần
01-0209,C. 25%
01-0210,D. 225m
...,...
01-0698,D. 45
01-0703,"B. 37,5 %"
01-0715,D. 4562
01-0716,"C. 3,05"


In [34]:
test_df.to_csv(f"./submissions/submission_{VER}.csv")

In [35]:
# model = AutoModelForMultipleChoice.from_pretrained(model_path)
# model = get_peft_model(model, peft_config)
# checkpoint = torch.load(f'model_v{VER}/pytorch_model.bin')
# model.load_state_dict(checkpoint)