In [1]:
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
import pandas as pd
import random
from transformers import ( 
                        Trainer,
                        TrainingArguments,
                        AutoTokenizer,
                        AutoConfig,
                        AutoModel,
                        AutoModelForCausalLM,
                        AutoModelForMultipleChoice,
                        AutoModelForSeq2SeqLM,
                        default_data_collator,
                        get_linear_schedule_with_warmup)
import warnings
warnings.filterwarnings("ignore")
  

eng_dataset = load_from_disk("processed.hf")
eng_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 1080
    })
    test: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 120
    })
})

In [13]:
from transformers import AutoModel, OPTForCausalLM
device = "cuda:0"
model_path = "facebook/galactica-6.7b"
# model = AutoModelForMultipleChoice.from_pretrained(model_path, cache_dir="./cache")
model = OPTForCausalLM.from_pretrained(model_path, cache_dir="./cache")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, cache_dir="./cache")  
model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.15s/it]


OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50000, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): GELUActivation()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear(in_features=16384, out_features=4096, bias=True)
          (final_layer_norm): La

In [14]:
model.config.vocab_size

50000

In [15]:
# old_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, cache_dir="./cache")  

# def get_training_corpus():
#     dataset = eng_dataset["train"]
#     for start_idx in range(0, len(dataset), 200):
#         samples = dataset[start_idx : start_idx + 200]
#         yield " ".join(["".join([samples[col][i] for col in ["question", "explanation", "A", "B", "C", "D"]])
#                         for i in range(len(samples) - 1)])

# training_corpus = get_training_corpus()
# my_tokenizer = tokenizer.train_new_from_iterator(training_corpus, 140000)

In [16]:
# new_tokens = set(my_tokenizer.vocab.keys()) - set(tokenizer.vocab.keys())

# # add the tokens to the tokenizer vocabulary
# tokenizer.add_tokens(list(my_tokenizer.vocab))

# # add new, random embeddings for the new tokens
# model.resize_token_embeddings(len(tokenizer))

In [17]:
eng_dataset["train"][38000:38010]

{'question': ['65 % of x = 20 % of 682.50 . find the value of x ?',
  'a palindrome is a number that reads the same front - to - back as it does back - to - front ( e . g . 202 , 575 , 1991 , etc . ) p is the smallest integer greater than 200 that is both a prime and a palindrome . what is the sum of the digits of p ?',
  'a car gets 40 kilometers per gallon of gasoline . how many gallons of gasoline would the car need to travel 120 kilometers ?',
  'a ranch has both horses and ponies . exactly 5 / 7 of the ponies have horseshoes , and exactly 2 / 3 of the ponies with horseshoes are from iceland . if there are 4 more horses than ponies , what is the minimum possible combined number of horses and ponies on the ranch ?',
  'a fruit seller had some oranges . he sells 40 % oranges and still has 420 oranges . how many oranges he had originally ?',
  'if x ^ 2 + y ^ 2 = 16 and xy = 3 , then ( x − y ) ^ 2 =',
  'on the first day of her vacation , louisa traveled 160 miles . on the second day 

In [18]:
from transformers import ( 
                        AutoTokenizer,
                        AutoModelForSeq2SeqLM)
import torch
import re
import json 

test = json.loads(open('data/math_test.json').read())["data"]
test_df = pd.DataFrame(test)
test_df.set_index("id", inplace=True)

pattern = re.compile("[ABCD].")

def process(texts):
    return [pattern.split(text)[-1].strip() for text in texts]
test_df["clean_choices"] = test_df["choices"].apply(process)

choices = {choice: i for i, choice in enumerate("ABCD")} 

def make_choice(df, choice):
    idx = choices[choice]
    df[choice] = df["clean_choices"].apply(lambda x: x[idx] if idx < len(x) else "")
    return df
for choice in choices.keys():
    make_choice(test_df, choice)

test_dataset = Dataset.from_pandas(test_df)


tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", src_lang="vi_VN", cache_dir="./cache")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2", cache_dir="./cache")
device_vi2en = torch.device("cuda")
model_vi2en.to(device_vi2en)

def preprocess_text(text):
    number_replacements = []
    def replace_numbers(match):
        number = match.group(0)
        number_replacements.append(number)
        return "NUMBER"

    # Replace numbers with a placeholder
    text = re.sub(r'(\d+)', replace_numbers, text)
    return text, number_replacements

def restore_numbers(translated_text, number_replacements):
    for number in number_replacements:
        translated_text = translated_text.replace("NUMBER", number, 1)
    return translated_text

def translate_vi2en(example) -> str:
    # TODO: không dịch các con số (trong cả options và questions) vì có thể bị nhiễu
    for col in ["question"]:
        # preprocessed_text, number_replacements = preprocess_text(example[col])
        input_ids = tokenizer_vi2en(example[col], padding=True, return_tensors="pt").to(device_vi2en)
        output_ids = model_vi2en.generate(
            **input_ids,
            decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
            num_return_sequences=1,
            num_beams=5,
            early_stopping=True
        )
        example[col] = tokenizer_vi2en.batch_decode(output_ids, skip_special_tokens=True)
        # example[col] = restore_numbers(example[col], number_replacements)
        # Free GPU memory
        del input_ids
        del output_ids
        torch.cuda.empty_cache()
    return example

test_dataset = test_dataset.map(translate_vi2en, batched=True, batch_size=32)

Map: 100%|██████████| 189/189 [00:11<00:00, 16.06 examples/s]


In [19]:
del model_vi2en
del tokenizer_vi2en
torch.cuda.empty_cache()

In [20]:
test_dataset["choices"][:10]

[['A. 4 500 000 đồng',
  'B. 45 000 000 đồng',
  'C. 50 000 000 đồng',
  'D. 450 000 000 đồng'],
 ['A. 24 phút', 'B. 1 giờ', 'C. 7 giờ 24 phút', 'D. 8 giờ 24 phút'],
 ['A. 2 lần', 'B. 4 lần', 'C. 6 lần', 'D. 8 lần'],
 ['A. 125m^{2}', 'B. 20%', 'C. 25%', 'D. 50%'],
 ['A. 3m', 'B. 200m', 'C. 200m', 'D. 225m'],
 ['A. 5,612', 'B. 5,621', 'C. 5,216', 'D. 5,126'],
 ['A. 40%', 'B. 4%', 'C. 21%', 'D. 49%'],
 ['A. 7 giờ 17 phút',
  'B. 7 giờ 77 phút',
  'C. 8 giờ 17 phút',
  'D. 7 giờ 77 phút'],
 ['A. 75', 'B. 0,7', 'C. 0,75', 'D. 7,5'],
 ['A. 67,910', 'B. 679', 'C. 67,919', 'D. 6,7919']]

In [21]:
def preprocess_text(text):
    number_replacements = []
    def replace_numbers(match):
        number = match.group(0)
        number_replacements.append(number)
        return "NUMBER"

    # Replace numbers with a placeholder
    text = re.sub(r'(\d+)', replace_numbers, text)
    return text, number_replacements

print(preprocess_text(test_dataset["choices"][9][0]))
print(test_dataset["choices"][9][0])

('A. NUMBER,NUMBER', ['67', '910'])
A. 67,910


In [22]:
# eng_dataset = load_from_disk("processed_train_eng.hf")

In [23]:
# # TODO: Dịch những từ trong vocab và map trở lại
# vocab = []
# for choices in eng_dataset["train"]["choices"]:
#     for choice in choices:
#         for word in choice.split():
#             if not word.isdigit() and not any(i.isdigit() for i in word):
#                 vocab.append(word)
#                 # if word == "đỏ":
#                 #     print(choices)
            
# set(vocab)

In [24]:
# re.findall("([a-z]+)", test_dataset["choices"][0][0].split())

---

In [25]:
from transformers import AutoTokenizer, TextStreamer
model_name = "Intel/neural-chat-7b-v3"     
prompt = """A train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual time to cover the journey. /
\nA. 2 hours\nB. 30 min\nC. 3 hours 20 min\nD. 2 hours 30 min"""

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir="./cache/")
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda:1")
streamer = TextStreamer(tokenizer)

model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="./cache/")
model.to("cuda:1")
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.06s/it]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> A train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual time to cover the journey. /

A. 2 hours
B. 30 min
C. 3 hours 20 min
D. 2 hours 30 min

Let the usual time to cover the journey be x hours.

According to the given condition, the train is walking at 6/7 of its usual speed and it is 20 minutes too late.

So, the effective speed = (6/7) * usual speed

Time taken to cover the journey = x + 20/60 (as 20 minutes = 20/60 hours)

According to the speed-time formula,

(6/7) * usual speed = (x + 20/60) ^ -1

(6/7) * usual speed = (x + 1/3) ^ -1

(6/7) * usual speed = (1/x) + (1/3x)

(6/7) * usual speed = (4x + 3x) / (3x)

(6/7) * usual speed = 7x / (3x)

usual speed = (7 * 3) / 6

usual speed = 7

Now, we have the usual speed, x can be calculated.

x = 1 / (6/7)

x = 7 / 6

x = 1.1666 hours (approximately)

So, the usual time


In [26]:
def make_prompt(question, choices, answer=None):
    ans = answer if answer else ""
    return f"""Question: {question}\
\n{choices}\nAnswer: {ans}"""

def _few_shot(sample, shots, k):
    def _make_choices(shot):
        choices = [shot[i] for i in "ABCD"]
        # Prepare multiple-choice input
        choices = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])
        return choices
    return "\n".join([make_prompt(shots["question"][i], _make_choices(shots[i]), shots["answer"][i]) 
                   for i in range(k)] + [make_prompt(sample["question"], _make_choices(sample))]) 
    
print(_few_shot(eng_dataset["train"][10], eng_dataset["train"], 3))

Question: “42 months =.........year” The appropriate number of electricity put into place is:
A. 4
B. 4,2
C. 3,5
D. 35
Answer: C. 3,5
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
Answer: D. 603
Question: "The area of a square with sides of 1.2dm is equal to.........dm3." The number of points to be inserted is:
A. 1,44
B. 1,728
C. 8,64
D. 5,76
Answer: B. 1,728
Question: That's 4 times less than 84.
A. 80
B. 41
C. 21
D. 20
Answer: 


In [27]:
# idx = random.randint(0, 1200) # len(eng_dataset["train"])
count = 0
num_samples = 1200
answers = []

# TODO: Kiểm tra các sample bị sai, vì sao sai?
bar = tqdm(range(0, num_samples))
for idx in bar:
    sample = eng_dataset["train"][idx]
    prompt = _few_shot(sample, eng_dataset["train"], 3)

    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda:1")
    # outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
    outputs = model.generate(inputs, do_sample=False, return_dict_in_generate=True, 
                             max_new_tokens=300, pad_token_id=tokenizer.eos_token_id)
    # print(tokenizer.decode(outputs.sequences[0]))
    answer = tokenizer.decode(outputs.sequences[0]).split("Answer: ")[-1] \
        .replace("</s>", "").replace("\n", "")
    
    answers.append(answer)
    if answer != None and (answer.split(".")[0] == sample["clean_answer"] or answer in sample["answer"]):
        count+=1
    # print(f'### ground truth: {eng_dataset["train"]["clean_answer"][idx]}')
    
    bar.set_postfix({"Accuracy" : count / (idx + 1)})

accuracy = count/num_samples
print(accuracy)

100%|██████████| 1200/1200 [16:00<00:00,  1.25it/s, Accuracy=0.428]

0.42833333333333334





In [28]:
answers

['3.5',
 '603',
 '1,728',
 'D. Ngày 4 tháng 6',
 '3,48',
 '4',
 '897654',
 '39 cách',
 '730m',
 'C. $\\frac{8}{10000}$',
 '20',
 '2,1',
 '52 000 kg',
 '999',
 '37,5 dm2',
 '30 046',
 '145 000',
 '6630 sản phẩm',
 '4936',
 '7,358',
 '42 759',
 '200 lần',
 '10 000 đồng',
 'B. t máy, cặp sách',
 '108 cm2',
 'B. Hàng phần nghìn',
 'Đề-ca-mét',
 '38',
 '11',
 '12,4',
 '15 phút',
 '15,65',
 '56',
 'C. 90',
 '3 ngày',
 '8451',
 '3',
 '25',
 'C. Số nghịch đảo của \\frac{-2}{3} là \\frac{-3}{-2}',
 '74,75',
 '62',
 '8km2',
 '18',
 '61,3',
 '832500',
 '4 500 ${\\times}$ 3',
 '6km',
 '50%',
 '62 059',
 '16 cm',
 '364 + 152',
 '103',
 '31,5',
 '5%',
 '9',
 '7206',
 '13 chiếc ti vi',
 '16 000 đồng',
 '7',
 '20 và 5',
 '0,045',
 '4, 5, 2, 3',
 '15 000 đồng',
 '194254',
 '1 234; 1 257; 5 617; 3 870',
 '6 lần, 9 lần',
 '60%',
 '31,10 lít',
 '24,66 m',
 '0,0015',
 '30 phút',
 '71 935',
 '10; 14; 16; 18',
 'D. Một trăm năm chục xăng ti mét vuông',
 '18 bạn',
 '0,5 m',
 '528',
 'B. Ngày 31 tháng 2',

In [36]:
test_ref = []
def inference(example):
    prompt = _few_shot(example, eng_dataset["train"], 3)

    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda:1")
    # outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
    outputs = model.generate(inputs, return_dict_in_generate=True, 
                             max_new_tokens=300, pad_token_id=tokenizer.eos_token_id)
    # print(tokenizer.decode(outputs.sequences[0]))
    answer = tokenizer.decode(outputs.sequences[0]).split("Answer: ")[-1] \
        .replace("</s>", "").replace("\n", "")
    
    test_ref.append(answer)
    example["answer"] = None
    for e in example["choices"]: 
        if answer.split(".")[0] in e or answer in e:
            example["answer"] = e
            break
    # print(f'### ground truth: {eng_dataset["train"]["clean_answer"][idx]}')
    return example

test_dataset = test_dataset.map(inference, batch_size=32)

Map: 100%|██████████| 189/189 [02:18<00:00,  1.37 examples/s]


In [38]:
# test_dataset.to_csv("inference.csv")
test_dataset.save_to_disk("inference.hf")

Saving the dataset (1/1 shards): 100%|██████████| 189/189 [00:00<00:00, 3221.34 examples/s]


---

In [23]:
first_question = eng_dataset["train"]["question"][0]
example = [eng_dataset["train"][i][0] for i in "ABCD"]
first_choices = [f"{chr(65+i)}. {choice}" for i, choice in enumerate(example)]
first_answer = eng_dataset["train"]["clean_answer"][0]
first_exlanation = eng_dataset["train"]["explanation"][0]

second_question = eng_dataset["train"]["question"][1]
example = [eng_dataset["train"][i][1] for i in "ABCD"]
second_choices = [f"{chr(65+i)}. {choice}" for i, choice in enumerate(example)]
second_answer = eng_dataset["train"]["clean_answer"][1]
second_exlanation = eng_dataset["train"]["explanation"][1]

third_question = eng_dataset["train"]["question"][2]
example = [eng_dataset["train"][i][2] for i in "ABCD"]
third_choices = [f"{chr(65+i)}. {choice}" for i, choice in enumerate(example)]
third_answer = eng_dataset["train"]["clean_answer"][2]
third_exlanation = eng_dataset["train"]["explanation"][2]

In [24]:
idx = random.randint(0, len(eng_dataset["train"]))
question = eng_dataset["train"]["question"][idx]
choices = [eng_dataset["train"][i][idx] for i in "ABCD"]
# Prepare multiple-choice input
prompt = "Answer the question by selecting the correct choice (character A, B, C or D):\n"
answer_choices = [f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)]
input_text = f"## Question: {first_question}\n## Choices:\n" + "\n".join(first_choices) + f"\n## Answer: Let's think step by step. {first_exlanation} The answer is {first_answer} #" + \
f"\n\n## Question: {second_question}\n## Choices:\n" + "\n".join(second_choices) + f"\n## Answer: Let's think step by step. {second_exlanation} The answer is {second_answer} #" + \
f"\n\n## Question: {question}\n## Choices:\n" + "\n".join(answer_choices) + "\n## Answer: Let's think step by step."

# f"## Question: {first_question}\n## Choices:\n" + "\n".join(first_choices) + f"\n## Answer: {first_exlanation} The answer is {first_answer}" + \
# f"\n\n## Question: {second_question}\n## Choices:\n" + "\n".join(second_choices) + f"\n## Answer: {second_exlanation} The answer is {second_answer}" + \
# Encode the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

# Generate multiple-choice response
outputs = model.generate(
    input_ids,
    # max_length=10,
    max_new_tokens=128,
    temperature=0.5, # randomness of the output
    # num_return_sequences=1,
    num_beams=5,
    # pad_token_id=tokenizer.eos_token_id,
    # do_sample=False,
)

# Decode and print the response
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# decoded_output = re.findall("## Answer: (.+) #", decoded_output)
print(decoded_output)

cot = re.findall("## Answer: (.+) #", decoded_output)
print("\n=> Chain-of-Thoughts: ", cot[2] if len(cot) > 2 else "None")
# Print ground truth for reference
print("=> Ground truth:", eng_dataset["train"]["clean_answer"][idx])

## Question: “42 months =.........year” The appropriate number of electricity put into place is:
## Choices:
A. 4
B. 4,2
C. 3,5
D. 35
## Answer: Let's think step by step. 42 months = 3.5 years. The answer is C #

## Question: The numbers of 6 hundreds and 3 units are:
## Choices:
A. 63
B. 36
C. 630
D. 603
## Answer: Let's think step by step. Numbers consisting of 6 hundreds and 3 units: 603 The answer is D #

## Question: a train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual time to cover the journey.
## Choices:
A. 2 hours
B. 30 min
C. 3 hours 20 min
D. 2 hours 30 min
## Answer: Let's think step by step. a train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual time to cover the journey. a train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual time to cover the journey. a train is walking at 6 / 7 of its usual speed, the train is 20 minutes too late. find its usual

In [25]:

def predict(idx, answers):
    question = test_dataset["question"][idx]
    choices = [test_dataset[i][idx] for i in "ABCD"]
    # Prepare multiple-choice input
    prompt = "Answer the question by selecting the correct choice (character A, B, C or D):\n"
    answer_choices = [f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)]
    input_text = f"## Question: {first_question} {prompt}\n" + "\n".join(first_choices) + f"\n## Answer: Let's think step by step. {first_exlanation} The answer is {first_answer} #" + \
    f"\n\n## Question: {second_question} {prompt}\n" + "\n".join(second_choices) + f"\n## Answer: Let's think step by step. {second_exlanation} The answer is {second_answer} #" + \
    f"\n\n## Question: {question} {prompt}\n" + "\n".join(answer_choices) + "\n## Answer: Let's think step by step."

    # input_text =f"Question: {question} Answer by selecting the correct choice (character A, B, C or D):\n" + "\n".join(answer_choices) + f"\nAnswer:<work>" 
    # Encode the input
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # Generate multiple-choice response
    outputs = model.generate(
        input_ids,
        # max_length=10,
        max_new_tokens=128,
        temperature=0.3, # randomness of the output
        # num_return_sequences=1,
        num_beams=5,
        # pad_token_id=tokenizer.eos_token_id,
        # do_sample=False,
    )

    # Decode and print the response
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    del input_ids
    del outputs
    torch.cuda.empty_cache()
    # print(decoded_output)

    cot = re.findall("## Answer: (.+) #", decoded_output)
    # print("\n=> Chain-of-Thoughts: ", cot[2] if len(cot) > 2 else "None")
    # # Print ground truth for reference
    # print("=> Ground truth:", test_dataset["clean_answer"][idx])
    if len(cot) > 2:
        answers[idx] = cot[2].split()[-1]
    # answers[idx] = decoded_output

answers = {key: None for key in range(len(test_dataset))}
for idx in tqdm(range(len(test_dataset))):
    predict(idx, answers)

100%|██████████| 189/189 [31:35<00:00, 10.03s/it]


In [27]:
answers

{0: 'B',
 1: None,
 2: 'B',
 3: 'D',
 4: 'C',
 5: 'A',
 6: 'D',
 7: 'B',
 8: 'D',
 9: 'D',
 10: 'C',
 11: 'D',
 12: 'D',
 13: 'A',
 14: 'D',
 15: 'D',
 16: 'A',
 17: None,
 18: None,
 19: 'C',
 20: 'C',
 21: 'C',
 22: 'D',
 23: 'A',
 24: 'C',
 25: 'D',
 26: 'A',
 27: 'D',
 28: 'A',
 29: 'A',
 30: None,
 31: 'C',
 32: 'D',
 33: 'D',
 34: 'B',
 35: 'D',
 36: 'C',
 37: 'B',
 38: 'D',
 39: 'B',
 40: None,
 41: 'D',
 42: 'D',
 43: 'A',
 44: 'D',
 45: 'A',
 46: 'D',
 47: 'A',
 48: 'C',
 49: 'B',
 50: 'A',
 51: 'A',
 52: 'A',
 53: 'A',
 54: 'D',
 55: 'A',
 56: 'D',
 57: 'A',
 58: 'D',
 59: 'D',
 60: None,
 61: 'A',
 62: 'C',
 63: 'C',
 64: 'C',
 65: None,
 66: 'B',
 67: 'D',
 68: 'A',
 69: 'C',
 70: None,
 71: 'A',
 72: 'A',
 73: 'D',
 74: 'C',
 75: 'A',
 76: 'C',
 77: None,
 78: 'C',
 79: 'C',
 80: 'A',
 81: 'C',
 82: 'A',
 83: 'A',
 84: 'A',
 85: 'A',
 86: 'A',
 87: 'D',
 88: 'D',
 89: 'B',
 90: 'C',
 91: 'B',
 92: 'A',
 93: 'A',
 94: 'C',
 95: 'A',
 96: 'D',
 97: 'A',
 98: 'A',
 99: 'A',
 

In [94]:
from collections import Counter
Counter(answers.values())

Counter({'A': 22,
         'C': 17,
         'D': 10,
         'B': 9,
         'Question: The average of 1.12; 2.78; 3 is: Answer by selecting the correct choice (character A, B, C or D):\nA. 3,93\nB. 20,70\nC. 6,90\nD. 2,3\nAnswer:\n\nThe average of 1.12; 2.78; 3 is (1.12+2.78+3)/3\n\ncalc_1.py\n```\nresult = ': 2,
         'Question: A store has sold 30% of its existing goods and earned VND 15, 000, 000. Ask how much money does the store earn if it sells all of its goods? Answer by selecting the correct choice (character A, B, C or D):\nA. 4 500 000 đồng\nB. 45 000 000 đồng\nC. 50 000 000 đồng\nD. 450 000 000 đồng\nAnswer:\n\nIf the store sells 30% of its existing goods and earned VND 15, 000, 000, the store earns VND 15, 000, 0': 1,
         "Question: A cyclist came from A at 7 o'clock at 12km/h. At 8 o'clock a motorcyclist also from A chased the cyclist at 42km/h. What time did the motorcyclist catch up with the cyclist? Answer by selecting the correct choice (character A, B, C o

In [53]:
test_dataset = test_dataset.add_column("raw_answer", answers.values())
test_dataset

Dataset({
    features: ['question', 'choices', 'clean_choices', 'A', 'B', 'C', 'D', 'id', 'raw_answer'],
    num_rows: 189
})

In [70]:
choices2idx = {choice: idx for idx, choice in enumerate("ABCD")}
choices2idx[None] = 2 # Only C
def get_answers(example):
    example["answer"] = example["choices"][choices2idx[example["raw_answer"]]] 
    return example

test_dataset = test_dataset.map(get_answers)

Map: 100%|██████████| 189/189 [00:00<00:00, 2720.57 examples/s]


In [71]:
import pandas as pd
df = pd.DataFrame({"id": test_dataset["id"], "answer": test_dataset["answer"]})
df.to_csv("./submissions/submission_7.csv", index=False)

---

In [None]:
# from peft import   (get_peft_config, 
#                     get_peft_model, 
#                     PromptTuningInit, 
#                     PromptTuningConfig, 
#                     TaskType, 
#                     PeftType, 
#                     LoraConfig)

# peft_config = LoraConfig(
#     r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1, 
#     bias="none", inference_mode=False, 
#     target_modules=["query_proj", "value_proj"],
#     modules_to_save=['classifier','pooler'],
# )
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

# # DEBERTA LARGE HAS TOTAL OF 24 LAYERS
# FREEZE_LAYERS = 18
# # BOOLEAN TO FREEZE EMBEDDINGS
# FREEZE_EMBEDDINGS = True

# if FREEZE_EMBEDDINGS:
#     print('Freezing embeddings.')
#     for param in model.deberta.embeddings.parameters():
#         param.requires_grad = False
# if FREEZE_LAYERS>0:
#     print(f'Freezing {FREEZE_LAYERS} layers.')
#     for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
#         for param in layer.parameters():
#             param.requires_grad = False

In [None]:
import numpy as np
def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions
    labels = p.label_ids
    return {
        "accuracy": sum(predictions.argmax(axis=1) == labels) / len(labels), 
        "map@3": map_at_3(predictions.tolist(), labels.tolist())}

In [None]:
VER = 5

training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    report_to='none',
    output_dir = f'./checkpoints/checkpoints_{VER}',
    overwrite_output_dir=True,
    # fp16=True,
    gradient_accumulation_steps=8,
    logging_steps=25,
    evaluation_strategy='steps',
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=2,
)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


In [None]:
from transformers import EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)],
)

# trainer.train()
# trainer.save_model(f'./best_model/model_v{VER}')

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




In [None]:
# trainer = Trainer(model=model)

In [None]:
# import numpy as np
# test_predictions = trainer.predict(tokenized_dataset["test"]).predictions
# predictions_as_ids = np.argsort(-test_predictions, 1)
# predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]
# # predictions_as_string = test_df['prediction'] = [
# #     ' '.join(row) for row in predictions_as_answer_letters[:, :3]
# # ]

# sum([index_to_option[i] for i in tokenized_dataset["test"]["label"]] == predictions_as_answer_letters[:, :1].squeeze())



50

In [None]:
len(tokenized_dataset["test"]["label"])

120

In [None]:
# import numpy as np
# test_predictions = trainer.predict(tokenized_dataset["test"]).predictions
# predictions_as_ids = np.argsort(-test_predictions, 1)
# predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]

# sum([index_to_option[i] for i in tokenized_dataset["test"]["label"]] == predictions_as_answer_letters[:, :1].squeeze())

50

In [20]:
from transformers import ( 
                        AutoTokenizer,
                        AutoModelForSeq2SeqLM)
import torch
import re
import json 

test = json.loads(open('data/math_test.json').read())["data"]
test_df = pd.DataFrame(test)
test_df.set_index("id", inplace=True)

pattern = re.compile("[ABCD].")

def process(texts):
    return [pattern.split(text)[-1].strip() for text in texts]
test_df["clean_choices"] = test_df["choices"].apply(process)

choices = {choice: i for i, choice in enumerate("ABCD")} 

def make_choice(df, choice):
    idx = choices[choice]
    df[choice] = df["clean_choices"].apply(lambda x: x[idx] if idx < len(x) else "")
    return df
for choice in choices.keys():
    make_choice(test_df, choice)

test_dataset = Dataset.from_pandas(test_df)


tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", src_lang="vi_VN", cache_dir="./cache")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2", cache_dir="./cache")
device_vi2en = torch.device("cuda")
model_vi2en.to(device_vi2en)

def translate_vi2en(example) -> str:
    for col in ["question", "A", "B", "C", "D"]:
        input_ids = tokenizer_vi2en(example[col], padding=True, return_tensors="pt").to(device_vi2en)
        output_ids = model_vi2en.generate(
            **input_ids,
            decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
            num_return_sequences=1,
            num_beams=5,
            early_stopping=True
        )
        example[col] = tokenizer_vi2en.batch_decode(output_ids, skip_special_tokens=True)
        # Free GPU memory
        del input_ids
        del output_ids
        torch.cuda.empty_cache()
    return example

test_dataset = test_dataset.map(translate_vi2en, batched=True, batch_size=32)

Map: 100%|██████████| 189/189 [00:21<00:00,  8.64 examples/s]


In [21]:
def preprocess(example):
    # nums = len(example["choices"])
    nums = 4
    choices = "ABCD" if nums == 4 else "ABC"
    first_sentence = [example['question']] * nums
    second_sentences = [example[option] for option in choices]
    tokenized_example = tokenizer(first_sentence, second_sentences, 
                                  truncation=True,max_length=MAX_INPUT, padding="max_length") 
                                #   if len(second_sentences[0]) < len(first_sentence[0]) else "only_second", 
                                #   max_length=MAX_INPUT, add_special_tokens=False, padding="max_length")
    # tokenized_example['label'] = option_to_index[example['clean_answer']]
    
    return tokenized_example

tokenized_test_dataset = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/189 [00:00<?, ? examples/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
tokenized_test_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 189
})

In [None]:
import numpy as np
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_answer_letters = np.array(list('ABCD'))[predictions_as_ids]

In [None]:
# test_df.drop(columns=test_df.columns, inplace=True)
test_df["id_ans"] = predictions_as_ids.squeeze().tolist()
test_df

Unnamed: 0_level_0,question,choices,clean_choices,A,B,C,D,id_ans
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01-0203,Một cửa hàng đã bán 30% số hàng hiện có và thu...,"[A. 4 500 000 đồng, B. 45 000 000 đồng, C. 50 ...","[4 500 000 đồng, 45 000 000 đồng, 50 000 000 đ...",4 500 000 đồng,45 000 000 đồng,50 000 000 đồng,450 000 000 đồng,"[0, 3, 1, 2]"
01-0206,Một người đi xe đạp từ A lúc 7 giờ với vận tốc...,"[A. 24 phút, B. 1 giờ, C. 7 giờ 24 phút, D. 8 ...","[24 phút, 1 giờ, 7 giờ 24 phút, 8 giờ 24 phút]",24 phút,1 giờ,7 giờ 24 phút,8 giờ 24 phút,"[1, 3, 2, 0]"
01-0207,Cạnh của hình lập phương gấp lên 2 lần thì diệ...,"[A. 2 lần, B. 4 lần, C. 6 lần, D. 8 lần]","[2 lần, 4 lần, 6 lần, 8 lần]",2 lần,4 lần,6 lần,8 lần,"[1, 2, 3, 0]"
01-0209,"Một thửa ruộng hình thang có đáy bé dài 8m, đá...","[A. 125m^{2}, B. 20%, C. 25%, D. 50%]","[125m^{2}, 20%, 25%, 50%]",125m^{2},20%,25%,50%,"[2, 3, 1, 0]"
01-0210,Một xe lửa vượt qua cái cầu dài 450m hết 45 gi...,"[A. 3m, B. 200m, C. 200m, D. 225m]","[3m, 200m, 200m, 225m]",3m,200m,200m,225m,"[3, 1, 2, 0]"
...,...,...,...,...,...,...,...,...
01-0698,"Kết quả phép nhân 4,51 \times 10 là:","[A. 451, B. 4,51, C. 45,1, D. 45]","[451, 4,51, 45,1, 45]",451,451,451,45,"[3, 1, 2, 0]"
01-0703,"Lớp 5/2 có 32 học sinh, trong đó có 12 học sin...","[A. 375 %, B. 37,5 %, C. 3,75 %, D. 0,375 %]","[375 %, 37,5 %, 3,75 %, 0,375 %]",375 %,"37,5 %","3,75 %","0,375 %","[1, 2, 0, 3]"
01-0715,Số thập phân thích hợp để điền vào chỗ chấm: 4...,"[A. 45,62, B. 4,562, C. 456,2, D. 4562]","[45,62, 4,562, 456,2, 4562]",4562,4562,4562,4562,"[3, 2, 0, 1]"
01-0716,"Kết quả của X trong biểu thức: X \div 2,04 = ...","[A. 3,03, B. 3,04, C. 3,05, D. 3,06]","[3,03, 3,04, 3,05, 3,06]",303,304,305,306,"[2, 3, 1, 0]"


In [None]:
test_df["answer"] = test_df.apply(lambda x: [x["choices"][int(i)] for i in x["id_ans"] if int(i) < len(x["choices"])][0], axis=1)
test_df.drop(columns=["question", "choices", "clean_choices", "A", "B", "C", "D", "id_ans"], inplace=True)
test_df

Unnamed: 0_level_0,answer
id,Unnamed: 1_level_1
01-0203,A. 4 500 000 đồng
01-0206,B. 1 giờ
01-0207,B. 4 lần
01-0209,C. 25%
01-0210,D. 225m
...,...
01-0698,D. 45
01-0703,"B. 37,5 %"
01-0715,D. 4562
01-0716,"C. 3,05"


In [None]:
test_df.to_csv(f"./submissions/submission_{VER}.csv")

In [None]:
# model = AutoModelForMultipleChoice.from_pretrained(model_path)
# model = get_peft_model(model, peft_config)
# checkpoint = torch.load(f'model_v{VER}/pytorch_model.bin')
# model.load_state_dict(checkpoint)