In [1]:
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset, load_from_disk, Dataset, concatenate_datasets
import pandas as pd
from transformers import ( 
                        Trainer,
                        TrainingArguments,
                        AutoTokenizer,
                        AutoConfig,
                        AutoModel,
                        AutoModelForCausalLM,
                        AutoModelForMultipleChoice,
                        AutoModelForSeq2SeqLM,
                        default_data_collator,
                        get_linear_schedule_with_warmup,
                        TextStreamer)
import random
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Submission version; interpolated into the output path `submissions/submission_{VER}.csv` in the last cell.
VER = 12
# NOTE(review): not referenced anywhere else in the visible notebook — presumably meant to
# select the v1 prompt format; confirm or remove.
V1_MODEL = True

def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Integer applied to ``random``, NumPy and PyTorch (CPU generator
            and ``torch.cuda.manual_seed``); also stored as a string in the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # Request deterministic cuDNN kernels.
    torch.backends.cudnn.deterministic = True
  
# model_path = "vinai/PhoGPT-7B5-Instruct" 
# Load the MMLU "elementary_mathematics" config and MathQA through `load_dataset`,
# keeping downloaded files under ./cache.
mmlu_dataset = load_dataset("lukaemon/mmlu", "elementary_mathematics", cache_dir="./cache")
mathqa_dataset = load_dataset("math_qa", cache_dir="./cache")
# Fold every MMLU split into one dataset: start from "train", then append each other split.
merged_dataset = mmlu_dataset["train"]
for subset_name, subset_data in mmlu_dataset.items():
    if subset_name != 'train':
        merged_dataset = concatenate_datasets([merged_dataset, subset_data])

# Dataset previously saved to disk; the cell output below shows its train/test splits.
eng_dataset = load_from_disk("processed_train_eng.hf")
eng_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 1080
    })
    test: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 120
    })
})

In [2]:

import re
import random

# Fold every MathQA split into one dataset: start from "train", then append each other split.
mathqa_concat_dataset = mathqa_dataset["train"]
for subset_name, subset_data in mathqa_dataset.items():
    if subset_name != 'train':
        mathqa_concat_dataset = concatenate_datasets([mathqa_concat_dataset, subset_data])

# Rename MathQA columns to the column names used by eng_dataset.
mathqa_concat_dataset = mathqa_concat_dataset.rename_columns({
    "Problem": "question",
    "Rationale": "explanation",
    "correct": "clean_answer",
    "options": "choices"
})

# Drop the MathQA columns that have no counterpart in eng_dataset.
mathqa_concat_dataset = mathqa_concat_dataset.remove_columns(column_names=["annotated_formula", "linear_formula", "category"])
# Letter <-> position lookup tables for the four answer slots kept per question.
choices = {choice: i for i, choice in enumerate("ABCD")}
idx2choices = {i: choice for i, choice in enumerate("ABCD")}
# MathQA labels its options "a ) ", "b ) ", ...; raw string so that "\)" is a regex escape
# rather than an invalid string escape.
# NOTE: later cells rebind the global name `pattern`, so run the `.map(process_options)`
# call right after this cell.
pattern = re.compile(r"[abcde] \)")


def process_options(example):
    """Convert one MathQA row (five options a-e) into the four-option A-D layout.

    The raw option string is split on ", ". When the gold option is "e" it is
    swapped into a randomly chosen slot 0-3 so that it survives the reduction to
    four options, and ``clean_answer`` is updated to that slot's letter.

    Args:
        example: Row with a "choices" string and a lowercase "clean_answer"
            letter ("a"-"e").

    Returns:
        The same row with "choices" replaced by the list of raw options, the
        cleaned option texts stored under "A"-"D", "clean_answer" upper-cased,
        and "answer" set to "<letter>. <cleaned text>".

    Raises:
        IndexError: if fewer than four options are found after splitting.
    """
    options = example["choices"].split(", ")
    example["choices"] = options
    if example["clean_answer"] == "e":
        idx = random.randint(0, 3)
        options[idx], options[-1] = options[-1], options[idx]
        example["clean_answer"] = idx2choices[idx]
    for letter, position in choices.items():
        # Strip the "x ) " label and the list-literal residue (quotes, brackets).
        cleaned = re.sub(pattern, "", options[position])
        example[letter] = cleaned.replace("'", "").replace("]", "").replace("[", "").strip()

    example["clean_answer"] = example["clean_answer"].upper()
    # Build the answer from the cleaned letter and text. The raw option would keep its
    # original "a ) " / "e ) " label, which disagrees with clean_answer after the swap
    # and with the "C. 3,5" style used by the other training rows.
    gold = example["clean_answer"]
    example["answer"] = f"{gold}. {example[gold]}"

    return example

# process_options draws a random slot whenever the gold option is "e"; seed the RNGs first
# so the resulting dataset is the same on every run (seed_everything is defined in the first cell).
seed_everything()
mathqa_concat_dataset = mathqa_concat_dataset.map(process_options)
mathqa_concat_dataset

Dataset({
    features: ['question', 'explanation', 'choices', 'clean_answer', 'A', 'B', 'C', 'D', 'answer'],
    num_rows: 37297
})

In [3]:
# Rename the MMLU columns to the names used by eng_dataset.
merged_dataset = merged_dataset.rename_columns({
    "input": "question",
    "target": "clean_answer",
})
# These rows get an empty explanation; the prompt builders skip empty explanations.
merged_dataset = merged_dataset.add_column("explanation", [""] * len(merged_dataset))
def add_answer(example):
    """Attach the full answer string for the row's gold letter.

    Args:
        example: Row with option texts under "A"-"D" and the gold letter
            under "clean_answer".

    Returns:
        The same row with "answer" set to "<letter>. <option text>", the
        format the few-shot prompts print after "Answer: " for the other
        training rows (e.g. "C. 3,5").

    Raises:
        KeyError: if "clean_answer" does not name a key of ``example``.
    """
    letter = example["clean_answer"]
    example["answer"] = f"{letter}. {example[letter]}"
    return example
# Add the "answer" column to every MMLU row, then display the dataset summary.
merged_dataset = merged_dataset.map(add_answer)
merged_dataset

Dataset({
    features: ['question', 'A', 'B', 'C', 'D', 'clean_answer', 'explanation', 'answer'],
    num_rows: 421
})

In [4]:
# Append the MMLU and MathQA rows to the train split; the test split is left as loaded.
# NOTE: re-running this cell appends the extra rows again.
eng_dataset["train"] = concatenate_datasets([eng_dataset["train"], merged_dataset, mathqa_concat_dataset])
eng_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 38798
    })
    test: Dataset({
        features: ['question', 'choices', 'explanation', 'answer', 'clean_choices', 'clean_answer', 'A', 'B', 'C', 'D', 'id'],
        num_rows: 120
    })
})

In [5]:
"1,5".isnumeric()

False

In [6]:
from transformers import ( 
                        AutoTokenizer,
                        AutoModelForSeq2SeqLM)
import torch
import re
import json 

# Read the test set. `with` closes the file handle, and the encoding is explicit because
# the file contains Vietnamese text and the platform default is not guaranteed to be UTF-8.
with open('data/math_test.json', encoding="utf-8") as test_file:
    test = json.load(test_file)["data"]
test_df = pd.DataFrame(test)
test_df.set_index("id", inplace=True)

# Leading option label such as "A.", "B)" or "C:". It is anchored at the start and the
# separator is matched literally, so capital letters inside the option text are left alone
# (the previous "[ABCD]." also matched e.g. the "Di" of "Diện tích").
pattern = re.compile(r"^\s*[ABCD]\s*[.):]\s*")


def process(texts):
    """Strip the leading letter label from each choice string.

    Args:
        texts: Iterable of choice strings such as "A. 4 500 000 đồng".

    Returns:
        List with the label removed and surrounding whitespace stripped;
        strings without a label are only stripped.
    """
    return [pattern.sub("", text, count=1).strip() for text in texts]
# Per row: the list of choices with their "A."-"D." labels removed.
test_df["clean_choices"] = test_df["choices"].apply(process)

# Option letter -> position inside a row's "clean_choices" list.
choices = dict(zip("ABCD", range(4)))


def make_choice(df, choice):
    """Add a column named ``choice`` holding that option's text for every row.

    Args:
        df: Frame with a "clean_choices" column of lists; modified in place.
        choice: One of the keys of ``choices`` ("A"-"D").

    Returns:
        The same ``df``. Rows whose list is too short for this option get "".
    """
    position = choices[choice]

    def _pick(options):
        if position < len(options):
            return options[position]
        return ""

    df[choice] = df["clean_choices"].apply(_pick)
    return df
# Materialise the four option columns "A"-"D" on test_df (make_choice mutates it in place).
for choice in choices.keys():
    make_choice(test_df, choice)




# Vietnamese -> English translation model and tokenizer, cached under ./cache.
tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", src_lang="vi_VN", cache_dir="./cache")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2", cache_dir="./cache")
# NOTE(review): "cuda" is hard-coded, so this cell fails on a machine without a GPU.
device_vi2en = torch.device("cuda")
model_vi2en.to(device_vi2en)



def preprocess_text(text):
    """Mask every run of digits in ``text`` with the literal token ``NUMBER``.

    Args:
        text: Input string.

    Returns:
        ``(masked_text, numbers)`` where ``numbers`` lists the digit runs that
        were masked, in order of appearance.
    """
    digit_run = r'(\d+)'
    number_replacements = re.findall(digit_run, text)
    masked_text = re.sub(digit_run, "NUMBER", text)
    return masked_text, number_replacements

def restore_numbers(translated_text, number_replacements):
    """Put the masked numbers back, one ``NUMBER`` token at a time.

    Args:
        translated_text: Text containing ``NUMBER`` placeholders.
        number_replacements: Original digit strings, in the order they were masked.

    Returns:
        The text with the first remaining ``NUMBER`` replaced by each number in
        turn; surplus placeholders or surplus numbers are left as they are.
    """
    restored = translated_text
    for original_number in number_replacements:
        # count=1: only the left-most remaining placeholder is consumed per number.
        restored = restored.replace("NUMBER", original_number, 1)
    return restored

# Vietnamese unit / symbol -> English rendering. Kept in sync with the refined table defined
# further down the notebook, because this definition is the one the translation step uses
# when the notebook runs top to bottom.
units = {"kg": "kg",
         "km": "km",
         "km/giờ": "km/hour",
         "cm": "cm",
         "cm2": "cm2",
         "đồng": "VND",
         "%": "%",
         "dm": "dm",
         "phút": "minutes",
         "giờ": "hours",
         "giây": "seconds",
         "tạ": "quintal",
         ">": ">",
         "<": "<",
         "=": "="}


def check_number(choice):
    """Decide whether a choice is number/unit-like and needs no machine translation.

    A unit is translated only when it directly follows a number (optionally
    separated by whitespace) or when the whole choice is that unit. This keeps
    ordinary words that merely contain a unit string (e.g. "đồng hồ") untouched
    so that they still go through the translation model.

    Args:
        choice: Choice text with its letter label already removed.

    Returns:
        ``(check, choice)``: ``check`` is True when a unit was translated, the
        text is a digit run followed by ``%``/``g``/``m``, the text is purely
        numeric once spaces, commas and dots are dropped, or the text is empty.
        ``choice`` is the text with the matched units rendered in English.
    """
    check = False
    for unit, eng_unit in units.items():
        if unit not in choice:
            continue
        follows_number = re.search(rf"\d+\s*{re.escape(unit)}", choice)
        if follows_number or unit == choice.strip():
            choice = choice.replace(unit, eng_unit)
            check = True
    digits_only = choice.replace(" ", "").replace(",", "").replace(".", "")
    if re.search(r"\d+[%gm]", choice) or digits_only.isnumeric() or choice == "":
        check = True
    return check, choice
    
    
def translate_choice(example):
    """Translate one test row from Vietnamese to English, in place.

    The question always goes through the vi->en model. Each of the options
    A-D is first passed to ``check_number``; when that reports the option as
    number/unit-like, the text it returns is kept and the model is not called.

    Args:
        example: Row exposing "question", "A", "B", "C" and "D" entries.

    Returns:
        The same ``example`` with those five entries overwritten.
    """
    for col in ("question", "A", "B", "C", "D"):
        if col != "question":
            skip_model, example[col] = check_number(example[col])
            if skip_model:
                continue

        encoded = tokenizer_vi2en(example[col], padding=True, return_tensors="pt").to(device_vi2en)
        generated = model_vi2en.generate(
            **encoded,
            decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
            num_return_sequences=1,
            num_beams=1,
            early_stopping=True,
        )
        example[col] = tokenizer_vi2en.batch_decode(generated, skip_special_tokens=True)[0]

        # Drop this column's tensors and release cached GPU memory before the next column.
        del encoded
        del generated
        torch.cuda.empty_cache()

    return example

# Translate every row (one model call per untranslated field), then wrap the frame as a
# Hugging Face Dataset and show its summary.
test_df = test_df.apply(translate_choice, axis=1)
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['question', 'choices', 'clean_choices', 'A', 'B', 'C', 'D', 'id'],
    num_rows: 189
})

In [7]:
# Show full cell text instead of truncating long strings.
pd.set_option("max_colwidth", None)
# NOTE(review): this renders the whole frame; `test_df.head()` would keep the saved notebook smaller.
test_df

Unnamed: 0_level_0,question,choices,clean_choices,A,B,C,D
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01-0203,"A store sold 30% of its existing stock and earned VND15, 000, 000. Ask how much money the store earned if it sold out?","[A. 4 500 000 đồng, B. 45 000 000 đồng, C. 50 000 000 đồng, D. 450 000 000 đồng]","[4 500 000 đồng, 45 000 000 đồng, 50 000 000 đồng, 450 000 000 đồng]",4 500 000 VND,45 000 000 VND,50 000 000 VND,450 000 000 VND
01-0206,A cyclist from A started at 7 o'clock at 12km/h. At 8 o'clock a motorcyclist from A also chased the cyclist at 42km/h. What time did the motorcyclist catch up with the cyclist?,"[A. 24 phút, B. 1 giờ, C. 7 giờ 24 phút, D. 8 giờ 24 phút]","[24 phút, 1 giờ, 7 giờ 24 phút, 8 giờ 24 phút]",24 minutes,1 hours,7 hours 24 minutes,8 hours 24 minutes
01-0207,"If the side of the cube is 2 times larger, how many times is the area around it larger?","[A. 2 lần, B. 4 lần, C. 6 lần, D. 8 lần]","[2 lần, 4 lần, 6 lần, 8 lần]",2 times,4 times,6 times,8 times
01-0209,"A trapezoidal field has a small bottom of 8m long and a large bottom of 12m long. Extending the large bottom by 5m, the area of the field increases by 25m2. Ask how much the area of the field increases by?","[A. 125m^{2}, B. 20%, C. 25%, D. 50%]","[125m^{2}, 20%, 25%, 50%]",125m^{2},20%,25%,50%
01-0210,"A train that crosses a 450 meter bridge takes 45 seconds, a power pole takes 15 seconds.","[A. 3m, B. 200m, C. 200m, D. 225m]","[3m, 200m, 200m, 225m]",3m,200m,200m,225m
...,...,...,...,...,...,...,...
01-0698,The result of multiplication 4.51 \times 10 is:,"[A. 451, B. 4,51, C. 45,1, D. 45]","[451, 4,51, 45,1, 45]",451,451,451,45
01-0703,"Class 5/2 has 32 students, including 12 excellent students. The percentage of the number of excellent students and students in the class is:","[A. 375 %, B. 37,5 %, C. 3,75 %, D. 0,375 %]","[375 %, 37,5 %, 3,75 %, 0,375 %]",375 %,"37,5 %","3,75 %","0,375 %"
01-0715,The appropriate decimal number to fill in the dot place: 4 tons 562 kg =......... tons is:,"[A. 45,62, B. 4,562, C. 456,2, D. 4562]","[45,62, 4,562, 456,2, 4562]",4562,4562,4562,4562
01-0716,The result of X in the expression: X \div 2.04 = 7.5 \div 5 is:,"[A. 3,03, B. 3,04, C. 3,05, D. 3,06]","[3,03, 3,04, 3,05, 3,06]",303,304,305,306


In [8]:
# The translation model is no longer needed; drop the references and release cached GPU
# memory before the 7B model is loaded in the next cell.
# NOTE: translate_choice cannot be called again after this cell without reloading the model.
del model_vi2en
del tokenizer_vi2en
torch.cuda.empty_cache()

In [9]:
# Causal LM used for few-shot answering, plus a streamer that prints tokens as they are generated.
model_name = 'Intel/neural-chat-7b-v3-1'
# NOTE(review): later cells hard-code "cuda:1" instead of reusing this variable.
device = "cuda:1"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir="./cache/")
streamer = TextStreamer(tokenizer)

# NOTE(review): no dtype is passed here — presumably the weights load in the default
# precision; confirm the GPU has room for that.
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="./cache/")
model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0): MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
      (1): MistralDecoderLayer(
   

In [56]:
def make_prompt(question, choices, answer=None):
    """Render one question block for the plain few-shot prompt.

    Args:
        question: Question text.
        choices: Pre-formatted option lines.
        answer: Text printed after "Answer: "; a falsy value leaves it blank
            so the model completes it.

    Returns:
        "Question: ...", the choices, then an "Answer: " line, newline-separated.
    """
    shown_answer = answer or ""
    return "Question: {}\n{}\nAnswer: {}".format(question, choices, shown_answer)

def _few_shot(example, shots, k):
    """Build a plain k-shot prompt: ``k`` solved examples followed by the open query.

    Args:
        example: Row to be answered; needs "question" and "A"-"D".
        shots: Indexable collection of solved rows (each with "question",
            "A"-"D" and "answer"); the first ``k`` rows are used.
        k: Number of solved examples to include.

    Returns:
        The ``make_prompt`` blocks joined by newlines, ending with the query's
        empty "Answer: " line.
    """
    def _make_choice(row):
        # One "A. text" line per option, in A-D order.
        return "\n".join(f"{letter}. {row[letter]}" for letter in "ABCD")

    blocks = []
    for i in range(k):
        # Fetch the row once. Indexing by column name first (shots["question"][i]) would,
        # for a Hugging Face Dataset, read a whole column just to use one cell of it.
        shot = shots[i]
        blocks.append(make_prompt(shot["question"], _make_choice(shot), shot["answer"]))
    blocks.append(make_prompt(example["question"], _make_choice(example)))
    return "\n".join(blocks)

def make_prompt_v1(question, choices, answer=None):
    """Render one "Multi-choice question" block.

    Args:
        question: Question text.
        choices: Pre-formatted option lines.
        answer: Text appended directly after the choices; falsy values add nothing.

    Returns:
        The header line with the question, a newline, then choices and answer
        concatenated without a separator.
    """
    shown_answer = answer or ""
    return "Multi-choice question: {}\n{}{}".format(question, choices, shown_answer)

def _few_shot_v1(example, shots, k):
    def _make_choice(example):
        choices = [example[i] for i in "ABCD"]
        # Prepare multiple-choice input
        choices = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])
        return choices
    
    def _make_prompt(example, include_ans=True):
        question = example["question"]
        if include_ans:
            ans = example["answer"] if example["answer"] else "" 
        choices = _make_choice(example)
        if include_ans == False:
            return f"""Question: {question}\
\n{choices}"""
        if example.get("explanation", None) and example["explanation"] != "It's not like we're going to have to do this.":
            explanation = example["explanation"]
            return f"""Question: {question}\
\n{choices}\nExplanation: {explanation}\nAnswer: {ans}"""
        else:
            explanation = ""
            return f"""Question: {question}\
\n{choices}\nAnswer: {ans}"""

    _prompt = "\n".join([_make_prompt(shots[i]) for i in range(k)] + 
                        [_make_prompt(example, False)])
    system = """### System: You are a math expert assistant. Your mission is to help users understand \
and solve elementary math problems: You must strictly follow the multi choice question and the choices \
from users, First you need to think step by step and then give the answer choice, which is A, B, C or D \
corresponding with the choices."""
    complete_prompt = system + "\n### User:\n" + _prompt + f"\n### Assitant:\nAnswer:" # \nExplanation: {explanation}
    return complete_prompt
    
# Preview a 3-shot prompt. The query row (index 1) is also one of the first three shots,
# so its solved form appears in the prompt above the open question.
print(_few_shot_v1(eng_dataset["train"][1], eng_dataset["train"], 3))

### System: You are a math expert assistant. Your mission is to help users understand and solve elementary math problems: You must strictly follow the multi choice question and the choices from users, First you need to think step by step and then give the answer choice, which is A, B, C or D corresponding with the choices.
### User:
Question: “42 months =.........year” The appropriate number of electricity put into place is:
A. 4
B. 4,2
C. 3,5
D. 35
Explanation: 42 months = 3.5 years.
Answer: C. 3,5
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
Explanation: Numbers consisting of 6 hundreds and 3 units: 603
Answer: D. 603
Question: "The area of a square with sides of 1.2dm is equal to.........dm3." The number of points to be inserted is:
A. 1,44
B. 1,728
C. 8,64
D. 5,76
Answer: B. 1,728
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
### Assitant:
Answer:


In [11]:
import re
# Scratch check that a "<letter>. <text>" answer with LaTeX splits into (letter, text).
# NOTE: this rebinds the global `pattern` (a compiled regex in earlier cells) to a list of matches.
pattern = re.findall("([A-D]). (.+)", "B. \\frac{7}{10}")
pattern

[('B', '\\frac{7}{10}')]

In [42]:
# Vietnamese unit / symbol -> English rendering.
units = {"kg": "kg",
         "km": "km",
         "km/giờ": "km/hour",
         "cm": "cm",
         "cm2": "cm2",
         "đồng": "VND",
         "%": "%",
         "dm": "dm",
         "phút": "minutes",
         "giờ": "hours",
         "giây": "seconds",
         "tạ": "quintal",
         ">": ">",
         "<": "<",
         "=": "="}


def check_number(choice):
    """Decide whether a choice is number/unit-like and needs no machine translation.

    A unit is translated only when it directly follows a number (optionally
    separated by whitespace) or when the whole choice is that unit.

    Args:
        choice: Choice text with its letter label already removed.

    Returns:
        ``(check, choice)``: ``check`` is True when a unit was translated, the
        text is a digit run followed by ``%``/``g``/``m``, the text is purely
        numeric once spaces, commas and dots are dropped, or the text is empty.
        ``choice`` is the text with the matched units rendered in English.
    """
    check = False
    for unit, eng_unit in units.items():
        if unit not in choice:
            continue
        # re.escape: the unit is literal text, so regex metacharacters in a unit
        # (e.g. "$" or "m^{2}") must not be interpreted as pattern syntax.
        follows_number = re.search(rf"\d+\s*{re.escape(unit)}", choice)
        if follows_number or unit == choice.strip():
            choice = choice.replace(unit, eng_unit)
            check = True
    digits_only = choice.replace(" ", "").replace(",", "").replace(".", "")
    if re.search(r"\d+[%gm]", choice) or digits_only.isnumeric() or choice == "":
        check = True
    return check, choice


check_number("")

(True, '')

In [26]:
# Scratch cell: answer one randomly chosen training question with a 5-shot prompt and
# stream the generation to the cell output.
n_shots = 5
# NOTE(review): the draw is unseeded and the bound 1200 is hard-coded.
idx = random.randint(0, 1200)

example = eng_dataset["train"][idx]

prompt = _few_shot_v1(example, eng_dataset["train"], n_shots)
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda:1")
# outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
# NOTE(review): do_sample=False is passed together with temperature/top_k — presumably the
# sampling settings have no effect here; confirm against the generate() documentation.
outputs = model.generate(inputs, max_new_tokens=300, streamer=streamer, temperature=0.1, num_beams = 1,
                            top_k = 50,
                            return_dict_in_generate=True, do_sample=False, pad_token_id=tokenizer.eos_token_id)
# Decoded text of the first returned sequence.
answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
# print("## ", re.findall("\nAnswer: \n?(.+)", answer)[n_shots])
# print(answer)

def postprocess(answer, n_shots, v1=False): # TODO: perhaps a single postprocess could be shared with v1
    """Pull the model's answer line out of decoded generation text.

    Args:
        answer: Decoded text containing the prompt followed by the completion.
        n_shots: Number of solved examples in the prompt. Only used when
            ``v1`` is False, where the answer at index ``n_shots`` among all
            "Answer:" lines is the model's.
        v1: When True, parse the layout produced by ``_few_shot_v1``: only the
            text after the last assistant marker is searched.

    Returns:
        The text following "Answer: " on the relevant line, or ``None`` when
        no such line exists. (The v1 branch used to return an empty list in
        that case, which broke callers that treat the result as a string.)
    """
    if not v1:
        found = re.findall("\nAnswer: \n?(.+)", answer)
        return found[n_shots] if len(found) > n_shots else None

    # Accept the marker as emitted by _few_shot_v1 ("Assitant") and the correct spelling,
    # so that fixing the prompt typo does not silently break parsing.
    completion = re.split(r"### Assis?tant:\n", answer)[-1]
    found = re.findall(r"Answer: (.+)", completion)
    return found[0] if found else None

# Print the gold letter so it can be compared by eye with the streamed generation above.
# print(prompt)
print("\n### Ground truth: ", example["clean_answer"])

# print(postprocess(answer, 5, v1=True))

<s> ### System: You are a math expert assistant. Your mission is to help users understand and solve elementary math problems: You must strictly follow the multi choice question and the choices from users, First you need to think step by step and then give the answer choice, which is A, B, C or D corresponding with the choices.
### User:
Question: “42 months =.........year” The appropriate number of electricity put into place is:
A. 4
B. 4,2
C. 3,5
D. 35
Explanation: 42 months = 3.5 years.
Answer: C. 3,5
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
Explanation: Numbers consisting of 6 hundreds and 3 units: 603
Answer: D. 603
Question: "The area of a square with sides of 1.2dm is equal to.........dm3." The number of points to be inserted is:
A. 1,44
B. 1,728
C. 8,64
D. 5,76
Answer: B. 1,728
Question: Today is May 28th. There are 5 days left for you to attend the summer camp organized by the city. What day will you attend the camp?
A. Ngày 1 tháng 6
B. Ng

Assitant:
Answer: B. 4960
Explanation: First, we need to calculate 403 * 6, which is 403 * 6 = 2418. Then, we add this result to 2342: 2342 + 2418 = 4760. Finally, we need to convert the answer to the given units (km, m, etc.). Since no units are given, we can assume it's in the same units as the original numbers, which are in thousands. So, 4760 becomes 4960.</s>

### Ground truth:  D


In [None]:
import torch
# The batched tokenizer call in eval_on_trainset uses padding=True; reuse EOS as the pad token.
tokenizer.pad_token = tokenizer.eos_token

def eval_on_trainset(answers=None, count=0, num_samples=1200, n_shots=5, batch_size=8):
    """Few-shot evaluate the model on the first ``num_samples`` training rows.

    Args:
        answers: Optional list to which the parsed answers are appended; a
            fresh list is created when omitted (a shared ``[]`` default would
            keep growing across calls).
        count: Initial number of correct answers.
        num_samples: Number of leading rows of ``eng_dataset["train"]`` to
            evaluate; clamped to the split size.
        n_shots: Solved examples placed in every prompt.
        batch_size: Prompts generated per ``model.generate`` call.

    Returns:
        The list of parsed answers (``None`` where no answer line was found).
        The running accuracy is shown on the progress bar only.

    Notes:
        The shots are the first ``n_shots`` training rows, so those rows see
        their own solution in the prompt when they are evaluated.
    """
    if answers is None:
        answers = []
    valid_letters = ("A", "B", "C", "D")
    train_split = eng_dataset["train"]
    num_samples = min(num_samples, len(train_split))

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # A decoder-only model continues from the last position of each row, so pads must sit
    # on the left; the attention mask tells the model to ignore them.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        bar = tqdm(range(0, num_samples, batch_size))
        for batch_start in bar:
            batch_end = min(batch_start + batch_size, num_samples)

            # Slicing a Dataset yields a dict of columns; turn it into a list of row dicts.
            columns = train_split[batch_start:batch_end]
            examples = [dict(zip(columns, values)) for values in zip(*columns.values())]
            prompts = [_few_shot_v1(example, train_split, n_shots) for example in examples]

            encoded = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(model.device)

            # Greedy decoding; sampling knobs (temperature/top_p/top_k) are omitted because
            # do_sample=False.
            outputs = model.generate(**encoded,
                                     max_new_tokens=300,
                                     num_beams=1,
                                     return_dict_in_generate=True,
                                     do_sample=False,
                                     pad_token_id=tokenizer.eos_token_id
                                    )

            decoded = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
            generated_answers = [postprocess(text, n_shots, True) for text in decoded]

            for generated_answer, example in zip(generated_answers, examples):
                answers.append(generated_answer)
                # postprocess may yield a non-string when no answer line was produced.
                text = generated_answer.strip() if isinstance(generated_answer, str) else ""
                letter = text.split(".")[0].strip()
                if letter not in valid_letters:
                    # only C for questions that have no usable answer
                    letter, text = "C", ""

                gold_text = example["answer"] or ""
                if letter == example["clean_answer"] or (text != "" and text in gold_text):
                    count += 1

            accuracy = count / batch_end
            bar.set_postfix({"Accuracy": accuracy})
    finally:
        tokenizer.padding_side = previous_padding_side

    return answers

# Run the evaluation with its default arguments; accuracy is reported on the progress bar.
answers = eval_on_trainset()

In [12]:
# Scratch cell: stream the generation for a single translated test question (row 9).
# NOTE: `n_shots` comes from an earlier scratch cell; this cell fails on a fresh kernel
# unless that cell has been run.
prompts = _few_shot_v1(test_dataset[9], eng_dataset["train"], n_shots) 
        
inputs = tokenizer(prompts, return_tensors="pt").input_ids.to("cuda:1")

# NOTE(review): do_sample=False is passed together with temperature/top_p/top_k — presumably
# the sampling settings have no effect here; confirm against the generate() documentation.
outputs = model.generate(inputs, streamer=streamer,
                        max_new_tokens=300, 
                        temperature=0.1, 
                        top_p=1,
                        num_beams=1,
                        top_k=50,
                        return_dict_in_generate=True, 
                        do_sample=False, 
                        pad_token_id=tokenizer.eos_token_id
                        )

# generated_answers = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
# generated_answers = postprocess(generated_answers, n_shots, True) 

<s> ### System: You are a math expert assistant. Your mission is to help users understand and solve elementary math problems: You must strictly follow the multi choice question and the choices from users, First you need to think step by step and then give the answer choice, which is A, B, C or D corresponding with the choices.
### User:
Question: “42 months =.........year” The appropriate number of electricity put into place is:
A. 4
B. 4,2
C. 3,5
D. 35
Answer: 
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
Answer: 
Question: "The area of a square with sides of 1.2dm is equal to.........dm3." The number of points to be inserted is:
A. 1,44
B. 1,728
C. 8,64
D. 5,76
Answer: 
Question: Today is May 28th. There are 5 days left for you to attend the summer camp organized by the city. What day will you attend the camp?
A. Ngày 1 tháng 6
B. Ngày 2 tháng 6
C. Ngày 3 tháng 6
D. Ngày 4 tháng 6
Answer: 
Question: 3km 48m =........km.
A. 3,48
B. 3,048
C. 348
D. 304

Assitant:
Answer: 
Question: The numbers of 6 hundreds and 3 units are:
A. 63
B. 36
C. 630
D. 603
Answer: 
Question: "The area of a square with sides of 1.2dm is equal to.........dm3." The number of points to be inserted is:
A. 1,44
B. 1,728
C. 8,64
D. 5,76
Answer: 
Question: Today is May 28th. There are 5 days left for you to attend the summer camp organized by the city. What day will you attend the camp?
A. Ngày 1 tháng 6
B. Ngày 2 tháng 6
C. Ngày 3 tháng 6
D. Ngày 4 tháng 6
Answer: 
Question: 3km 48m =........km.
A. 3,48
B. 3,048
C. 348
D. 3048
Answer: 
Question: The number sixty - seven point nine hundred and nineteen is written:
A. 67,910
B. 679
C


In [60]:
def write_to_submission(answers=None, n_shots=5, batch_size=16):
    """Run few-shot inference over ``test_dataset`` and build the submission frame.

    Args:
        answers: Optional list to which each parsed answer is appended; a
            fresh list is created when omitted (a shared ``[]`` default would
            keep growing across calls).
        n_shots: Solved examples placed in every prompt.
        batch_size: Forwarded to ``Dataset.map``. The map is not batched, so
            rows are still processed one at a time.

    Returns:
        ``(df, answers)``: ``df`` is the mapped dataset as a DataFrame whose
        "answer" column holds the matching original choice string (``None``
        when the generation could not be matched); ``answers`` holds the raw
        parsed generations in row order.
    """
    if answers is None:
        answers = []

    def _match_choice(generated, options):
        """Return the original choice the generation points at, or None."""
        if not isinstance(generated, str) or not generated.strip():
            return None
        generated = generated.strip()
        # Prefer the leading letter: the prompt shows translated option text (e.g. "VND")
        # while `options` holds the original strings (e.g. "đồng"), so comparing the text
        # alone misses otherwise correct answers.
        label = re.match(r"([ABCD])\s*[.):]", generated)
        if label:
            for option in options:
                if re.match(rf"\s*{label.group(1)}\s*[.):]", option):
                    return option
        # Fall back to the previous exact / substring comparison.
        for option in options:
            if generated == option or generated in option:
                return option
        return None

    def inference(example):
        prompts = _few_shot_v1(example, eng_dataset["train"], n_shots)

        encoded = tokenizer(prompts, return_tensors="pt").to(model.device)

        # Greedy decoding; sampling knobs are omitted because do_sample=False.
        outputs = model.generate(**encoded,
                                 max_new_tokens=300,
                                 num_beams=1,
                                 return_dict_in_generate=True,
                                 do_sample=False,
                                 pad_token_id=tokenizer.eos_token_id
                                 )

        decoded = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
        generated_answers = postprocess(decoded, n_shots, True)
        answers.append(generated_answers)

        return {"id": example["id"],
                "answer": _match_choice(generated_answers, example["choices"])}

    # load_from_cache_file=False: `inference` appends to `answers` as a side effect and
    # depends on the loaded model, so a cached result must never be reused.
    infer_dataset = test_dataset.map(inference, batch_size=batch_size, load_from_cache_file=False)
    df = infer_dataset.to_pandas()
    return df, answers
        
# df: one row per test question with the selected choice; results: raw parsed generations.
df, results = write_to_submission()

Map: 100%|██████████| 189/189 [06:22<00:00,  2.02s/ examples]


In [77]:
# NOTE(review): displays the full list of parsed generations; `results[:10]` would keep the output short.
results

['D. 450 000 000 VND',
 'A. 24 minutes',
 'B. 4 times',
 'C. 25%',
 'Cannot be determined. The given information does not provide enough details to calculate the speed of the train or the distance it covers in 15 seconds.',
 'B. 5,621',
 'C. 21%',
 'A. 7 hours 17 minutes',
 'C. 0,75',
 'A. 67,910',
 'B. 24',
 'D. 39870',
 'A. 70',
 'D. Five tenths.',
 'B. 15%',
 'A. \\frac{1}{8}',
 'C. 350',
 'A. 5,190',
 'D. 0,018 hours',
 'B. 3,6992',
 'C. 5 000',
 'C. 37,4',
 'D. 10,5',
 'D. 3,05',
 'C. 90 minutes',
 'B. 30 dm^{2}',
 'A. 9,42 dm',
 'A. 1,14 cm^{2}',
 'A. 1380 dm^{3}',
 'C. 46%',
 'A. 3,76',
 'D. 201,07',
 'D. 0,8',
 'D. 40 minutes',
 'B. 0,75',
 'C. 30 minutes',
 'D. 123,4',
 'C. 9 minutes 36 seconds',
 'B. 3,048',
 'A. 270 students',
 'B. \\frac{1}{2}',
 'D. 80',
 'D. 81 cm',
 'A. 70,765',
 'A. 10dm',
 'B. 55,0017',
 'C. 40%',
 'A. \\frac{9}{10}',
 'B. 4,8',
 'A. 25 m^{3}',
 'C. 6',
 'A. 850 m/minutes',
 'A. 9998',
 'B. 20',
 'A. 2,83',
 'A. 200 tạ',
 'C. 50,24 dm^{2}',
 'A. 10dm',

In [75]:
def transformer_to_dialog(example):
    """Turn one dataset row into a system/user/assistant dialog.

    Args:
        example: Row with "question", "A"-"D", "answer" and "explanation".

    Returns:
        ``{"dialog": [turns]}`` — a one-element list holding the three turns.
        With a non-empty explanation the assistant turn gives the explanation
        and then the answer; otherwise the user turn gets an extra "Which of
        the following is the correct choice:" lead-in and the assistant turn
        gives the answer only.
    """
    question = example["question"]
    # One "A. text" line per option, in A-D order.
    option_block = "\n".join(f"{letter}. {example[letter]}" for letter in "ABCD")
    answer = example["answer"]

    explanation = example["explanation"]
    if explanation in ["", None]:
        explanation = None

    system_turn = {
        "role": "system",
        "content": (
            "### System: You are a math expert assistant. Your mission is to help users understand "
            "and solve elementary math problems: You must strictly follow the multi choice question and the choices "
            "from users, First you need to think step by step and then give the answer choice, which is A, B, C or D "
            "corresponding with the choices."
        ),
    }

    if explanation:
        user_turn = {"role": "user", "content": f"Question: {question}\n{option_block}"}
        assistant_turn = {"role": "assistant", "content": f"Explanation: {explanation}\nAnswer: {answer}"}
    else:
        user_turn = {
            "role": "user",
            "content": f"Question: {question}\nWhich of the following is the correct choice: {option_block}",
        }
        assistant_turn = {"role": "assistant", "content": f"Answer: {answer}"}

    return {"dialog": [[system_turn, user_turn, assistant_turn]]}
# Literal markers wrapped around the system text, each user turn and each assistant turn.
SYS_PREFIX = "<<SYS>>"
SYS_POSTFIX = " <</SYS>> "
INST_PREFIX = "<s> [INST] "
INST_POSTFIX = " "
OUTPUT_PREFIX = "[/INST] "
OUTPUT_POSTFIX = "</s>"


def preprocess(data_point, tokenizer, cutoff_len):
    """Tokenize one dialog into ``input_ids`` and loss ``labels``.

    Args:
        data_point: Row whose "dialog" entry is a one-element list holding the
            list of ``{"role", "content"}`` turns.
        tokenizer: Callable taking a list of strings (with
            ``add_special_tokens=False``) and returning an object with an
            ``input_ids`` list of token-id lists.
        cutoff_len: Maximum sequence length; longer sequences are cut from the right.

    Returns:
        Dict with LongTensors "input_ids" and "labels" of equal shape. System
        and user tokens are labelled -100; assistant tokens are labelled with
        their own ids.

    Raises:
        AssertionError: if the first turn is an assistant turn or the last turn is not.
    """
    turns = data_point['dialog'][0]
    roles = [turn["role"] for turn in turns]
    messages = [turn["content"] for turn in turns]

    assert roles[0].upper() != "ASSISTANT"
    assert roles[-1].upper() == "ASSISTANT"

    starts_with_system = roles[0].upper() == "SYSTEM"

    # Wrap each turn in its markers; the system turn (if any) goes first.
    input_messages = [SYS_PREFIX + messages[0] + SYS_POSTFIX] if starts_with_system else []
    for role, text in zip(roles, messages):
        speaker = role.upper()
        if speaker == "ASSISTANT":
            input_messages.append(text + " " + OUTPUT_POSTFIX)
        elif speaker == "USER":
            input_messages.append(INST_PREFIX + text + INST_POSTFIX + OUTPUT_PREFIX)

    token_chunks = tokenizer(input_messages, add_special_tokens=False).input_ids

    input_ids, labels = [], []

    if starts_with_system:
        input_ids += token_chunks[0]
        labels += [-100] * len(token_chunks[0])

    # Roles and token chunks line up one-to-one; a system entry matches no branch here
    # because it was already added above.
    for role, chunk in zip(roles, token_chunks):
        speaker = role.upper()
        if speaker == "USER":
            input_ids += chunk
            labels += [-100] * len(chunk)
        elif speaker == "ASSISTANT":
            input_ids += chunk
            labels += chunk

    input_ids = torch.LongTensor(input_ids)[:cutoff_len]
    labels = torch.LongTensor(labels)[:cutoff_len]

    assert input_ids.shape == labels.shape

    return {
        "input_ids": input_ids,
        "labels": labels
    }

# Replace every original column with the single "dialog" column built by transformer_to_dialog.
dataset = eng_dataset.map(transformer_to_dialog, remove_columns=eng_dataset["train"].column_names)

train_dialogs = dataset["train"]
val_dialogs = dataset["test"]["dialog"]
# NOTE(review): the tokenized dataset is only displayed, not assigned to a name, and the
# shuffle is unseeded. preprocess cuts from the right, so with cutoff_len=100 long prompts
# may lose their assistant tokens — confirm the limit is intended.
train_dialogs.shuffle().map(preprocess, fn_kwargs={"tokenizer":tokenizer, "cutoff_len": 100})

Map:   0%|          | 0/38798 [00:00<?, ? examples/s]

Map: 100%|██████████| 38798/38798 [01:18<00:00, 493.09 examples/s]


Dataset({
    features: ['dialog', 'input_ids', 'labels'],
    num_rows: 38798
})

In [72]:
# First training dialog; the trailing [0] unwraps the one-element list that
# transformer_to_dialog puts around each dialog.
train_dialogs["dialog"][0][0]

[{'content': '### System: You are a math expert assistant. Your mission is to help users understand and solve elementary math problems: You must strictly follow the multi choice question and the choices from users, First you need to think step by step and then give the answer choice, which is A, B, C or D corresponding with the choices.',
  'role': 'system'},
 {'content': 'Question: “42 months =.........year” The appropriate number of electricity put into place is:\nA. 4\nB. 4,2\nC. 3,5\nD. 35',
  'role': 'user'},
 {'content': 'Explanation: 42 months = 3.5 years.\nAnswer: C. 3,5',
  'role': 'assistant'}]

In [None]:
# Full option name instead of the abbreviated "colwidth" pattern.
pd.set_option("display.max_colwidth", None)
temp = pd.read_csv("submissions/submission_9.csv", usecols=["id", "answer"])
# Align the previous answers on question id rather than on row position, so a different row
# order (or a missing row) in the old file cannot pair an answer with the wrong question.
df["old_answer"] = df["id"].map(temp.set_index("id")["answer"])
df[["question", "answer", "old_answer"]].sample(50)

Unnamed: 0,question,answer,old_answer
151,Which number does 0.75 have to be multiplied by to get 7.5? Let's circle the letter before the correct result.,C. 10,B. 100
32,The largest of the numbers: 0.79; 0.789; 0.709; 0.8 is:,"D. 0,8","D. 0,8"
82,The number 9.6 written as a mixed number is :,A. 9 \frac{6}{10},A. 9 \frac{6}{10}
80,Calculation: 1286.35 + 534.85 has the result:,"A. 1821,2","A. 1821,2"
101,Write \frac{5}{10} as a decimal:,"C. 0,05","B. 0,5"
112,3 \frac{9}{100} written as a decimal is:,"D. 3,90","A. 3,900"
100,2 dam^{2} 49 m^{2} =.........m^{2},A. 2049,B. 2490
62,For 2km 257m =...... km. The number filled in the dot place is,"D. 2,257","B. 2,257"
156,The percentage ratios of 16 and 50 are:,"B. 3,2%","B. 3,2%"
41,0.2 m^{3} times 25 dm^{3} is:,"B. 0,8","B. 0,8"


In [None]:
# Write the id/answer pairs for this run; VER (set in the first cell) selects the file name.
df[["id", "answer"]].to_csv(f"submissions/submission_{VER}.csv", index=False)