# Data Pre-processing

In [27]:
# Read the data:
import json

# JSON is UTF-8 by specification. Pass the encoding explicitly so the
# (presumably Vietnamese) text decodes identically on every platform instead
# of relying on the locale default used by open().
with open('data/math_train.json', encoding='utf-8') as f:
    math_train = json.load(f)
with open('data/math_test.json', encoding='utf-8') as f:
    math_test = json.load(f)

In [28]:
def format_data_point(data_point):
    """Flatten one training record into a prompt string plus its answer.

    Args:
        data_point: Mapping with ``"question"``, ``"choices"`` (an iterable
            of strings) and ``"answer"`` entries.

    Returns:
        A ``(formatted_text, answer)`` tuple.
    """
    question_text = data_point["question"]
    joined_choices = " ".join(data_point["choices"])
    prompt = "Question: {} Choices: {}".format(question_text, joined_choices)
    return prompt, data_point['answer']

# One (formatted_text, answer) tuple per training record, in file order.
formatted_train_data = list(map(format_data_point, math_train["data"]))

In [29]:
def format_data_point_test(data_point):
    """Flatten one test record into its id plus a prompt string.

    Args:
        data_point: Mapping with ``"question"``, ``"choices"`` (an iterable
            of strings) and ``"id"`` entries.

    Returns:
        An ``(id, formatted_text)`` tuple.
    """
    question_text = data_point["question"]
    joined_choices = " ".join(data_point["choices"])
    prompt = "Question: {} Choices: {}".format(question_text, joined_choices)
    # Local renamed so it no longer shadows the built-in id().
    record_id = data_point['id']
    return record_id, prompt

# One (id, formatted_text) tuple per test record, in file order.
formatted_test_data = list(map(format_data_point_test, math_test['data']))

# Add the Model

In [30]:

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.generation import GenerationConfig
import torch
import re
# Load the VinAI vi2en translation checkpoint and its (slow) tokenizer.
tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", use_fast=False)
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2")

# Keep using GPU index 2 when the host has it, but fall back to the first GPU
# or the CPU so the notebook still runs on machines with fewer than three
# CUDA devices instead of failing on an invalid device ordinal.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    device_vi2en = torch.device("cuda:2")
elif torch.cuda.is_available():
    device_vi2en = torch.device("cuda:0")
else:
    device_vi2en = torch.device("cpu")
model_vi2en.to(device_vi2en)

# NOTE(review): not referenced anywhere in the visible notebook; kept in case
# cells outside this view rely on it.
RE_MATCH=r'((?:[\.,][0-9])+)'

def translate_vi2en(vi_texts: str) -> str:
    """Translate text with the module-level vi2en model.

    The input is tokenized with padding, moved to ``device_vi2en`` and decoded
    with 5-beam search, one returned sequence per input, starting from the
    ``en_XX`` language token.

    Args:
        vi_texts: Text to translate. Annotated as ``str``; the padded,
            batch-decoded pipeline presumably also accepts a list of strings
            (TODO confirm).

    Returns:
        All decoded outputs joined by a single space.
    """
    encoded = tokenizer_vi2en(vi_texts, padding=True, return_tensors="pt")
    encoded = encoded.to(device_vi2en)

    generation_kwargs = dict(
        decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True,
    )
    generated_ids = model_vi2en.generate(**encoded, **generation_kwargs)

    decoded = tokenizer_vi2en.batch_decode(generated_ids, skip_special_tokens=True)
    return ' '.join(decoded)

## Translate train data

In [31]:
import copy
import re

def preserve_numbers_translate(text, translate_function):
    """Translate ``text`` while keeping its numeric literals untouched.

    Each number is swapped for a ``{}`` placeholder before translation and
    spliced back, in order of appearance, afterwards.

    Unlike a plain pairing of split parts with numbers, nothing is silently
    discarded when the translator changes the number of placeholders:

    * surplus ``{}`` in the translation are left in place, so the translated
      text after them is kept;
    * numbers whose placeholder disappeared are appended, space-separated, at
      the end of the result.

    Args:
        text: Source string. A comma between digits (``2,5``) is rewritten as
            a dot (``2.5``) before numbers are extracted.
        translate_function: Callable that receives the placeholder string and
            returns its translation.

    Returns:
        The translated string with numbers restored, runs of spaces collapsed
        to one and surrounding whitespace stripped.
    """
    # A number is a digit run optionally followed by digits, whitespace or dots,
    # so trailing spaces/periods travel with the number and are restored with it.
    number_pattern = r'\d+[\d\s.]*'

    text = re.sub(r'(\d+),(\d+)', r'\1.\2', text)
    numbers = re.findall(number_pattern, text)
    placeholder_text = re.sub(number_pattern, '{}', text)
    translated = translate_function(placeholder_text)

    # Fill placeholders left to right; once the numbers run out, keep the
    # placeholder text as-is rather than dropping what follows it.
    remaining = iter(numbers)
    restored = re.sub(r'\{\}', lambda match: next(remaining, match.group(0)), translated)

    # Placeholders lost in translation: keep their numbers instead of losing them.
    leftover = list(remaining)
    if leftover:
        restored = ' '.join([restored] + leftover)

    restored = re.sub(' +', ' ', restored)
    return restored.strip()

def remove_latex_pos(text):
    """Return ``text`` with every literal pos-marker token removed."""
    # The exact token that is stripped; all occurrences are removed.
    marker = '{\\pos [1, 2]}'
    return ''.join(text.split(marker))

def _split_option_label(option_text, fallback_width):
    """Split an option string into ``(label_prefix, remaining_text)``.

    The prefix is a leading letter followed by ``.`` or ``)`` together with
    any whitespace after it, so the separator between label and text is kept
    verbatim whether the data uses ``"A.text"`` or ``"A. text"``. When no such
    label is found, the first ``fallback_width`` characters are used as the
    prefix (the previous fixed-width behaviour).
    """
    label = re.match(r'[A-Za-z][.)]\s*', option_text)
    cut = label.end() if label else fallback_width
    return option_text[:cut], option_text[cut:]


eng_math_train = []
for record in math_train['data']:
    # Deep copy so the raw records in math_train are left untouched.
    translated_record = copy.deepcopy(record)
    translated_record['question'] = remove_latex_pos(
        preserve_numbers_translate(translated_record['question'], translate_vi2en)
    )

    for j, choice in enumerate(translated_record['choices']):
        # Translate only the text; the option label (and its separator) is
        # re-attached unchanged. The translated text comes back stripped, so
        # the separator has to live in the label prefix to survive.
        label, choice_text = _split_option_label(choice, 2)
        translated_choice = remove_latex_pos(preserve_numbers_translate(choice_text, translate_vi2en))
        translated_record['choices'][j] = label + translated_choice

    if 'explanation' in translated_record:
        translated_record['explanation'] = remove_latex_pos(
            preserve_numbers_translate(translated_record['explanation'], translate_vi2en)
        )
    else:
        # Handle the case where 'explanation' key is missing
        translated_record['explanation'] = "Easy question"

    # The answer is split the same way as the choices, so it is translated
    # from the same text as the choice it repeats.
    label, answer_text = _split_option_label(translated_record['answer'], 2)
    translated_record['answer'] = label + remove_latex_pos(
        preserve_numbers_translate(answer_text, translate_vi2en)
    )

    eng_math_train.append(translated_record)

In [35]:
# Save data

# Wrap the translated training records together with their count.
data = dict(__count__=len(eng_math_train), data=eng_math_train)

# Save to JSON (3-space indent for readability).
with open("data/eng_math_train.json", "w") as out_file:
    out_file.write(json.dumps(data, indent=3))

## Translate test data

In [33]:
import copy
import re

def preserve_numbers_translate(text, translate_function):
    """Translate ``text`` while keeping its numeric literals untouched.

    NOTE: this re-defines the function of the same name from the
    train-translation cell; whichever cell ran last wins.

    Each number is swapped for a ``{}`` placeholder before translation and
    spliced back, in order of appearance, afterwards.

    Unlike a plain pairing of split parts with numbers, nothing is silently
    discarded when the translator changes the number of placeholders:

    * surplus ``{}`` in the translation are left in place, so the translated
      text after them is kept;
    * numbers whose placeholder disappeared are appended, space-separated, at
      the end of the result.

    Args:
        text: Source string. A comma between digits (``2,5``) is rewritten as
            a dot (``2.5``) before numbers are extracted.
        translate_function: Callable that receives the placeholder string and
            returns its translation.

    Returns:
        The translated string with numbers restored, runs of spaces collapsed
        to one and surrounding whitespace stripped.
    """
    # A number is a digit run optionally followed by digits, whitespace or dots,
    # so trailing spaces/periods travel with the number and are restored with it.
    number_pattern = r'\d+[\d\s.]*'

    text = re.sub(r'(\d+),(\d+)', r'\1.\2', text)
    numbers = re.findall(number_pattern, text)
    placeholder_text = re.sub(number_pattern, '{}', text)
    translated = translate_function(placeholder_text)

    # Fill placeholders left to right; once the numbers run out, keep the
    # placeholder text as-is rather than dropping what follows it.
    remaining = iter(numbers)
    restored = re.sub(r'\{\}', lambda match: next(remaining, match.group(0)), translated)

    # Placeholders lost in translation: keep their numbers instead of losing them.
    leftover = list(remaining)
    if leftover:
        restored = ' '.join([restored] + leftover)

    restored = re.sub(' +', ' ', restored)
    return restored.strip()

def remove_latex_pos(text):
    """Return ``text`` with every literal pos-marker token removed."""
    # The exact token that is stripped; all occurrences are removed.
    marker = '{\\pos [1, 2]}'
    return ''.join(text.split(marker))

def _split_option_label(option_text, fallback_width):
    """Split an option string into ``(label_prefix, remaining_text)``.

    The prefix is a leading letter followed by ``.`` or ``)`` together with
    any whitespace after it, so no character of the option text is cut off
    when the label has no trailing space (``"A.text"``) and the separator is
    kept when it has one (``"A. text"``). When no such label is found, the
    first ``fallback_width`` characters are used as the prefix (the previous
    fixed-width behaviour).
    """
    label = re.match(r'[A-Za-z][.)]\s*', option_text)
    cut = label.end() if label else fallback_width
    return option_text[:cut], option_text[cut:]


eng_math_test = []
for record in math_test['data']:
    # Deep copy so the raw records in math_test are left untouched.
    translated_record = copy.deepcopy(record)
    translated_record['question'] = remove_latex_pos(
        preserve_numbers_translate(translated_record['question'], translate_vi2en)
    )
    for j, choice in enumerate(translated_record['choices']):
        # Translate only the text; the option label (and its separator) is
        # re-attached unchanged.
        label, choice_text = _split_option_label(choice, 3)
        translated_choice = remove_latex_pos(preserve_numbers_translate(choice_text, translate_vi2en))
        translated_record['choices'][j] = label + translated_choice
    eng_math_test.append(translated_record)

In [34]:
# Save data

# Wrap the translated test records together with their count.
data = dict(__count__=len(eng_math_test), data=eng_math_test)

# Save to JSON (3-space indent for readability).
with open("data/eng_math_test.json", "w") as out_file:
    out_file.write(json.dumps(data, indent=3))