# Imports

In [142]:
# Dependency installs are disabled: the shell commands are wrapped in a string literal,
# so nothing is installed and the cell merely evaluates to (and echoes) that string.
'''! pip install torch
! pip install tensorflow
! pip install transformers
! pip install flax'''

'! pip install torch\n! pip install tensorflow\n! pip install transformers\n! pip install flax'

In [2]:
import pandas as pd
from transformers import AutoTokenizer

# MMLU Translated

In [144]:
# Read the translated MMLU dump, then report its shape next to the number of
# distinct question ids before previewing the first rows.
df_translated = pd.read_json('../DATASETS/mmlu.json')
print((df_translated.shape, df_translated['question_id'].nunique()))
df_translated.head()

((15573, 11), 15573)


Unnamed: 0,question_id,question,question_status,choice_1,choice_1_status,choice_2,choice_2_status,choice_3,choice_3_status,choice_4,choice_4_status
0,0,Encontre o grau para a extensão de campo dada ...,ok,0,skipped,4,skipped,2,skipped,6,skipped
1,1,"Seja p = (1, 2, 5, 4)(2, 3) em S_5 . Encontre ...",ok,8,skipped,2,skipped,24,skipped,120,skipped
2,2,Encontre todos os zeros no campo finito indica...,ok,0,skipped,1,skipped,01,skipped,04,skipped
3,3,Afirmação 1 | Um grupo de fatores de um grupo ...,ok,"Verdade, Verdade",ok,"Falso, Falso",ok,"Verdadeiro, Falso",ok,"Falso, Verdadeiro",ok
4,4,Encontre o produto dos polinômios dados no ane...,ok,2x^2 + 5,ok,6x^2 + 4x + 6,ok,0,skipped,x^2 + 1,ok


## Error analysis

Verificar as questões cujo status é `skipped` e remover os IDs delas.

In [3]:
# Shape of the subset of rows whose question_status equals 'skipped'.
df_translated.query("question_status == 'skipped'").shape

NameError: name 'df_translated' is not defined

In [146]:
# Distinct question ids whose question_status is 'skipped'.
ids_to_skipped = (
    df_translated
    .query("question_status == 'skipped'")
    ['question_id']
    .unique()
)
ids_to_skipped

array([1044, 1086, 5751, 5788])

In [147]:
# Keep only the rows whose question_id is not among the skipped ids.
df_translated = df_translated.loc[~df_translated['question_id'].isin(ids_to_skipped)]
df_translated.shape

(15569, 11)

Verificar os casos de erro, pois eles invalidam a instância e, portanto, devem ser removidos.

In [148]:
# Rows where the question OR any of the four choices carries the 'erro' status.
# question_status is included so that a failed question translation is removed as
# well, matching the check applied to MMLU-PRO further down in this notebook.
errors_ids = df_translated[
    df_translated[
        ['question_status'] + [f'choice_{n}_status' for n in range(1, 5)]
    ].eq('erro').any(axis=1)
]

error_ids_list = errors_ids['question_id'].tolist()
print("IDs com erros nas escolhas:", error_ids_list, "Quantidade:", len(error_ids_list))

IDs com erros nas escolhas: [185, 709, 13346, 14396] Quantidade: 4


In [149]:
# Discard every row whose question_id was flagged with an 'erro' status.
df_translated = df_translated.loc[~df_translated['question_id'].isin(error_ids_list)]
df_translated.shape

(15565, 11)

# Format Figurative

In [150]:
def create_dataframe(df, col_name):
    """Build the long-format frame for one text column of a translated dataset.

    Args:
        df (pd.DataFrame): Frame holding a "question_id" column plus ``col_name``.
        col_name (str): Either "question" or "choice_<N>" with N a decimal number.

    Returns:
        pd.DataFrame: Columns "question_id", "ORIGINAL" (the values of ``col_name``)
        and "ID" (question_id rendered as text followed by "Q1" for the question
        or "C<N>" for choice N).

    Raises:
        KeyError: If ``col_name`` is neither "question" nor "choice_<N>", or if a
            required column is missing from ``df``.
    """
    # Derive the suffix from the column name instead of a fixed table, so the same
    # helper serves datasets with any number of choices (4 here, 10 for MMLU-PRO).
    if col_name == 'question':
        suffix = 'Q1'
    else:
        prefix, _, number = col_name.partition('_')
        if prefix != 'choice' or not number.isdecimal():
            raise KeyError(col_name)
        suffix = f'C{number}'

    return df[["question_id", col_name]].rename(columns={col_name: "ORIGINAL"}).assign(ID=lambda x: x["question_id"].astype(str) + suffix)

# One long-format frame per text column (question first, then choices 1-4),
# produced in the same order in which they are stacked below.
df_question, df_choice_1, df_choice_2, df_choice_3, df_choice_4 = (
    create_dataframe(df_translated, column)
    for column in ("question", "choice_1", "choice_2", "choice_3", "choice_4")
)

# Stack the five frames with a fresh index and keep only the two output columns.
final_df = pd.concat(
    [df_question, df_choice_1, df_choice_2, df_choice_3, df_choice_4],
    ignore_index=True,
)
final_df = final_df[['ID', 'ORIGINAL']]
(final_df.shape, final_df['ID'].nunique())

((77825, 2), 77825)

In [151]:
# Sanity check: the stacked frame must hold exactly five rows per source row
# (one for the question and one for each of the four choices).
assert len(final_df) == 5 * len(df_translated), "Erro: O número de linhas no dataframe final não corresponde ao esperado (5 vezes o original). Verifique o processamento dos dados."

In [152]:
# Preview the first rows of the stacked ID / ORIGINAL frame.
final_df.head()

Unnamed: 0,ID,ORIGINAL
0,0Q1,Encontre o grau para a extensão de campo dada ...
1,1Q1,"Seja p = (1, 2, 5, 4)(2, 3) em S_5 . Encontre ..."
2,2Q1,Encontre todos os zeros no campo finito indica...
3,3Q1,Afirmação 1 | Um grupo de fatores de um grupo ...
4,4Q1,Encontre o produto dos polinômios dados no ane...


# Estimate tokens

In [None]:
def calculate_tokens(df, columns, tokenizer_model="gpt2", tokenizer=None):
    """
    Estimate the number of tokens held by the given DataFrame columns.

    For every column in ``columns`` a ``TOKENS_EST_<COLUMN>`` column is added with the
    per-row token count, and ``TOTAL_TOKENS`` holds the row-wise sum of those counts.
    The input frame is left untouched; the counts are added to a copy.

    Args:
        df (pd.DataFrame): Input DataFrame containing the text data.
        columns (list): Names of the columns whose tokens are counted.
        tokenizer_model (str): Model name used to load the tokenizer when
            ``tokenizer`` is not supplied. Defaults to "gpt2".
        tokenizer: Optional already-loaded tokenizer exposing ``encode(text)``.
            Passing one avoids reloading the tokenizer on every call.

    Returns:
        pd.DataFrame: Copy of ``df`` extended with the token-count columns.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

    def count_tokens(text):
        # Empty values and non-string cells (None, NaN, numbers) count as zero tokens.
        if not text or not isinstance(text, str):
            return 0
        # No truncation: truncating would cap the count at the tokenizer's maximum
        # length and under-estimate long texts, defeating the purpose of an estimate.
        return len(tokenizer.encode(text))

    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = df.copy()

    token_columns = []
    for column in columns:
        token_column_name = f"TOKENS_EST_{column.upper()}"
        result[token_column_name] = result[column].apply(count_tokens)
        token_columns.append(token_column_name)

    total_tokens_column = "TOTAL_TOKENS"
    if token_columns:
        result[total_tokens_column] = result[token_columns].sum(axis=1)
    else:
        # No columns requested: the total is defined as zero for every row.
        result[total_tokens_column] = 0
    return result

# Token estimates for the stacked MMLU frame, followed by a shape / unique-id
# report and a preview of the first rows.
df_to_figurative_final = calculate_tokens(df=final_df, columns=['ID', 'ORIGINAL'])
print(f"To figurative:{df_to_figurative_final.shape}, ids únicos: {df_to_figurative_final['ID'].nunique()}")
df_to_figurative_final.head()

# Save dataframe

In [154]:
# Write the MMLU frame with its token estimates to CSV, leaving out the index column.
df_to_figurative_final.to_csv("../DATASETS/mmlu_to_figurative.csv", index=False)

# MMLU-PRO

In [5]:
# Read the translated MMLU-PRO dump, report its shape next to the number of
# distinct question ids, then preview the first two rows.
df_translated_pro = pd.read_json('../DATASETS/mmlu-pro.json')
print("MMLU-PRO", (df_translated_pro.shape, df_translated_pro['question_id'].nunique()))
df_translated_pro.head(2)

MMLU-PRO ((12102, 23), 12102)


Unnamed: 0,question_id,question,question_status,choice_1,choice_1_status,choice_2,choice_2_status,choice_3,choice_3_status,choice_4,...,choice_6,choice_6_status,choice_7,choice_7_status,choice_8,choice_8_status,choice_9,choice_9_status,choice_10,choice_10_status
0,0,O grupo simétrico $S_n$ tem $\n\factorial{n}$ ...,ok,0,skipped,30,skipped,3,skipped,10,...,50,skipped,2,skipped,100,skipped,20,skipped,5,skipped
1,1,Seja V o conjunto de todos os polinômios reais...,ok,ST + TS é o mapa identidade de V sobre si mesmo.,ok,TS = 0,ok,ST = 1,ok,ST - TS = 0,...,ST = 0,ok,ST = TS,ok,ST - TS é o mapa de identidade de V sobre si m...,ok,TS = T,ok,ST = S,ok


In [6]:
# Shape of the subset of MMLU-PRO rows whose question_status equals 'skipped'.
df_translated_pro.query("question_status == 'skipped'").shape

(4, 23)

In [8]:
# Distinct MMLU-PRO question ids whose question_status is 'skipped'.
ids_to_skipped = (
    df_translated_pro
    .query("question_status == 'skipped'")
    ['question_id']
    .unique()
)
ids_to_skipped

array([ 4790,  4982, 10362, 10592])

In [9]:
# Keep only the MMLU-PRO rows whose question_id is not among the skipped ids.
df_translated_pro = df_translated_pro.loc[~df_translated_pro['question_id'].isin(ids_to_skipped)]
df_translated_pro.shape

(12098, 23)

In [13]:
# Rows where the question or any of the ten choices carries the 'erro' status.
errors_ids = df_translated_pro[
    df_translated_pro[
        ['question_status'] + [f'choice_{n}_status' for n in range(1, 11)]
    ].eq('erro').any(axis=1)
]

error_ids_list = errors_ids['question_id'].tolist()
print("IDs com erros nas escolhas:", error_ids_list, "Quantidade:", len(error_ids_list))

IDs com erros nas escolhas: [3280, 3527, 6731] Quantidade: 3


In [14]:
# Discard every MMLU-PRO row whose question_id was flagged with an 'erro' status.
df_translated_pro = df_translated_pro.loc[~df_translated_pro['question_id'].isin(error_ids_list)]
df_translated_pro.shape

(12095, 23)

In [15]:
def create_dataframe(df, col_name):
    """Build the long-format frame for one text column of the MMLU-PRO data.

    Args:
        df (pd.DataFrame): Frame holding a "question_id" column plus ``col_name``.
        col_name (str): "question" or one of "choice_1" ... "choice_10".

    Returns:
        pd.DataFrame: Columns "question_id", "ORIGINAL" (the values of ``col_name``)
        and "ID" (question_id rendered as text followed by "Q1" for the question
        or "C<N>" for choice N).

    Raises:
        KeyError: If ``col_name`` is not one of the supported names or a required
            column is missing from ``df``.
    """
    # Suffix table: the question maps to Q1, choice_N maps to CN for N in 1..10.
    suffix_by_column = {'question': 'Q1'}
    suffix_by_column.update({f'choice_{n}': f'C{n}' for n in range(1, 11)})

    selected = df[["question_id", col_name]].rename(columns={col_name: "ORIGINAL"})
    suffix = suffix_by_column[col_name]
    return selected.assign(ID=selected["question_id"].astype(str) + suffix)

# One long-format frame per text column (question first, then choices 1-10),
# produced in the same order in which they are stacked below.
(
    df_question,
    df_choice_1, df_choice_2, df_choice_3, df_choice_4, df_choice_5,
    df_choice_6, df_choice_7, df_choice_8, df_choice_9, df_choice_10,
) = (
    create_dataframe(df_translated_pro, column)
    for column in (
        "question",
        "choice_1", "choice_2", "choice_3", "choice_4", "choice_5",
        "choice_6", "choice_7", "choice_8", "choice_9", "choice_10",
    )
)

# Stack the eleven frames with a fresh index and keep only the two output columns.
final_df_pro = pd.concat(
    [
        df_question,
        df_choice_1, df_choice_2, df_choice_3, df_choice_4, df_choice_5,
        df_choice_6, df_choice_7, df_choice_8, df_choice_9, df_choice_10,
    ],
    ignore_index=True,
)
final_df_pro = final_df_pro[['ID', 'ORIGINAL']]

(final_df_pro.shape, final_df_pro['ID'].nunique())

((133045, 2), 133045)

In [16]:
# Sanity check: the stacked frame must hold exactly eleven rows per source row
# (one for the question and one for each of the ten choices).
assert len(final_df_pro) == 11 * len(df_translated_pro), "Erro: O número de linhas no dataframe final não corresponde ao esperado (11 vezes o original). Verifique o processamento dos dados."

In [19]:
# Token estimates for the stacked MMLU-PRO frame, followed by a shape / unique-id
# report and a preview of the first rows.
df_to_figurative_final_pro = calculate_tokens(df=final_df_pro, columns=['ID', 'ORIGINAL'])
print(f"To figurative:{df_to_figurative_final_pro.shape}, ids únicos: {df_to_figurative_final_pro['ID'].nunique()}")
df_to_figurative_final_pro.head()

To figurative:(133045, 5), ids únicos: 133045


Unnamed: 0,ID,ORIGINAL,TOKENS_EST_ID,TOKENS_EST_ORIGINAL,TOTAL_TOKENS
0,0Q1,O grupo simétrico $S_n$ tem $\n\factorial{n}$ ...,3,61,64
1,1Q1,Seja V o conjunto de todos os polinômios reais...,3,110,113
2,2Q1,Seja A o conjunto de todos os pares ordenados ...,3,69,72
3,3Q1,Um tanque contém inicialmente uma solução sali...,3,165,168
4,4Q1,Um total de 30 jogadores jogarão basquete em u...,3,63,66


In [20]:
# Write the MMLU-PRO frame with its token estimates to CSV, leaving out the index column.
df_to_figurative_final_pro.to_csv("../DATASETS/mmlu_pro_to_figurative.csv", index=False)