In [67]:
from datasets import load_dataset
from datasets import get_dataset_config_names
import pandas as pd
# Fetch a single FLORES-200 configuration (English -> French) from the Hub.
# NOTE(review): `dataset` is rebound inside the per-pair loop further down,
# so this object is only available until that cell runs.
dataset = load_dataset(path="facebook/flores", name="eng_Latn-fra_Latn")

In [68]:
# List every configuration published for facebook/flores.
# Only the dataset path is passed: the second positional parameter of
# get_dataset_config_names is `revision` (a git revision of the dataset repo),
# not a configuration name, so a value such as "all" there would request a
# revision that does not exist.
flores_configs = get_dataset_config_names("facebook/flores")
print(f"facebook/flores contains {len(flores_configs)} language pairs")

# Keep only the "<source>-eng_Latn" configurations, i.e. pairs whose target is
# English and whose source is not English.
flores_configs_non_english_to_english = [
    pair for pair in flores_configs
    if pair.endswith("-eng_Latn") and not pair.startswith("eng_Latn-")
]

Using the latest cached version of the module from /home/snt/.cache/huggingface/modules/datasets_modules/datasets/facebook--flores/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef (last modified on Mon Dec 23 13:52:01 2024) since it couldn't be found locally at facebook/flores, or remotely on the Hugging Face Hub.


facebook/flores contains 41617 language pairs


In [73]:
# Lookup table read from CSV; later cells use its 'Abreviation' and
# 'Full name' columns to turn language codes into readable names.
ab_to_full_name_df = pd.read_csv(
    filepath_or_buffer="results_translator - language pair.csv"
)

In [74]:
# Build the abbreviation -> full-name mapping once, instead of scanning the
# lookup table twice for every sentence. drop_duplicates keeps the first
# occurrence of each abbreviation, which matches what a `.values[0]` lookup
# on the filtered table would return.
abbreviation_to_full_name = (
    ab_to_full_name_df
    .drop_duplicates(subset="Abreviation")
    .set_index("Abreviation")["Full name"]
    .to_dict()
)

# One flat record per devtest sentence, across all non-English -> English pairs.
all_data = []
for pair in flores_configs_non_english_to_english:
    print (f"Processing {pair}")

    # Everything derived from the pair name is constant for the whole split,
    # so compute it once per pair rather than once per sentence.
    source_language, target_language = pair.split('-')[:2]

    # Fail with an explicit message (before downloading anything for this
    # pair) when the lookup table has no entry for one of the codes.
    missing_codes = [
        code for code in (source_language, target_language)
        if code not in abbreviation_to_full_name
    ]
    if missing_codes:
        raise KeyError(
            f"No 'Full name' entry for language code(s) {missing_codes} "
            f"(pair {pair}) in the abbreviation table"
        )
    source_full_name = abbreviation_to_full_name[source_language]
    target_full_name = abbreviation_to_full_name[target_language]

    source_column = f"sentence_{source_language}"
    target_column = f"sentence_{target_language}"

    dataset = load_dataset("facebook/flores", pair, split="devtest")
    for item in dataset:
        # Original example fields first, then the derived columns in a fixed order.
        all_data.append({
            **item,
            "language_pair": pair,
            "source_language": source_language,
            "target_language": target_language,
            "source_sentence": item[source_column],
            "target_sentence": item[target_column],
            "source_language_full_name": source_full_name,
            "target_language_full_name": target_full_name,
        })

Processing ace_Arab-eng_Latn
Processing ace_Latn-eng_Latn
Processing acm_Arab-eng_Latn
Processing acq_Arab-eng_Latn
Processing aeb_Arab-eng_Latn
Processing afr_Latn-eng_Latn
Processing ajp_Arab-eng_Latn
Processing aka_Latn-eng_Latn
Processing als_Latn-eng_Latn
Processing amh_Ethi-eng_Latn
Processing apc_Arab-eng_Latn
Processing arb_Arab-eng_Latn
Processing arb_Latn-eng_Latn
Processing ars_Arab-eng_Latn
Processing ary_Arab-eng_Latn
Processing arz_Arab-eng_Latn
Processing asm_Beng-eng_Latn
Processing ast_Latn-eng_Latn
Processing awa_Deva-eng_Latn
Processing ayr_Latn-eng_Latn
Processing azb_Arab-eng_Latn
Processing azj_Latn-eng_Latn
Processing bak_Cyrl-eng_Latn
Processing bam_Latn-eng_Latn
Processing ban_Latn-eng_Latn
Processing bel_Cyrl-eng_Latn
Processing bem_Latn-eng_Latn
Processing ben_Beng-eng_Latn
Processing bho_Deva-eng_Latn
Processing bjn_Arab-eng_Latn
Processing bjn_Latn-eng_Latn
Processing bod_Tibt-eng_Latn
Processing bos_Latn-eng_Latn
Processing bug_Latn-eng_Latn
Processing bul

In [75]:
# Turn the collected records into a frame and keep only the metadata,
# sentence and language-name columns, in this order.
selected_columns = [
    "id",
    "URL",
    "domain",
    "topic",
    "language_pair",
    "source_language",
    "target_language",
    "source_sentence",
    "target_sentence",
    "source_language_full_name",
    "target_language_full_name",
]
df = pd.DataFrame(all_data)[selected_columns]

### Translation task

In [76]:
from tqdm import tqdm

tqdm.pandas()

def generate_prompt(row):
    """Build the translation instruction prompt for a single row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'source_language_full_name', 'target_language_full_name' and
            'source_sentence'.

    Returns:
        The instruction line, a blank line, the source sentence and a
        trailing blank line, as one string.
    """
    source_name = row['source_language_full_name']
    target_name = row['target_language_full_name']
    sentence = row['source_sentence']
    prompt = f"""You are a professional translator. Your task is to translate the following text from {source_name} to {target_name}. Provide only the translation without any additional explanation.

{sentence}

"""
    return prompt

# Attach the translation prompt to every row (progress bar via tqdm), then
# export the row index, both language names and the prompt as JSON Lines.
df['input'] = df.progress_apply(generate_prompt, axis=1)
columns_to_save = ["index","source_language_full_name","target_language_full_name","input"]
# reset_index returns a new frame with the old index exposed as an "index" column.
save_df = df.reset_index(names="index")[columns_to_save]
save_df.to_json("data/flores-200-translation.jsonl", orient="records", lines=True)

100%|██████████| 205436/205436 [00:05<00:00, 34990.86it/s]


In [77]:
from tqdm import tqdm

tqdm.pandas()

def generate_understanding_prompt(row):
    """Build the "explain this text" instruction prompt for a single row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'source_language_full_name' and 'source_sentence'.

    Returns:
        The instruction line, the source sentence and the
        '### Explanation:' request, as one string.
    """
    language_name = row['source_language_full_name']
    sentence = row['source_sentence']
    prompt = f"""You are a professional linguist. Your task is to understand and explain the meaning of the following text written in {language_name}.

{sentence}

### Explanation:
Please explain the meaning of the text in your own words.
"""
    return prompt

# Replace the 'input' column with the explanation prompt for every row
# (progress bar via tqdm), then export the row index, both language names
# and the prompt as JSON Lines.
df['input'] = df.progress_apply(generate_understanding_prompt, axis=1)
columns_to_save = ["index","source_language_full_name","target_language_full_name","input"]
# reset_index returns a new frame with the old index exposed as an "index" column.
save_df = df.reset_index(names="index")[columns_to_save]
save_df.to_json("data/flores-200-explanation.jsonl", orient="records", lines=True)

100%|██████████| 205436/205436 [00:04<00:00, 44227.81it/s]
