In [81]:
import pandas as pd
import json
import os

In [82]:
# Load each language's training split into a dict of DataFrames.
# Every entry of ../data/train is expected to be a directory containing a
# `train.jsonl` file; the dict key is the entry name up to its first '.'.
lang_files = {}
for file in os.listdir('../data/train'):
    try:
        # Build the directory path once and derive the JSONL path from it.
        path = f'../data/train/{file}'
        jsonObj = pd.read_json(path_or_buf=f'{path}/train.jsonl', lines=True)
        lang_files[file.split(".")[0]] = jsonObj
    except Exception as e:
        # Best-effort load: report the entry that could not be read (e.g. a
        # stray file that is not a language directory) and keep going.
        print(file, e)


lang_files['es'].columns



Index(['id', 'source_locale', 'target_locale', 'source', 'target', 'entities',
       'from'],
      dtype='object')

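Accepts a question ID and a list of languages, and reports whether that ID is present in every one of those languages' datasets. With `write=True`, a row holding the English source and each language's tagged translation is also appended to `multilingual_df`.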

In [83]:
# Accumulator for the aligned rows: one row per question, one column per language.
multilingual_df = pd.DataFrame(columns=['q_id', 'en', 'ar', 'de', 'es', 'fr', 'it', 'ja'])
langs = ['ar', 'de', 'es', 'fr', 'it', 'ja']
ids_in_all = []

def id_in_all(question_id:str, languages:list=langs, exists_in_all:bool=True, write:bool=False):
    """Report whether `question_id` appears in every language in `languages`.

    Args:
        question_id: Value to look up in the `id` column of each language frame.
        languages: Keys of `lang_files` to check (defaults to `langs`).
        exists_in_all: Unused; kept only so existing callers that pass it
            keep working.
        write: When True and the ID is present in every language, append a row
            to the module-level `multilingual_df` holding the ID, the English
            source text, and each language's target text prefixed with
            ' <lang> '.

    Returns:
        True if the ID is present in every listed language, otherwise False.
    """
    for lang in languages:
        if question_id not in lang_files[lang]['id'].values:
            return False

    if write:
        row_data = {'q_id': question_id}

        # The English text is read from the 'source' column of the 'es' frame.
        # NOTE(review): this assumes the ID exists in lang_files['es'], which
        # holds for the driver loop (it iterates the 'es' IDs) — confirm for
        # other callers.
        es_df = lang_files['es']
        row_data['en'] = es_df.loc[es_df['id'] == question_id, 'source'].iloc[0]

        # Iterate the languages that were actually checked above, not the
        # global `langs`, so a subset call never looks up an unchecked frame.
        for lang in languages:
            lang_df = lang_files[lang]
            target_text = lang_df.loc[lang_df['id'] == question_id, 'target'].iloc[0]
            row_data[lang] = ' <' + lang + '> ' + target_text

        # Assign as a Series so the row is aligned to the frame's columns by
        # name; columns without a value in row_data are left as NaN.
        multilingual_df.loc[len(multilingual_df)] = pd.Series(row_data)

    return True
    

# Try every question ID from the 'es' frame; id_in_all appends a row to
# multilingual_df only for IDs found in all of the default languages.
# The loop variable is `question_id` so the builtin `id` is not shadowed
# for the rest of the kernel session.
for question_id in lang_files['es'].id.values:
    id_in_all(question_id, write=True)

print(multilingual_df.head())

       q_id                                                 en  \
0  f477742c  Which actor was Stephenie Meyer's first choice...   
1  650e81a3             What is the longest lake in the world?   
2  33ed28dd  Who was a member of the 2004 U.S. Olympic men'...   
3  9508d042  How many Pulitzer Prizes has Colson Whitehead ...   
4  47feccd4  Which movie had a higher lifetime gross, Juras...   

                                                  ar  \
0   <ar> مَن الممثل الذي وقع عليه الاختيار الأول ...   
1                      <ar> ما أطول بحيرة في العالم؟   
2   <ar> مَن كان عضوًا في فريق الولايات المتحدة ا...   
3   <ar> كم عدد جوائز "البوليتزر" التي فاز بها كو...   
4   <ar> أي فيلم حصل على إيرادات أعلى طول الحياة،...   

                                                  de  \
0   <de> Welcher Schauspieler war Stephanie Meyer...   
1         <de> Welcher See ist der längste der Welt?   
2   <de> Wer gehörte 2004 zur olympischen, Schwim...   
3   <de> Wie viele Pulitzer-Preise gewann 

In [84]:
# Join the per-language target columns into one space-separated string per row.
columns_to_combine = langs
per_language_text = multilingual_df.loc[:, columns_to_combine].astype(str)
multilingual_df['combined'] = per_language_text.agg(' '.join, axis=1)

# From here on only the English text and the combined translations are kept.
multilingual_df = multilingual_df.loc[:, ['en', 'combined']]

print("\nFirst few rows:", multilingual_df.head(), sep="\n")


First few rows:
                                                  en  \
0  Which actor was Stephenie Meyer's first choice...   
1             What is the longest lake in the world?   
2  Who was a member of the 2004 U.S. Olympic men'...   
3  How many Pulitzer Prizes has Colson Whitehead ...   
4  Which movie had a higher lifetime gross, Juras...   

                                            combined  
0   <ar> مَن الممثل الذي وقع عليه الاختيار الأول ...  
1   <ar> ما أطول بحيرة في العالم؟  <de> Welcher S...  
2   <ar> مَن كان عضوًا في فريق الولايات المتحدة ا...  
3   <ar> كم عدد جوائز "البوليتزر" التي فاز بها كو...  
4   <ar> أي فيلم حصل على إيرادات أعلى طول الحياة،...  


In [85]:
# Save the frame to CSV; index=True writes the row index as the first column.
multilingual_df.to_csv(path_or_buf='../data/multilingual.csv', index=True)