In [58]:
import pandas as pd
import os

In [59]:
# Use arabic as master df, we will add the other languages to it
master_df = pd.read_csv('../Archive/en-ar-tat.tsv', sep='\t')
master_df.columns=['source_id', 'en', 'trans_id', 'ar']

for file in os.listdir('../Archive/'):
    lang = file.split('-')[1]
    if lang == 'ar':
        continue

    df = pd.read_csv(f'../Archive/{file}', sep='\t', on_bad_lines='skip')
    df.columns=['source_id', 'en', 'trans_id', lang]

    master_df = master_df.merge(
            df[['source_id', lang]], 
            on='source_id', 
            how='left',
            suffixes=('', f'_{lang}')
        )
    print(lang, ' added to master df')

print(master_df.head())



es  added to master df
de  added to master df
ja  added to master df
fr  added to master df
it  added to master df
   source_id                    en  trans_id            ar  \
0       1276  Let's try something.    461821  لنجرب شيئاً!   
1       1276  Let's try something.    461821  لنجرب شيئاً!   
2       1276  Let's try something.    461821  لنجرب شيئاً!   
3       1276  Let's try something.    461821  لنجرب شيئاً!   
4       1276  Let's try something.    461821  لنجرب شيئاً!   

                      es                              de   ja  \
0  Permíteme hacer algo.   Lasst uns etwas ausprobieren!  NaN   
1  Permíteme hacer algo.   Lasst uns etwas ausprobieren!  NaN   
2  Permíteme hacer algo.  Lass uns mal was ausprobieren.  NaN   
3  Permíteme hacer algo.  Lass uns mal was ausprobieren.  NaN   
4       Intentemos algo.   Lasst uns etwas ausprobieren!  NaN   

                        fr   it  
0  Tentons quelque chose !  NaN  
1  Essayons quelque chose.  NaN  
2  Tentons quelque 

Remove all rows with duplicate source IDs

In [60]:
master_df = master_df.drop_duplicates(subset=['source_id'], keep='first')
master_df.drop(columns=['trans_id'], inplace=True)

print(master_df.head())



     source_id                                                en  \
0         1276                              Let's try something.   
8         1277                            I have to go to sleep.   
200       1280  Today is June 18th and it is Muiriel's birthday!   
236       1282                                Muiriel is 20 now.   
252       1283                        The password is "Muiriel".   

                                                    ar  \
0                                         لنجرب شيئاً!   
8                                        عليّ أن أنام.   
200  اليوم هو الثامن عشر من يونيو و هو عيد ميلاد مو...   
236                       عمر ميوريل عشرون عاماً الآن.   
252                              كلمة السر هي "موريل".   

                                                    es  \
0                                Permíteme hacer algo.   
8                             Tengo que irme a dormir.   
200  ¡Hoy es 18 de junio y es el cumpleaños de Muir...   
236       

In [61]:
for column in master_df.columns:
    if column not in ['source_id', 'en']:
        master_df[column] = '<' + column + '> ' + master_df[column].astype(str)

print(master_df.head())

     source_id                                                en  \
0         1276                              Let's try something.   
8         1277                            I have to go to sleep.   
200       1280  Today is June 18th and it is Muiriel's birthday!   
236       1282                                Muiriel is 20 now.   
252       1283                        The password is "Muiriel".   

                                                    ar  \
0                                    <ar> لنجرب شيئاً!   
8                                   <ar> عليّ أن أنام.   
200  <ar> اليوم هو الثامن عشر من يونيو و هو عيد ميل...   
236                  <ar> عمر ميوريل عشرون عاماً الآن.   
252                         <ar> كلمة السر هي "موريل".   

                                                    es  \
0                           <es> Permíteme hacer algo.   
8                        <es> Tengo que irme a dormir.   
200  <es> ¡Hoy es 18 de junio y es el cumpleaños de...   
236       

In [64]:
master_df['combined'] = master_df[master_df.columns[2:]].astype(str).agg(' '.join, axis=1)
master_df = master_df[['en', 'combined']]

print(master_df.head())

master_df.to_csv('../data/tatoeba.csv', index=False)


                                                   en combined
0                                Let's try something.         
8                              I have to go to sleep.         
200  Today is June 18th and it is Muiriel's birthday!         
236                                Muiriel is 20 now.         
252                        The password is "Muiriel".         
