In [None]:
# Mount Google Drive in this Colab runtime; later cells read the dataset from
# paths under ./drive/MyDrive/.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted -- confirm against the google.colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install deep-translator
!pip install pandas
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import pandas as pd
from tqdm import tqdm, trange
from deep_translator import GoogleTranslator

In [None]:
# Dataset language code (the per-language folder name) -> language code that is
# handed to GoogleTranslator as source/target.
map_lang_data_trans = dict([
    ('en', 'en'),
    ('fr', 'fr'),
    ('ge', 'de'),
    ('it', 'it'),
    ('po', 'pl'),
    ('ru', 'ru'),
])

def get_all_languages(data_dir='./drive/MyDrive/Facultate/MasterAnul2/NLP/data/'):
    """Return the language codes available in the dataset folder.

    Every language is stored in its own sub-directory of ``data_dir``, so the
    language codes are simply the names of those sub-directories.

    Args:
        data_dir: Folder that contains one sub-directory per language.
            Defaults to the project's dataset location on the mounted Drive.

    Returns:
        Alphabetically sorted list of sub-directory names. Plain files and
        hidden entries (e.g. ``.ipynb_checkpoints``) are left out, since they
        are not languages and would break the per-language path building.

    Raises:
        FileNotFoundError: if ``data_dir`` does not exist.
    """
    return sorted(
        entry
        for entry in os.listdir(data_dir)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(data_dir, entry))
    )


def make_dataframe(input_folder, labels_fn=None, language=None):
    """Build a line-level DataFrame from a folder of article ``.txt`` files.

    Every non-blank line of every article becomes one row, indexed by
    ``(id, line)`` where ``line`` is the 1-based line number inside the file.

    Args:
        input_folder: Directory holding the article ``.txt`` files.
        labels_fn: Optional path to a tab-separated, header-less labels file
            whose first three columns are ``id``, ``line`` and ``labels``.
        language: Value stored in the ``language`` column of every row.

    Returns:
        Without ``labels_fn``: a DataFrame indexed by ``(id, line)`` with the
        columns ``text`` and ``language``.
        With ``labels_fn``: one row per *labelled* line (left join from the
        labels), with the columns ``text``, ``labels`` and ``language``.
        Labelled lines that have no matching non-blank text get NaN in
        ``text`` and ``language``.
    """
    # MAKE TXT DATAFRAME
    # Sorted so the row order is deterministic (os.listdir order is arbitrary)
    # and materialised so tqdm can show a total.
    article_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

    text = []
    for fil in tqdm(article_files):
        # The id is what remains after dropping the first 7 characters and the
        # extension (presumably files are named "article<ID>.txt" -- confirm).
        iD = fil[7:].split('.')[0]
        # os.path.join works whether or not input_folder ends with a slash;
        # the context manager closes the handle instead of leaking it.
        with open(os.path.join(input_folder, fil), 'r', encoding='utf-8') as handle:
            lines = handle.read().splitlines()
        text.extend((iD, line_no, line) for line_no, line in enumerate(lines, 1))

    df_text = pd.DataFrame(text, columns=['id', 'line', 'text'])
    # Integer keys so the index lines up with the integer keys pandas parses
    # from the labels file.
    df_text = df_text.astype({'id': 'int64', 'line': 'int64'})
    # Drop blank / whitespace-only lines.
    df_text = df_text[df_text.text.str.strip().str.len() > 0].copy()
    df_text = df_text.set_index(['id', 'line'])

    df_text['language'] = language

    df = df_text

    if labels_fn:
        # MAKE LABEL DATAFRAME
        # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
        # 2.0; `on_bad_lines='warn'` is its documented replacement (malformed
        # rows are reported and skipped instead of aborting the read).
        labels = pd.read_csv(labels_fn, sep='\t', encoding='utf-8', on_bad_lines='warn', header=None)
        labels = labels.rename(columns={0: 'id', 1: 'line', 2: 'labels'})
        labels = labels.set_index(['id', 'line'])
        # Keep only the lines that actually carry at least one label.
        labels = labels[labels.labels.notna()].copy()

        # JOIN
        df = labels.join(df_text)[['text', 'labels', 'language']]

    return df


def load_train_and_dev_all(random_state=None):
    """Load and shuffle the subtask-3 train and dev splits of every language.

    For each language returned by ``get_all_languages()`` the train and dev
    articles are read together with their labels via ``make_dataframe``; the
    per-language frames are then concatenated and shuffled.

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (default) keeps the previous
            non-deterministic shuffle.

    Returns:
        Tuple ``(train_dataset, dev_dataset)`` of shuffled DataFrames. Both are
        empty DataFrames when no language is found.
    """
    data_root = './drive/MyDrive/Facultate/MasterAnul2/NLP/data'

    # Collect the per-language frames and concatenate once at the end:
    # concatenating inside the loop re-copies the growing frame every time.
    frames_by_split = {'train': [], 'dev': []}
    for language in get_all_languages():
        print("language", language)
        for split, frames in frames_by_split.items():
            frames.append(make_dataframe(
                f"{data_root}/{language}/{split}-articles-subtask-3/",
                f"{data_root}/{language}/{split}-labels-subtask-3.txt",
                language=language))

    def _combine_and_shuffle(frames):
        # pd.concat raises on an empty list, so fall back to an empty frame.
        combined = pd.concat(frames) if frames else pd.DataFrame()
        return combined.sample(frac=1, random_state=random_state)

    train_dataset = _combine_and_shuffle(frames_by_split['train'])
    dev_dataset = _combine_and_shuffle(frames_by_split['dev'])
    return train_dataset, dev_dataset

In [None]:
# Load every language: df1 is the shuffled train split, df2 the shuffled dev split.
df1, df2 = load_train_and_dev_all()  

language en


446it [00:04, 98.25it/s] 


  test_dataset = pd.concat([test_dataset, make_dataframe(f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/train-articles-subtask-3/",
90it [00:00, 179.42it/s]


  dev_dataset = pd.concat([dev_dataset, make_dataframe(  f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/dev-articles-subtask-3/",


language fr


158it [00:00, 168.82it/s]
53it [00:00, 277.79it/s]


language ge


132it [00:01, 128.34it/s]
45it [00:00, 118.48it/s]


language it


227it [00:02, 105.17it/s]
76it [00:00, 287.96it/s]


language po


145it [00:00, 161.79it/s]
49it [00:00, 238.82it/s]


language ru


143it [00:00, 187.91it/s]
48it [00:00, 307.14it/s]


In [None]:
# Preview the combined train split (rows are indexed by article id and line number).
display(df1)

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels,language
id,line,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2553,9,Pieniądze z Funduszu Odbudowy nadal będą więc ...,"Appeal_to_Fear-Prejudice,Loaded_Language,Quest...",po
2524,2,Koronawirus – spiskowe teorie czy spiskowa pra...,Obfuscation-Vagueness-Confusion,po
2350,7,En manifestant contre Zemmour à chacun de ses ...,Appeal_to_Hypocrisy,fr
787002327,12,"One thing we know for sure is that, despite th...","Loaded_Language,Name_Calling-Labeling,Repetition",en
765029945,13,Every country arrests people. Just about every...,"Appeal_to_Fear-Prejudice,Repetition",en
...,...,...,...,...
2251,17,Wie reagieren die Regierungen auf diese Situat...,"Appeal_to_Hypocrisy,Name_Calling-Labeling",ge
23162,19,"Macron, le petit banquier d’affaires de la ban...","Appeal_to_Fear-Prejudice,Consequential_Oversim...",fr
26127,10,Noi SIAMO l'ombrello della Nato.,Conversation_Killer,it
2251,35,Dies ist die traurige Realität der Situation: ...,"Doubt,Name_Calling-Labeling",ge


In [None]:
# Preview the combined dev split (rows are indexed by article id and line number).
display(df2)

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels,language
id,line,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23210,5,"Des propos qui ne sont, pour ainsi dire, pas a...",Loaded_Language,fr
23161,11,Est-ce l’Union Européenne qui nous impose à no...,"Doubt,Loaded_Language,Questioning_the_Reputation",fr
24116,3,Как выяснило британское консервативное издание...,Doubt,ru
813953435,16,The figure is more than double the 25 per cent...,Loaded_Language,en
2498,29,— Сам этот термин «несостоявшееся государство»...,Doubt,ru
...,...,...,...,...
2250,27,„Empfang von 2 Dosen COVID-19 vax schützt zu k...,"Appeal_to_Authority,Doubt",ge
26219,107,"Non è possibile minimizzarlo, perché viene dop...",Repetition,it
2537,13,Jednak – jak stwierdził burmistrz – najważniej...,Exaggeration-Minimisation,po
2611,1,Washington è pronta a fare guerre di sanzioni ...,"Exaggeration-Minimisation,Name_Calling-Labeling",it


In [None]:
# Translate every labelled training line into each of the other dataset
# languages and append "<translated text>\t<labels>" to that language's file.
#
# Uses `map_lang_data_trans` from the cell that defines the loaders (it was
# duplicated here before).
# NOTE: files are opened in append mode, so re-running this cell adds the same
# rows again -- delete the translated_data files first for a clean run.

transl_data_root = "./drive/MyDrive/Facultate/MasterAnul2/NLP/translated_data"

# Loop-invariant: list the languages once instead of hitting Drive for every
# row. Entries without a translator code (stray files/folders) are ignored
# rather than raising KeyError half-way through the run.
target_languages = [lang for lang in get_all_languages() if lang in map_lang_data_trans]

# One translator per (source, target) pair, created on first use and reused.
translators = {}

# Tabs / line breaks inside a translation would corrupt the one-record-per-line
# TSV output, so they are turned into plain spaces.
tsv_unsafe_chars = str.maketrans('\t\r\n', '   ')

for index, row in tqdm(df1.iterrows(), total=len(df1)):
    source_text = row['text']
    # The labels/text left join can leave NaN text for labelled lines with no
    # matching article line; there is nothing to translate for those.
    if not isinstance(source_text, str) or not source_text.strip():
        continue

    for dest_lang in target_languages:
        if dest_lang == row['language']:
            continue

        gt_dest_lang = map_lang_data_trans[dest_lang]
        gt_source_lang = map_lang_data_trans[row['language']]
        pair = (gt_source_lang, gt_dest_lang)
        if pair not in translators:
            translators[pair] = GoogleTranslator(source=gt_source_lang, target=gt_dest_lang)

        translated_text = translators[pair].translate(source_text)
        # Skip empty results instead of writing the literal string "None".
        if not translated_text:
            continue
        translated_text = translated_text.translate(tsv_unsafe_chars)

        transl_data_path = f"{transl_data_root}/{dest_lang}/translated_data.txt"
        os.makedirs(os.path.dirname(transl_data_path), exist_ok=True)
        # Explicit UTF-8: the translations contain non-ASCII text and must not
        # depend on the platform's default encoding.
        with open(transl_data_path, 'a', encoding='utf-8') as f:
            f.write(f"{translated_text}\t{row['labels']}\n")