In [54]:
# Mount Google Drive at /content/drive in the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
import os
import re
import glob
import pandas as pd
from natsort import natsorted
import unicodedata

### Variabili Globali

In [63]:
# Folder layout. root_folder is a relative path, so the absolute paths built
# below depend on the working directory the notebook runs from.
root_folder = 'drive/MyDrive/BERT/'
data_folder_ita = 'data/Traduzioni/'
data_folder_dante = 'data/Dante/'
data_folder_out = 'data/'

# Absolute input/output paths
DATA_PATH_IN_ITA = os.path.abspath(os.path.join(root_folder, data_folder_ita))
DATA_PATH_IN_DANTE = os.path.abspath(os.path.join(root_folder, data_folder_dante))
DATA_PATH_OUT = os.path.abspath(os.path.join(root_folder, data_folder_out))

# Translation files ('<language>-italiano.tsv') used for training
data_filename = ['francese-italiano.tsv',
                 'inglese-italiano.tsv',
                 'olandese-italiano.tsv',
                 'spagnolo-italiano.tsv',
                 'svedese-italiano.tsv',
                 'tedesco-italiano.tsv']

# Dante ("fiore") CSV files used for training
data_filename_dante = ['fiore_it.csv',
                       'fiore_it_en.csv',
                       'fiore_en_de.csv',
                       'fiore_en_fr.csv']

# Column names of the combined training frame
ORIGINAL_COLUMN_ITA = 'Original'
TRANSLATE_COLUMN_ITA = 'Translate'

# Column names as they appear in the Dante CSV files. They are inverted with
# respect to the names above: the file's 'Original' column is used as the
# translation side and its 'Translate' column as the original side.
TRANSLATE_COLUMN_DANTE = 'Original'
ORIGINAL_COLUMN_DANTE = 'Translate'
# Extra column used as a second "original" text for the same row
# (presumably a GPT-generated translation -- TODO confirm).
ORIGINAL_COLUMN_DANTE_GPT = 'Translate_GPT_1'

# Column holding the dataset tag passed to the loaders ('ITA' / 'DANTE')
TYPE_COLUMN = 'Type'

# Output CSV file name, and the maximum number of rows kept from each
# translation file.
file_out = 'train_data.csv'
NUM_SAMPLES = 100000

In [57]:
def verifica_correttezza(df):
  """Print a quick sanity report for a parallel-sentence DataFrame.

  The report contains the number of rows, the shortest and the longest entry
  of the 'Original' and 'Translate' columns, and a separator line.

  Missing cells (NaN/None) are skipped when searching for the shortest and
  longest entry, and an empty value is printed for a column that holds no
  text at all, so the report does not raise on incomplete or empty frames.

  Args:
    df: DataFrame with 'Original' and 'Translate' columns holding strings.

  Raises:
    KeyError: if either column is missing from ``df``.
  """
  # Drop missing cells: pandas stores them as float NaN, which has no len().
  original = df['Original'].dropna()
  translate = df['Translate'].dropna()

  # default='' keeps min()/max() from raising ValueError on an empty column.
  shortest_original = min(original, key=len, default='')
  shortest_translate = min(translate, key=len, default='')
  longest_original = max(original, key=len, default='')
  longest_translate = max(translate, key=len, default='')

  print(f'Esempi nel Dataset                 : {len(df)}')
  print(f'Frase più corta in Original        : {shortest_original}')
  print(f'Frase più corta in Translate       : {shortest_translate}')
  print(f'Frase più lunga in Original        : {longest_original}')
  print(f'Frase più lunga in Translate       : {longest_translate}')
  print('---------------------------------------------------------------------------------------')

In [58]:
def union_df_train_ita(data_filenames, path_file, type_dataset, num_samples=None):
  """Load the tab-separated translation files and stack them into one frame.

  For every file, columns 1 and 3 are read as the original sentence and its
  translation. Rows are sorted by the length of the original sentence
  (longest first), at most ``num_samples`` rows are kept, and each kept row
  is tagged with ``type_dataset``. A sanity report is printed per file.

  Args:
    data_filenames: iterable of file names located in ``path_file``.
    path_file: folder containing the files.
    type_dataset: value written in the TYPE_COLUMN of every row.
    num_samples: maximum number of rows kept per file; defaults to the
      module-level NUM_SAMPLES when omitted.

  Returns:
    DataFrame with ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA and TYPE_COLUMN
    columns and a fresh 0..n-1 index. It is empty (columns only) when
    ``data_filenames`` is empty.
  """
  if num_samples is None:
    num_samples = NUM_SAMPLES

  columns = [ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA, TYPE_COLUMN]
  text_columns = [ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA]

  # Collect the per-file frames and concatenate once at the end, instead of
  # re-copying the growing result on every iteration.
  frames = []

  for filename in data_filenames:
    filename_path = os.path.abspath(os.path.join(path_file, filename))

    # NOTE(review): read_csv applies its default quote handling here. If the
    # files contain unbalanced double quotes, rows may be merged; check the
    # data and consider quoting=csv.QUOTE_NONE.
    df_train = pd.read_csv(
      filename_path,
      sep="\t",
      header=None,
      names=text_columns,
      usecols=[1,3]
    )

    df_train = (
      df_train
      # A pair with a missing side cannot be used and would break the
      # length-based steps below.
      .dropna(subset=text_columns)
      .sort_values(by=ORIGINAL_COLUMN_ITA,
                   key=lambda x: x.str.len(),
                   ascending=False)
      .iloc[:num_samples]
      # assign() returns a new frame, so no column is written on a slice.
      .assign(**{TYPE_COLUMN: type_dataset})
    )

    print(f'File name                          : {filename}')
    verifica_correttezza(df_train)

    frames.append(df_train[columns])

  if not frames:
    return pd.DataFrame(columns=columns)

  return pd.concat(frames, ignore_index=True)

In [59]:
def union_df_train_dante(data_filenames, path_file, type_dataset):
  """Load the Dante CSV files and stack them into one frame.

  Every file is read through the ORIGINAL_COLUMN_DANTE and
  TRANSLATE_COLUMN_DANTE columns. Files whose name contains "fiore" are also
  read through ORIGINAL_COLUMN_DANTE_GPT, and each of their rows is emitted
  twice: once with the ORIGINAL_COLUMN_DANTE text and once with the
  ORIGINAL_COLUMN_DANTE_GPT text, both paired with the same
  TRANSLATE_COLUMN_DANTE text.

  Columns are then renamed to the output schema, incomplete pairs are
  dropped, rows are tagged with ``type_dataset`` and a sanity report is
  printed for every file.

  Args:
    data_filenames: iterable of file names located in ``path_file``.
    path_file: folder containing the files.
    type_dataset: value written in the TYPE_COLUMN of every row.

  Returns:
    DataFrame with ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA and TYPE_COLUMN
    columns and a fresh 0..n-1 index. It is empty (columns only) when
    ``data_filenames`` is empty.
  """
  columns = [ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA, TYPE_COLUMN]
  text_columns = [ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA]

  # Mapping from the column names used in the files to the output schema.
  to_output_names = {ORIGINAL_COLUMN_DANTE: ORIGINAL_COLUMN_ITA,
                     TRANSLATE_COLUMN_DANTE: TRANSLATE_COLUMN_ITA}

  # Collect the per-file frames and concatenate once at the end.
  frames = []

  for filename in data_filenames:
    filename_path = os.path.abspath(os.path.join(path_file, filename))

    # Only files named "*fiore*" are read with the extra GPT column.
    has_gpt_column = "fiore" in filename

    source_columns = [ORIGINAL_COLUMN_DANTE, TRANSLATE_COLUMN_DANTE]
    if has_gpt_column:
      source_columns.append(ORIGINAL_COLUMN_DANTE_GPT)

    df_train = pd.read_csv(
      filename_path,
      usecols=source_columns,
      dtype={column: str for column in source_columns}
    )

    if has_gpt_column:
      # Stack the GPT text under the regular one, both paired with the same
      # TRANSLATE_COLUMN_DANTE text.
      df_train = pd.concat([
        df_train[[ORIGINAL_COLUMN_DANTE, TRANSLATE_COLUMN_DANTE]],
        df_train[[ORIGINAL_COLUMN_DANTE_GPT, TRANSLATE_COLUMN_DANTE]]
          .rename(columns={ORIGINAL_COLUMN_DANTE_GPT: ORIGINAL_COLUMN_DANTE})
      ])

    df_train = (
      df_train
      .rename(columns=to_output_names)
      # A pair with a missing side cannot be used and would break the
      # length-based sanity report.
      .dropna(subset=text_columns)
      .assign(**{TYPE_COLUMN: type_dataset})
    )

    # Printed for every file so each report can be attributed to its source.
    print(f'File name                          : {filename}')
    verifica_correttezza(df_train)

    # From here on the frame uses the output schema, so select through the
    # output column names rather than the source ones.
    frames.append(df_train[columns])

  if not frames:
    return pd.DataFrame(columns=columns)

  return pd.concat(frames, ignore_index=True)

In [62]:
# Load both corpora. Each loader returns a frame that already uses the
# ORIGINAL_COLUMN_ITA / TRANSLATE_COLUMN_ITA / TYPE_COLUMN schema.
df_ita = union_df_train_ita(data_filename, DATA_PATH_IN_ITA, 'ITA')
df_dante = union_df_train_dante(data_filename_dante, DATA_PATH_IN_DANTE, 'DANTE')

# Select through the output-schema names on both frames; ignore_index gives
# the combined frame a fresh 0..n-1 index.
output_columns = [ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA, TYPE_COLUMN]
df = pd.concat([df_ita[output_columns], df_dante[output_columns]], ignore_index=True)

# Write the combined training set (Type, Original, Translate) as UTF-8 CSV.
filename_path_out = os.path.abspath(os.path.join(DATA_PATH_OUT, file_out))

df.to_csv(filename_path_out, index=False, columns=[TYPE_COLUMN, ORIGINAL_COLUMN_ITA, TRANSLATE_COLUMN_ITA], encoding='utf-8')

File name                          : francese-italiano.tsv
Esempi nel Dataset                 : 82704
Frase più corta in Original        : Ah !
Frase più corta in Translate       : No?
Frase più lunga in Original        : De même que, en temps de guerre, officiers et soldats se sentent autorisés par l’opinion générale à commettre des actes qui, en temps de paix, sont tenus pour criminels, de même les révolutionnaires, dans leur lutte, se regardaient comme couverts par l’opinion de leur cercle, en vertu de laquelle les actes de cruauté qu’ils commettaient étaient nobles et moraux, étant commis par eux au prix de leur liberté, de leur vie, de tout ce qui est cher à la plupart des hommes. Ainsi s’expliquait, que des personnes excellentes, incapables non seulement de causer une souffrance, mais même d’en supporter la vue, pussent se préparer tranquillement à la violence et au meurtre, et professer la sainteté de tels actes, considérés comme moyens de défense, ou encore comme instrument uti

In [61]:
# Display the combined frame as the cell output.
df

Unnamed: 0,Original,Translate,Type
0,"De même que, en temps de guerre, officiers et ...","Così come in tempo di guerra, ufficiali e sold...",ITA
1,Étant donné que les non-natifs anglophones qui...,Dato che gli anglofoni non-madrelingua che par...,ITA
2,En prenant l'habitude de mettre et de lire des...,Abituandosi a mettere e leggere emoticon e alt...,ITA
3,Les États-Unis ont plusieurs fois justifié des...,Gli Stati Uniti hanno ripetutamente giustifica...,ITA
4,"Avec le nouveau Pape Innocent VIII, les Médici...","Con il nuovo pontefice, Innocenzo VIII, i Medi...",ITA
...,...,...,...
270353,et était dans une grande attente.,"E sì ne stava in sì gran sospezone,",DANTE
270354,que tous ses gens regardaient.,Che•lla sua giente tuttor vi veghiava.,DANTE
270355,Belle compréhension gardée en garde à vue.,"Bella coglienza ne tenne in pregione,",DANTE
270356,car il n'avait pas confiance en elle.,"Perch’ella punto in lei non si fidava,",DANTE


In [None]:
# Print the sanity report (row count, shortest/longest sentences) for the combined frame.
verifica_correttezza(df)

Esempi nel Dataset                 : 270358
Frase più corta in Original        : Ja.
Frase più corta in Translate       : No?
Frase più lunga in Original        : There is no such thing, at this stage of the world’s history in The United States of America, as an independent press. You know it and I know it. There is not one of you who dare write your honest opinions, and if you did, you know beforehand that it would never appear in print. I am paid weekly for keeping my honest opinions out of the paper I am connected with. Others of you are paid similar salaries for similar things, and any of you who would be foolish as to write honest opinions would be out on the streets looking for another job. If I allowed my honest opinions to appear in one issue of my papers, before twenty-four hours my occupation would be gone. The business of the journalist is to destroy the truth, to lie outright, to pervert, to vilify, to fawn at the feet of Mammon, and to sell his country and his race for his