In [1]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import re
import glob
import pandas as pd
from natsort import natsorted
import unicodedata

### Variabili Globali

In [49]:
# Project root on the mounted Google Drive. It is absolute (anchored at the
# /content/drive mount point) so that the paths derived below do not depend on
# the notebook's current working directory.
root_folder = '/content/drive/MyDrive/BERT/'
data_folder_in = 'data/Traduzioni/'
data_folder_out = 'data/'

# Paths
DATA_PATH_IN = os.path.abspath(os.path.join(root_folder, data_folder_in))
DATA_PATH_OUT = os.path.abspath(os.path.join(root_folder, data_folder_out))

# Files used to build the training set
data_filename = ['francese-italiano.tsv',
                 'inglese-italiano.tsv',
                 'olandese-italiano.tsv',
                 'spagnolo-italiano.tsv',
                 'svedese-italiano.tsv',
                 'tedesco-italiano.tsv'
                 ]

# Column names
ORIGINAL_COLUMN = 'Original'
TRANSLATE_COLUMN = 'Translate'

# Name of the merged CSV written under DATA_PATH_OUT
file_out = 'train_data.csv'
# Maximum number of rows kept from each input file
NUM_SAMPLES = 20000

In [50]:
def verifica_correttezza(df, original_column='Original', translate_column='Translate'):
  """Print a quick sanity report about a sentence-pair DataFrame.

  The report contains the number of rows and, for each of the two text
  columns, the shortest and the longest sentence (by character count; the
  first one encountered wins on ties).

  Missing cells (NaN/None) are ignored when looking for the shortest and
  longest sentence, because ``len`` is not defined for them. If a column has
  no usable value at all, ``N/A`` is printed instead of raising.

  Args:
    df: DataFrame holding the two text columns.
    original_column: name of the source-language column.
    translate_column: name of the translated column.

  Returns:
    None. The report is written to standard output.

  Raises:
    KeyError: if one of the requested columns is not in ``df``.
  """
  # Drop missing cells and coerce the rest to str so that key=len is always valid.
  original = df[original_column].dropna().astype(str)
  translate = df[translate_column].dropna().astype(str)

  # Shown when a column is empty (or contains only missing values).
  placeholder = 'N/A'

  print(f'Esempi nel Dataset                 : {len(df)}')
  print(f'Frase più corta in Original        : {min(original, key=len, default=placeholder)}')
  print(f'Frase più corta in Translate       : {min(translate, key=len, default=placeholder)}')
  print(f'Frase più lunga in Original        : {max(original, key=len, default=placeholder)}')
  print(f'Frase più lunga in Translate       : {max(translate, key=len, default=placeholder)}')
  print('---------------------------------------------------------------------------------------')

In [51]:
def union_df_train():
  """Merge the longest sentence pairs of every input file into one DataFrame.

  For each name in ``data_filename`` (resolved against ``DATA_PATH_IN``) the
  tab-separated file is read without a header row, keeping only the fields at
  positions 1 and 3 as ``ORIGINAL_COLUMN`` and ``TRANSLATE_COLUMN``. The rows
  are ordered by the character length of the original sentence, longest
  first, and only the first ``NUM_SAMPLES`` rows are kept. A short report is
  printed for every file.

  Returns:
    pandas.DataFrame: the per-file selections stacked in file order with a
    fresh 0..n-1 index. When ``data_filename`` is empty, an empty frame that
    still has the two columns.
  """
  columns = [ORIGINAL_COLUMN, TRANSLATE_COLUMN]

  # Collect the per-file frames and concatenate once at the end:
  # DataFrame.append was removed in pandas 2.0 and copied the whole
  # accumulated frame on every iteration.
  frames = []

  for filename in data_filename:
    filename_path = os.path.abspath(os.path.join(DATA_PATH_IN, filename))

    df_train = pd.read_csv(
      filename_path,
      sep="\t",
      header=None,
      names=columns,
      usecols=[1,3]
    )

    # Longest original sentences first.
    df_train = df_train.sort_values(by=ORIGINAL_COLUMN,
                                    key=lambda x: x.str.len(),
                                    ascending=False)

    # Positional slice: keep at most NUM_SAMPLES rows per file.
    df_train = df_train.iloc[:NUM_SAMPLES]

    print(f'File name                          : {filename}')
    verifica_correttezza(df_train)

    frames.append(df_train)

  # pd.concat raises on an empty list; keep returning an empty, typed frame.
  if not frames:
    return pd.DataFrame(columns=columns)

  return pd.concat(frames, ignore_index=True)

In [52]:
# Destination of the merged training set.
filename_path_out = os.path.abspath(os.path.join(DATA_PATH_OUT, file_out))

# Load every input file and merge the selected rows.
df = union_df_train()

# Persist only the two text columns, without the index, as UTF-8 CSV.
df.to_csv(
  filename_path_out,
  columns=[ORIGINAL_COLUMN, TRANSLATE_COLUMN],
  index=False,
  encoding='utf-8',
)

File name                          : francese-italiano.tsv
Esempi nel Dataset                 : 20000
Frase più corta in Original        : Quelle est la durée de la garantie ?
Frase più corta in Translate       : Sta pranzando.
Frase più lunga in Original        : De même que, en temps de guerre, officiers et soldats se sentent autorisés par l’opinion générale à commettre des actes qui, en temps de paix, sont tenus pour criminels, de même les révolutionnaires, dans leur lutte, se regardaient comme couverts par l’opinion de leur cercle, en vertu de laquelle les actes de cruauté qu’ils commettaient étaient nobles et moraux, étant commis par eux au prix de leur liberté, de leur vie, de tout ce qui est cher à la plupart des hommes. Ainsi s’expliquait, que des personnes excellentes, incapables non seulement de causer une souffrance, mais même d’en supporter la vue, pussent se préparer tranquillement à la violence et au meurtre, et professer la sainteté de tels actes, considérés comme moyens

In [53]:
# Display the merged dataset.
df

Unnamed: 0,Original,Translate
0,"De même que, en temps de guerre, officiers et ...","Così come in tempo di guerra, ufficiali e sold..."
1,Étant donné que les non-natifs anglophones qui...,Dato che gli anglofoni non-madrelingua che par...
2,En prenant l'habitude de mettre et de lire des...,Abituandosi a mettere e leggere emoticon e alt...
3,Les États-Unis ont plusieurs fois justifié des...,Gli Stati Uniti hanno ripetutamente giustifica...
4,"Avec le nouveau Pape Innocent VIII, les Médici...","Con il nuovo pontefice, Innocenzo VIII, i Medi..."
...,...,...
95114,Er ist ein guter Sänger.,Lui è un bravo cantante.
95115,Ich bin noch zu schwach.,Sono ancora troppo debole.
95116,Du bist nicht ersetzbar.,Non sei sostituibile.
95117,Wir haben einen Vertrag.,Abbiamo un contratto.


In [54]:
# Print the sanity report (row count, shortest/longest sentences) for the merged dataset.
verifica_correttezza(df)

Esempi nel Dataset                 : 95119
Frase più corta in Original        : Ja.
Frase più corta in Translate       : Sì.
Frase più lunga in Original        : There is no such thing, at this stage of the world’s history in The United States of America, as an independent press. You know it and I know it. There is not one of you who dare write your honest opinions, and if you did, you know beforehand that it would never appear in print. I am paid weekly for keeping my honest opinions out of the paper I am connected with. Others of you are paid similar salaries for similar things, and any of you who would be foolish as to write honest opinions would be out on the streets looking for another job. If I allowed my honest opinions to appear in one issue of my papers, before twenty-four hours my occupation would be gone. The business of the journalist is to destroy the truth, to lie outright, to pervert, to vilify, to fawn at the feet of Mammon, and to sell his country and his race for his 