In [1]:
!pip install transformers
!pip install sacrebleu sentencepiece
!pip install huggingface_hub
!pip install sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 43.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 42.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import re
import glob
import pandas as pd
from natsort import natsorted

from huggingface_hub import notebook_login
from transformers import pipeline

In [4]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): the checkpoints loaded below look like public models, so this
# login may be optional - confirm before removing.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


## Traduzione

Configuro i modelli Hugging Face per la traduzione da inglese, francese e tedesco in italiano.

In [5]:
# Checkpoint for English -> Italian translation
en_model_checkpoint = "Helsinki-NLP/opus-mt-en-it"
# Checkpoint for French -> Italian translation
fr_model_checkpoint = "Helsinki-NLP/opus-tatoeba-fr-it"
# Checkpoint for German -> Italian translation
de_model_checkpoint = "Helsinki-NLP/opus-mt-de-it"

# Build one translation pipeline per source language, loading them in the
# same order (English, French, German) as the checkpoints above.
translator_en, translator_fr, translator_de = (
    pipeline("translation", model=checkpoint)
    for checkpoint in (en_model_checkpoint, fr_model_checkpoint, de_model_checkpoint)
)

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/343M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/203M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/819k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/821k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/302M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Configurazione classe per traduzione

In [6]:
class Translate:
  """Translate line-aligned text files into Italian and save them as CSV.

  Files matching ``orig_filenamepath`` and ``trad_filenamepath`` are paired in
  natural sort order. For every pair, each line of the file to translate is
  passed through ``translator`` and written next to the line with the same
  position in the original file.

  Args:
    orig_filenamepath: Glob pattern of the original text files.
    trad_filenamepath: Glob pattern of the text files to translate.
    translator: Callable taking a list of strings and returning a list of
      dicts that each carry a ``'translation_text'`` entry.
    path_out: Prefix of the output paths; the file number and ``'.csv'`` are
      appended to it.
  """

  def __init__(self,
               orig_filenamepath,
               trad_filenamepath,
               translator,
               path_out):
    self.orig_filenamepath = orig_filenamepath
    self.trad_filenamepath = trad_filenamepath

    self.translator = translator

    self.path_out = path_out

  def text_cleaning(self, text):
    """Join text fragments and split the result into sentence-like pieces.

    Args:
      text: Iterable of strings; the items are joined with single spaces.

    Returns:
      List of strings obtained by removing every ``-``, removing every
      parenthesised span, and splitting on ``.``, ``!``, ``?`` and ``;``.
      The terminators are dropped; pieces are not stripped and may be empty.
    """
    text = re.sub('-', '', ' '.join(text))
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.split('[.!?;]', text)

    return text

  @staticmethod
  def _read_lines(path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped."""
    # Read the file directly instead of going through a CSV parser so every
    # line stays a plain string (no NaN or numeric coercion of the content).
    with open(path, encoding='utf-8') as handle:
      stripped = (line.strip() for line in handle)
      return [line for line in stripped if line]

  def translate(self):
    """Translate every matched file and write one CSV per file.

    Each output CSV has the columns ``Original`` and ``Translate_IT`` and one
    row per line of the original file. It is written to
    ``path_out + <number> + '.csv'``, where ``<number>`` is the text after the
    last ``_`` in the translated file's name (before the first ``.``).

    A warning is emitted when the two glob patterns match a different number
    of files (the extra files are ignored) or when a file pair has a different
    number of lines (missing translations are left empty, surplus ones are
    dropped).
    """
    import warnings

    orig_files = natsorted(glob.glob(self.orig_filenamepath))
    trad_files = natsorted(glob.glob(self.trad_filenamepath))

    if len(orig_files) != len(trad_files):
      warnings.warn(
          'Found %d original files but %d files to translate; '
          'unpaired files are ignored.' % (len(orig_files), len(trad_files)),
          stacklevel=2)

    for file_orig, file_trad in zip(orig_files, trad_files):
      orig_lines = self._read_lines(file_orig)
      trad_lines = self._read_lines(file_trad)

      # Nothing to send to the translator for an empty file.
      translated = self.translator(trad_lines) if trad_lines else []
      translated_lines = [item['translation_text'] for item in translated]

      if len(translated_lines) != len(orig_lines):
        warnings.warn(
            '%s has %d lines but %s produced %d translations; '
            'rows are aligned by position.'
            % (file_orig, len(orig_lines), file_trad, len(translated_lines)),
            stacklevel=2)

      # Build a fresh frame for every file: reusing one frame across files
      # keeps the first file's index, which silently truncates longer files
      # and pads shorter ones.
      df_trad = pd.DataFrame({'Original': orig_lines})
      df_trad['Translate_IT'] = (
          pd.Series(translated_lines, dtype=object).reindex(df_trad.index))

      number = os.path.basename(file_trad).split('.')[0].split('_')[-1]

      df_trad.to_csv(self.path_out + str(number) + '.csv', index=False, encoding='utf-8')

### Traduzione Paradiso

In [7]:
# Global parameters
# NOTE(review): both folders are relative, so os.path.abspath resolves them
# against the current working directory - confirm the notebook runs from the
# directory that contains the mounted `drive` folder.
root_folder = 'drive/MyDrive/Traduzione_Dantesca/Opere/Dante/'
data_folder_out = 'drive/MyDrive/Traduzione_Dantesca/data/'

# Original text: folder name and glob pattern of the source files
data_folder_name_original = 'Originale'
filenamepath_original = 'ORIG_DIVINA_COMMEDIA_PARADISO_*.txt'
DATA_PATH_ORIGINAL = os.path.abspath(os.path.join(root_folder, data_folder_name_original))
original_filenamepath = os.path.abspath(os.path.join(DATA_PATH_ORIGINAL, filenamepath_original))

# Translation data: folder name and one glob pattern per translation set
# NOTE(review): the French and German patterns both start with TRAD_4 -
# confirm this matches the file names on disk.
data_folder_name_traduzione = 'Traduzione'
filenamepath_traduzione_en = 'TRAD_3_DIVINA_COMMEDIA_PARADISO_EN_*.txt'
filenamepath_traduzione_fr = 'TRAD_4_DIVINA_COMMEDIA_PARADISO_FR_*.txt'
filenamepath_traduzione_de = 'TRAD_4_DIVINA_COMMEDIA_PARADISO_DE_*.txt'
filenamepath_traduzione_en_2 = 'TRAD_5_DIVINA_COMMEDIA_PARADISO_*.txt'
filenamepath_traduzione_en_3 = 'TRAD_6_DIVINA_COMMEDIA_PARADISO_*.txt'

DATA_PATH_TRANSLATE = os.path.abspath(os.path.join(root_folder, data_folder_name_traduzione))

# Absolute glob pattern for every translation set
(trad_filenamepath_en,
 trad_filenamepath_fr,
 trad_filenamepath_de,
 trad_filenamepath_en_2,
 trad_filenamepath_en_3) = (
    os.path.abspath(os.path.join(DATA_PATH_TRANSLATE, pattern))
    for pattern in (filenamepath_traduzione_en,
                    filenamepath_traduzione_fr,
                    filenamepath_traduzione_de,
                    filenamepath_traduzione_en_2,
                    filenamepath_traduzione_en_3)
)

# The three English sets
trad_filenamepath = [trad_filenamepath_en, trad_filenamepath_en_2, trad_filenamepath_en_3]

# Absolute output prefixes, one per translation set
path_out_1, path_out_2, path_out_3, path_out_4, path_out_5 = (
    os.path.abspath(os.path.join(data_folder_out, prefix))
    for prefix in ('trad_divina_commedia_paradiso_1_',
                   'trad_divina_commedia_paradiso_2_',
                   'trad_divina_commedia_paradiso_3_',
                   'trad_divina_commedia_paradiso_4_',
                   'trad_divina_commedia_paradiso_5_')
)

# Output prefixes paired with the three English sets above
path_out = [path_out_1, path_out_2, path_out_3]

In [16]:
# Run the English -> Italian pipeline over each English file set, writing the
# results under the output prefix paired with that set.
for trad_filename, file_path_out in zip(trad_filenamepath, path_out):
  translate_paradiso = Translate(
      orig_filenamepath=original_filenamepath,
      trad_filenamepath=trad_filename,
      translator=translator_en,
      path_out=file_path_out,
  )
  translate_paradiso.translate()

In [None]:
# Run the French -> Italian pipeline over the French file set (output prefix 4).
translate_paradiso = Translate(
    orig_filenamepath=original_filenamepath,
    trad_filenamepath=trad_filenamepath_fr,
    translator=translator_fr,
    path_out=path_out_4,
)
translate_paradiso.translate()

In [None]:
# Run the German -> Italian pipeline over the German file set (output prefix 5).
translate_paradiso = Translate(
    orig_filenamepath=original_filenamepath,
    trad_filenamepath=trad_filenamepath_de,
    translator=translator_de,
    path_out=path_out_5,
)
translate_paradiso.translate()