In [28]:
from typing import Sequence, List, Union, Dict
from deep_translator import GoogleTranslator

import pandas as pd
import numpy as np
import pathlib
import re
import random

In [29]:
class DataTranslator(object):
    """
    Thin wrapper around ``deep_translator.GoogleTranslator``.

    The wrapped translator is configured with ``source="auto"`` and
    ``target="en"``, i.e. every text is translated into English.
    """

    def __init__(self) -> None:
        self._translator = GoogleTranslator(source="auto", target="en")

    def translate(self, text_batch: Sequence[str]) -> List[str]:
        """
        Translate a batch of texts.

        Parameters
        ----------
        text_batch : list, tuple or set of str
            Texts to translate. Any other container type is rejected.

        Returns
        -------
        list of str
            Whatever the backend returns for the batch; an empty list when
            ``text_batch`` is empty.

        Raises
        ------
        ValueError
            If ``text_batch`` is not a list, tuple or set.
        """
        if not isinstance(text_batch, (list, tuple, set)):
            raise ValueError("An iterable it's expected")

        # Materialise once so the backend always receives a real list,
        # whatever accepted container the caller used.
        texts = list(text_batch)

        # Nothing to translate: skip the backend call entirely
        # (deep_translator rejects an empty batch with an exception).
        if not texts:
            return []

        return self._translator.translate_batch(batch=texts)

In [30]:
def _prepare_comments(
    data: pd.DataFrame,
    size_of_batch: int,
    random_state: Union[int, None] = None,
) -> pd.DataFrame:
    """Clean one raw survey frame and draw the rows whose comments get translated."""
    prepared = data.copy()
    # Normalise the headers first so 'nps' and 'comment' resolve whatever
    # their capitalisation in the CSV.
    prepared.columns = prepared.columns.str.lower()

    # Keep only the rows that carry a rating.
    prepared = prepared[prepared['nps'].notna()].copy()
    # Auxiliary string version of the rating (kept in the output).
    prepared['labels'] = prepared['nps'].astype(str)

    # NOTE(review): the rating itself is used as the sampling weight, so rows
    # with nps == 0 can never be drawn and high scores are over-represented.
    # Kept as-is; confirm this is intended rather than a stratified sample.
    weights = prepared['nps'].astype(float)
    # Without replacement only rows with a positive weight can be drawn, so
    # clamp the request locally instead of raising (and without touching the
    # caller's size_of_batch).
    drawable = int((weights > 0).sum())
    n_rows = min(size_of_batch, drawable)
    if n_rows > 0:
        prepared = prepared.sample(
            n=n_rows, weights=weights, random_state=random_state
        )
    else:
        prepared = prepared.iloc[0:0]

    # Drop the rows without a comment.
    prepared = prepared[prepared['comment'].notna()].copy()
    # Handle the datatypes.
    prepared['comment'] = prepared['comment'].astype(str).str.strip()
    # Filter out comments that consist of digits only.
    pattern = '^[0-9]+$'
    is_numeric = prepared['comment'].str.contains(pattern).astype(bool)
    prepared = prepared.loc[~is_numeric]

    # Re-index AFTER all filtering so rows line up positionally with the
    # translated batch.
    return prepared.reset_index(drop=True)


def data_pipeline(
    folder: str,
    size_of_batch: int,
    random_state: Union[int, None] = None,
) -> Dict[str, pd.DataFrame]:
    """
    Sample, clean and translate the comments of every CSV under ``folder``.

    Parameters
    ----------
    folder : str
        Directory searched recursively for ``*.csv`` files. Each file is read
        as ISO-8859-1 and must contain ``nps`` and ``comment`` columns
        (matched case-insensitively).
    size_of_batch : int
        Maximum number of rated rows sampled from each file. Files with fewer
        drawable rows contribute what they have.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.
        ``None`` (the default) keeps the draw unseeded.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Keyed by file name. Each frame holds the cleaned sample (lower-cased
        columns plus ``labels``) with a ``comment_translated`` column. Files
        sharing a name in different sub-folders overwrite each other.
    """
    root = pathlib.Path(folder)

    # One translator instance serves every file.
    translator = DataTranslator()

    # Dict of pd.DataFrames.
    dataframes: Dict[str, pd.DataFrame] = {}

    # Iterate over the matched paths themselves so nested files are read from
    # where they actually live.
    for csv_path in sorted(root.glob('**/*.csv')):
        raw = pd.read_csv(csv_path, encoding='iso-8859-1')
        prepared = _prepare_comments(raw, size_of_batch, random_state)

        # Text batch.
        text_batch = prepared['comment'].to_list()
        # Skip the translator when no comment survived the cleaning.
        if text_batch:
            text_translated = translator.translate(text_batch=text_batch)
        else:
            text_translated = []

        # Both sides carry a fresh 0..n-1 index, so the join is positional.
        translated_data = pd.DataFrame(data={'comment_translated': text_translated})
        dataframes[csv_path.name] = prepared.join(translated_data)

    return dataframes

In [31]:
# Run the pipeline on the test-data folder with a sample size of 500 per CSV.
data_translated = data_pipeline(
    folder="/home/aargaez/downloads/Datathon/data_test",
    size_of_batch=500,
)

KeyboardInterrupt: 

In [None]:
# Write every translated frame to the working directory as 'Translated_<label>',
# leaving the index out of the file.
for label, data in data_translated.items():
    data.to_csv(
        path_or_buf=f'Translated_{label}',
        index=False,
    )

In [None]:
# Load the translated and the original 'Manage my booking' exports (both ISO-8859-1).
english_data = pd.read_csv('../data-translated/Manage_my_booking.csv', encoding='iso-8859-1')
original_data = pd.read_csv('../data/Manage_my_booking.csv', encoding='iso-8859-1')
# Index-aligned, side-by-side view of the first 2500 original rows.
# DataFrame.join raises ValueError when both frames share column names and no
# suffix is given, so disambiguate explicitly.
original_data.iloc[0:2500].join(english_data, lsuffix='_left', rsuffix='_right')

In [None]:
# Side-by-side view of the first 2500 original rows and the translated rows.
# Suffixes are required for join to accept frames with overlapping column
# names; they match the '_left'/'_right' headers of the recorded output.
original_data.iloc[0:2500].join(english_data, lsuffix='_left', rsuffix='_right')

Unnamed: 0,Date_left,nps_left,Comment_left,Date_right,nps_right,Comment_right
0,02-01-23 01:00,10,Rapidez y facil accesibilidad,2/28/2023 16:30,0,No puede ser posible que un cambio de vuelos c...
1,02-01-23 01:17,9,por bueno,2/28/2023 19:19,1,Ya tenia un asiento asignado y tramposamente l...
2,02-01-23 01:43,10,"MUY FACIL DE CONSEGUIR LA INFORMACION, BIEN EX...",2/28/2023 16:30,0,No puede ser posible que un cambio de vuelos c...
3,02-01-23 02:08,5,ESTA MUY MAL LA PAGINA PORQUE NO ME DEJO HACER...,2/28/2023 20:40,8,no agregando las cosas automáticamente y elimi...
4,02-01-23 02:41,6,Selección de asientos,2/28/2023 19:48,8,Es importante que cuando ya se pagaron ciertos...
...,...,...,...,...,...,...
4995,4/15/2023 4:03,10,Porque siempre estan en comunicación,2/28/2023 16:30,0,No puede ser posible que un cambio de vuelos c...
4996,4/15/2023 4:36,10,Hasta este momento me han dado un buen servicio,2/28/2023 16:30,0,No puede ser posible que un cambio de vuelos c...
4997,4/15/2023 4:41,10,Por lo fácil que es tener el acceso y la infor...,2/28/2023 16:30,0,No puede ser posible que un cambio de vuelos c...
4998,4/15/2023 4:52,3,precios,2/28/2023 23:32,7,SON DEMASIADOS CARGOS ADICIONALES


In [None]:
# Load the two 'Checking' exports.
english_data = pd.read_csv('/home/aargaez/downloads/Datathon/data-cleaning/Checking.csv')
original_data = pd.read_csv('/home/aargaez/downloads/Datathon/data-english/Checking.csv', encoding='iso-8859-1')
# Sort by 'Date' and renumber the rows in one chain (no in-place mutation).
# NOTE(review): 'Date' is sorted with whatever dtype read_csv inferred; if it
# is plain text the order is lexical, not chronological — confirm.
original_data = original_data.sort_values(by=['Date']).reset_index(drop=True)
# Index-aligned view of the first 2000 sorted rows; suffixes keep join from
# raising ValueError when the two frames share column names.
original_data.iloc[0:2000].join(english_data, lsuffix='_left', rsuffix='_right')