In [None]:
from typing import Sequence, List, Union, Dict
from deep_translator import GoogleTranslator

import pandas as pd
import numpy as np
import pathlib
import re
import random

In [None]:
class DataTranslator(object):
    """
    Thin wrapper around ``deep_translator.GoogleTranslator``.

    The translator is configured once, with automatic source-language
    detection (``source="auto"``) and English (``target="en"``) as the
    target language.
    """

    def __init__(self) -> None:
        self._translator = GoogleTranslator(source="auto", target="en")

    def translate(self, text_batch: Sequence[str]) -> List[str]:
        """
        Translate a batch of texts to English.

        Args:
            text_batch: A ``list``, ``tuple`` or ``set`` of strings. Any other
                type (including a bare ``str`` or a generator) is rejected.

        Returns:
            The value returned by the backend's ``translate_batch`` call for
            the given texts. An empty batch yields an empty list and the
            backend is not called at all.

        Raises:
            ValueError: If ``text_batch`` is not a list, tuple or set.
        """
        if not isinstance(text_batch, (list, tuple, set)):
            raise ValueError("An iterable it's expected")

        # Nothing to translate: return early instead of forwarding an empty
        # batch to the backend (deep_translator rejects empty batches with a
        # generic Exception) and avoid a pointless remote call.
        if not text_batch:
            return []

        # Hand the backend a real list regardless of the container we received.
        translated_batch = self._translator.translate_batch(
            batch=list(text_batch)
        )

        return translated_batch

In [None]:
def _sample_and_clean_comments(csv_path: pathlib.Path, size_of_batch: int) -> pd.DataFrame:
    """Load one CSV, sample at most ``size_of_batch`` rated rows and clean the comments."""
    raw = pd.read_csv(csv_path, encoding='iso-8859-1')

    # Keep only rows that have a rating. ``.copy()`` gives an independent
    # frame so adding a column does not write into a slice of ``raw``.
    rated = raw[raw['Rating'].notna()].copy()
    # Auxiliary string labels, also used as the sampling weights below.
    rated['labels'] = rated['Rating'].astype(str)

    # Only sample when the file holds more rows than requested; asking
    # ``sample`` for more rows than exist raises a ValueError, so smaller
    # files are simply taken whole.
    # NOTE(review): pandas coerces the string labels to floats when they are
    # used as weights, so higher ratings are drawn more often -- confirm this
    # bias is intended.
    if len(rated) > size_of_batch:
        rated = rated.sample(n=size_of_batch, weights='labels')

    # Normalise the column names to lower case.
    rated = rated.rename(columns=str.lower)

    # Drop the rows without a comment, then normalise the rest to stripped strings.
    commented = rated[rated['comment'].notna()].copy()
    commented['comment'] = commented['comment'].astype(str).str.strip()

    # Discard the comments made up exclusively of digits.
    pattern = '^[0-9]+$'
    digits_only = commented['comment'].str.contains(pattern).astype(bool)

    # Reset the index only AFTER all the filtering so that it is contiguous
    # and lines up positionally with the list of translations.
    return commented[~digits_only].reset_index(drop=True)


def data_pipeline(folder: str, size_of_batch: int) -> Dict[str, pd.DataFrame]:
    """
    Sample, clean and translate the comments of every CSV found under ``folder``.

    Each CSV (searched recursively) must provide a ``Rating`` column and a
    column whose lower-cased name is ``comment``. For every file the pipeline:

    1. drops the rows without a rating and adds a string ``labels`` column;
    2. draws a weighted random sample of ``size_of_batch`` rows (files with
       fewer rated rows are used in full);
    3. lower-cases the column names, drops empty comments, strips whitespace
       and removes the comments consisting only of digits;
    4. translates the remaining comments and stores them in a new
       ``comment_translated`` column, aligned row by row with ``comment``.

    Args:
        folder: Directory that is searched recursively for ``*.csv`` files.
        size_of_batch: Maximum number of rows sampled (and translated) per
            file. The same limit is applied to every file.

    Returns:
        A dict mapping each CSV file name (without its directory) to the
        translated DataFrame. Files sharing a name in different subfolders
        overwrite each other; the last one processed wins.

    Raises:
        KeyError: If a file lacks the ``Rating`` or ``comment`` column.
    """
    root = pathlib.Path(folder)

    # Work with the full paths returned by the recursive search; looking the
    # bare file name up again in ``root`` fails for files inside subfolders.
    csv_paths = sorted(p for p in root.glob('**/*.csv') if p.is_file())

    # Dict of pd.DataFrames, keyed by file name.
    dataframes: Dict[str, pd.DataFrame] = {}
    # Created lazily and shared by all the files.
    translator = None

    for csv_path in csv_paths:
        cleaned = _sample_and_clean_comments(csv_path, size_of_batch)

        # ``cleaned`` never holds more than ``size_of_batch`` rows, so the
        # whole frame forms the batch and the parameter is never modified.
        text_batch = cleaned['comment'].to_list()

        if text_batch:
            if translator is None:
                translator = DataTranslator()
            text_translated = translator.translate(text_batch=text_batch)
        else:
            # Nothing survived the cleaning: skip the remote call entirely.
            text_translated = []

        # Positional assignment keeps every translation next to its comment.
        dataframes[csv_path.name] = cleaned.assign(
            comment_translated=text_translated
        )

    return dataframes

In [None]:
# Run the pipeline over the CSV files of the test-data folder with a batch
# size of 100; the result maps each file name to its translated DataFrame.
data_translated = data_pipeline(
    size_of_batch=100,
    folder="/home/aargaez/downloads/Datathon/data_test",
)

In [None]:
# data_translated['Feed_back_button.csv']

In [None]:
# Write each translated frame to the working directory as
# "Translated_<original file name>", leaving the index out of the file.
for label, data in data_translated.items():
    data.to_csv(path_or_buf=f'Translated_{label}', index=False)