In [1]:
from typing import Sequence, List, Union, Dict
from deep_translator import GoogleTranslator

import pandas as pd
import numpy as np
import pathlib
import re

In [2]:
class DataTranslator(object):
    """
    Translate batches of text into English.

    Thin wrapper around ``deep_translator.GoogleTranslator`` configured with
    automatic source-language detection (``source="auto"``) and English as the
    target language (``target="en"``).
    """

    def __init__(self) -> None:
        # One translator instance is created here and reused for every batch.
        self._translator = GoogleTranslator(source="auto", target="en")

    def translate(self, text_batch: Sequence[str]) -> List[str]:
        """
        Translate every string in ``text_batch``.

        Parameters
        ----------
        text_batch : list, tuple or set of str
            Texts to translate. Other types (including a bare ``str`` or a
            generator) are rejected. A ``set`` has no defined order, so the
            order of the returned translations is only meaningful for lists
            and tuples.

        Returns
        -------
        list of str
            Whatever the underlying ``translate_batch`` call returns for the
            batch; an empty list when ``text_batch`` is empty.

        Raises
        ------
        ValueError
            If ``text_batch`` is not a list, tuple or set.
        """
        if not isinstance(text_batch, (list, tuple, set)):
            raise ValueError(
                "text_batch must be a list, tuple or set of strings"
            )

        # Nothing to translate: answer locally instead of calling the backend.
        # NOTE(review): deep_translator is understood to raise a bare Exception
        # for an empty batch -- confirm against the installed version.
        if not text_batch:
            return []

        # Hand the backend a concrete list regardless of the container given.
        translated_batch = self._translator.translate_batch(
            batch=list(text_batch)
        )

        return translated_batch

In [3]:
def data_pipeline(
    folder: str,
    max_comments: Union[int, None] = 5000,
    translator: Union["DataTranslator", None] = None,
) -> Dict[str, pd.DataFrame]:
    """
    Load, clean and translate the comments of every CSV file under ``folder``.

    For each ``*.csv`` file found recursively under ``folder`` the function:

    1. reads it as ISO-8859-1 and lower-cases the column names,
    2. drops rows whose ``comment`` is missing,
    3. converts the comments to stripped strings,
    4. drops comments made only of the digits 0-9,
    5. sorts the rows by ``date``,
    6. translates at most ``max_comments`` comments.

    Parameters
    ----------
    folder : str
        Directory that is searched recursively for CSV files.
    max_comments : int or None, default 5000
        Maximum number of comments (taken from the start of the date-sorted
        list) sent to the translator per file. ``None`` means no limit.
    translator : object with a ``translate(text_batch=...)`` method, optional
        Translator to use. When omitted, a ``DataTranslator`` is created the
        first time a non-empty batch has to be translated.

    Returns
    -------
    dict of str to pandas.DataFrame
        Maps each CSV file name (without its directory) to a frame with a
        single ``comment`` column holding the translated texts. Files in
        different sub-folders that share a name overwrite each other; the
        last one visited wins.

    Raises
    ------
    KeyError
        If a file has no ``comment`` or ``date`` column after lower-casing.
    """
    root = pathlib.Path(folder)

    # Comments that consist solely of digits carry no text to translate.
    numeric_only = '^[0-9]+$'

    dataframes = {}

    # Iterate over the discovered paths themselves. Re-globbing by bare file
    # name only searches the top level and fails for files in sub-folders.
    # Sorting makes the processing order deterministic.
    for csv_path in sorted(root.glob('**/*.csv')):
        frame = pd.read_csv(csv_path, encoding='iso-8859-1')
        frame.columns = frame.columns.str.lower()

        # Copy the filtered rows so the column assignment below targets an
        # independent frame instead of a slice of ``frame``.
        comments = frame[frame['comment'].notna()].copy()
        comments['comment'] = comments['comment'].astype(str).str.strip()
        comments = comments[~comments['comment'].str.contains(numeric_only)]
        comments = comments.sort_values(by=['date']).reset_index(drop=True)

        text_batch = comments['comment'].to_list()[:max_comments]

        if text_batch:
            if translator is None:
                # Built once and reused for all remaining files.
                translator = DataTranslator()
            text_translated = translator.translate(text_batch=text_batch)
        else:
            # Nothing survived the cleaning; skip the translator call entirely.
            text_translated = []

        dataframes[csv_path.name] = pd.DataFrame(
            data={'comment': text_translated}
        )

    return dataframes

In [4]:
# Folder holding the CSV files to translate.
source_folder = "/home/aargaez/downloads/Datathon/data-english"
# Run the load/clean/translate pipeline; the result maps file name -> translated frame.
data_translated = data_pipeline(folder=source_folder)

In [5]:
# Write each translated frame to a CSV named after its dictionary key
# (a relative name, so it lands in the current working directory).
for output_name, translated_frame in data_translated.items():
    translated_frame.to_csv(output_name, index=False)

In [None]:
# NOTE(review): `results` is first assigned in a later cell
# (`results = translator.translate(...)`), so on a fresh top-to-bottom run this
# cell raises NameError. It also rebinds `data_translated` from the dict
# returned by data_pipeline to a single DataFrame.
data_translated = pd.DataFrame(data={'comment': results})

In [None]:
# Location of the raw "Checking" CSV export.
checking_csv = r"/home/aargaez/downloads/Datathon/data/Checking.csv"
# Read it as ISO-8859-1 text.
data = pd.read_csv(checking_csv, encoding='iso-8859-1')

In [None]:
# Lower-case the column names so the 'comment' and 'date' lookups below match.
data.columns = data.columns.str.lower()
# Keep only the rows that have a comment. Copy the result so the column
# assignment below targets an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
data = data[data['comment'].notna()].copy()
# Convert every comment to a string and trim surrounding whitespace.
data['comment'] = data['comment'].astype(str).str.strip()
# Comments made only of the digits 0-9 carry no text to translate.
pattern = '^[0-9]+$'
# Filter out the digit-only comments.
data = data[~data['comment'].str.contains(pattern)]
# Sort the rows by date and renumber the index.
data = data.sort_values(by=['date']).reset_index(drop=True)
# Plain list of comments, ready to be sliced into translation batches.
text_batch = data['comment'].to_list()

In [None]:
# Create the translator used by the manual slice-translation cell that follows.
translator = DataTranslator()

In [None]:
# Translate only the comments at positions 1000-1999 of the cleaned batch.
batch_start, batch_stop = 1000, 2000
results = translator.translate(text_batch=text_batch[batch_start:batch_stop])

In [None]:
# Wrap the translated strings in a single-column frame.
translated = pd.DataFrame({'comment': results})

In [None]:
# Save the translated slice without the index column; the relative file name
# resolves against the current working directory.
translated.to_csv('Checking_3.csv', index=False)