# 2. Translate the input products

## 2.1 Import the required libraries

In [7]:
from deep_translator import GoogleTranslator
from pandarallel import pandarallel
from tqdm import tqdm

import pandas as pd
import numpy as np
import os

# Enable tqdm's pandas integration.
tqdm.pandas()

# Initialise pandarallel (with a progress bar); `parallel_apply` is used
# further down to run the translation across worker processes.
pandarallel.initialize(progress_bar=True)

# Shared translator instance: source language is auto-detected, target is English.
translator = GoogleTranslator(source='auto', target='en')

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## 2.2 Load in the dataset

In [8]:
# Only load from the local example data when NOT running on Databricks;
# on Databricks this cell intentionally does nothing.
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    # Relative path to the typo-corrected products produced by the previous step
    input_file_location = "../../data/example_data/output/tilt_products_typo_corrected.csv"
    # Load the CSV into a DataFrame
    corrected_df = pd.read_csv(input_file_location)

## 2.3 Translate dataframe

### Helper functions

In [9]:
def translate_Google(text):
    """
    Translate a piece of text into English using the shared Google translator.

    Args:
        text (str): The text to translate.
    Returns:
        The value returned by ``translator.translate(text)``, or ``np.nan``
        when the translation raises an error.
    """
    try:
        return translator.translate(text)
    except Exception:
        # Best effort: a failed translation becomes NaN instead of aborting
        # the whole batch. Catching `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt / SystemExit propagate so a long-running
        # translation can still be interrupted.
        return np.nan

### Google translator module

In [11]:
def translate_df(df):
    """
    Translate the non-English rows of a dataframe into English using Google Translator.

    Rows whose "language (ISO-code)" value is present and different from "en"
    have their "typo_corrected" text passed through `translate_Google`. All
    other rows (English, or without a language code) are not translated.
    Wherever no translation is available (row not translated, or the
    translation returned NaN) the "typo_corrected" text is used instead.

    The returned frame lists the untranslated rows first, followed by the
    translated rows, with a fresh 0..n-1 index. The input frame is not modified.

    Args:
        df (pd.DataFrame): The dataframe to be translated. Must contain the
            columns "typo_corrected" and "language (ISO-code)".
    Returns:
        translated_df (pd.DataFrame): The translated dataframe. The columns
            "typo_corrected" and "language (ISO-code)" are dropped, an existing
            "products_and_services" column is renamed to
            "raw_products_and_services", and the translated text is exposed as
            "products_and_services".
    """
    print("Taking subset of non-english texts...")
    # Rows that need translating: a language code is present and it is not English.
    language = df["language (ISO-code)"]
    needs_translation = language.notna() & (language != "en")
    non_english_df = df[needs_translation].copy()
    # Everything else is kept as-is. Selecting with the negated mask (instead
    # of excluding by index label) keeps every row even when index labels
    # are duplicated.
    untranslated_df = df[~needs_translation]

    print("Applying translation...")
    if non_english_df.empty:
        # Nothing to translate: do not dispatch an empty Series to the
        # parallel workers, but still create the column used below.
        non_english_df["translated_text"] = pd.Series(dtype="object")
    else:
        non_english_df["translated_text"] = non_english_df["typo_corrected"].parallel_apply(translate_Google)

    # Stack the untranslated rows and the translated rows back into one frame.
    print("Merging the corrected english texts with the original df...\n")
    merged_df = pd.concat([untranslated_df, non_english_df], ignore_index=True)

    # Fall back to the typo-corrected text wherever there is no translation.
    # Assign the result back rather than calling fillna(inplace=True) on the
    # selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    merged_df["translated_text"] = merged_df["translated_text"].fillna(merged_df["typo_corrected"])

    translated_df = merged_df.drop(columns=["typo_corrected", "language (ISO-code)"]).rename(
        columns={
            "products_and_services": "raw_products_and_services",
            "translated_text": "products_and_services",
        }
    )
    return translated_df

### Provided input Data

In [12]:
# Translate the non-English rows of the typo-corrected products loaded above.
translated_df = translate_df(corrected_df)

Taking subset of non-english texts...
Applying translation...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=870), Label(value='0 / 870'))), HB…

Merging the corrected english texts with the original df...



## 2.4 Export the dataframe with the translated text

In [14]:
# Only write to the local example-data folder when NOT running on Databricks;
# on Databricks this cell intentionally does nothing.
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    output_path_translated_df = "../../data/example_data/output/tilt_products_translated.csv"
    # Called with default options, so the DataFrame index is written as the first CSV column.
    translated_df.to_csv(output_path_translated_df)