# 2. Translate the input products

## 2.1 Import the required libraries

In [0]:
import pandas as pd
import numpy as np
import os

from deep_translator import GoogleTranslator
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used when translating
# the dataframe below) is available and shows a progress bar.
tqdm.pandas()

# Shared translator instance: the source language is auto-detected and the
# target language is English.
translator = GoogleTranslator(source='auto', target='en')

## 2.2 Load in the dataset

In [0]:
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    # Local run: the typo-corrected CSV files live in the repository's data folder
    base_file_location = "../../data/example_data/output/base_data/base_typo_corrected_products.csv"
    italy_file_location = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

    # Load both datasets straight into pandas DataFrames
    base_corrected_df = pd.read_csv(base_file_location)
    italy_corrected_df = pd.read_csv(italy_file_location)

else:
    # Databricks run: the typo-corrected CSV files live in the Azure storage container
    base_file_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/base_data/base_typo_corrected_products.csv"
    italy_file_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/italy_data/italy_typo_corrected_products.csv"

    # Read the base data with Spark (header row, inferred schema) and convert it to pandas
    base_corrected_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(base_file_location)
        .toPandas()
    )
    # Read the new Italy data the same way
    italy_corrected_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(italy_file_location)
        .toPandas()
    )

## 2.3 Translate dataframe

### Helper functions

In [0]:
def translate_Google(text):
    """
    This function translates the text into English using Google Translator

    Args:
        text (str): The text to be translated.
    Returns:
        The translated text, or np.nan when the input is not a string or the
        translation fails.
    """
    # Missing cells arrive as NaN/None rather than strings; there is nothing
    # to translate, so skip the translator call instead of relying on it to fail.
    if not isinstance(text, str):
        return np.nan

    try:
        return translator.translate(text)
    except Exception:
        # Best effort: a failed translation is reported as NaN so the caller
        # can fall back to the untranslated text. `Exception` (instead of a
        # bare `except`) keeps KeyboardInterrupt/SystemExit working, so a long
        # translation run can still be interrupted.
        return np.nan

### Google translator module

In [0]:
def translate_df(df):
    """
    This function translates the dataframe into English using Google Translator

    Rows whose "language (ISO-code)" is set and differs from "en" have their
    "typo_corrected" text translated; all other rows keep their
    "typo_corrected" text unchanged. When a translation fails, the
    "typo_corrected" text is used as a fallback.

    Args:
        df (pd.DataFrame): The dataframe to be translated. It must contain the
            columns "products_and_services", "typo_corrected",
            "language (ISO-code)" and "to_process".
    Returns:
        translated_df (pd.DataFrame): The translated dataframe. The original
            "products_and_services" column is renamed to
            "raw_products_and_services" and the translated text is stored in
            "products_and_services". Rows that were not translated come first,
            followed by the translated rows, with a fresh index.
    """
    print("Taking subset of non-english texts...")
    # Only rows with a known, non-English language code need translation
    language = df["language (ISO-code)"]
    needs_translation = language.notna() & (language != "en")

    # Split on the boolean mask itself (not on index labels) so that duplicate
    # index labels cannot cause rows to be dropped from either part
    non_english_df = df[needs_translation].copy()
    english_df = df[~needs_translation].copy()
    # Rows that are not translated simply reuse the typo corrected text
    english_df["translated_text"] = english_df["typo_corrected"]

    print("Applying translation...")
    parts = [english_df]
    # Skip the translation step entirely when there is nothing to translate
    if not non_english_df.empty:
        # `progress_apply` requires `tqdm.pandas()` to have been called
        translated = non_english_df["typo_corrected"].progress_apply(translate_Google)
        # Failed translations come back as NaN: fall back to the typo corrected text
        non_english_df["translated_text"] = translated.fillna(non_english_df["typo_corrected"])
        parts.append(non_english_df)

    # Put the untouched rows and the translated rows back together
    print("Merging the corrected english texts with the original df...\n")
    df = pd.concat(parts, ignore_index=True)

    # Drop the intermediate columns and expose the translated text under the
    # original column name, keeping the raw input alongside it
    translated_df = df.drop(
        columns=["typo_corrected", "language (ISO-code)", "to_process"]
    ).rename(
        columns={
            "products_and_services": "raw_products_and_services",
            "translated_text": "products_and_services",
        }
    )
    return translated_df

### Base Data

In [0]:
# Translate the non-English product texts of the base dataset into English
base_translated_df = translate_df(base_corrected_df)

### New Italy Data

In [0]:
# Translate the non-English product texts of the new Italy dataset into English
italy_translated_df = translate_df(italy_corrected_df)

## 2.4 Export the dataframe with the translated text

In [0]:
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    # Local run: write the translated dataframes next to the other example data
    output_path_base_translated = "../../data/example_data/output/base_data/base_translated_products.csv"
    output_path_italy_translated = "../../data/example_data/output/italy_data/italy_translated_products.csv"

    # NOTE(review): `to_csv` also writes the dataframe index as an extra
    # unnamed column, whereas the Spark branch below writes no index column.
    # Confirm whether downstream steps expect that column.
    base_translated_df.to_csv(output_path_base_translated)
    italy_translated_df.to_csv(output_path_italy_translated)
else:
    # Databricks run: the translated datasets go to the Azure storage container
    output_path_base_translated = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/base_data/base_translated_products.csv"
    output_path_italy_translated = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/italy_data/italy_translated_products.csv"

    # Spark writes the files, so turn the pandas dataframes into Spark dataframes first
    base_translated_spark = spark.createDataFrame(base_translated_df)
    italy_typo_corrected_spark = spark.createDataFrame(italy_translated_df)

    # Write both datasets with a header row, replacing any previous output
    base_translated_spark.write.csv(
        output_path_base_translated,
        mode="overwrite",
        header=True,
    )
    italy_typo_corrected_spark.write.csv(
        output_path_italy_translated,
        mode="overwrite",
        header=True,
    )