# 2. Translate the input products

## 2.1 Import the required libraries

In [12]:
from deep_translator import GoogleTranslator
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's pandas integration so `progress_apply` is available on Series
tqdm.pandas()

# Single translator instance shared by the helper functions below:
# source language set to 'auto', target language English ('en')
translator = GoogleTranslator(target='en', source='auto')

## 2.2 Load in the dataset

### Base data

In [16]:
# Relative path of the CSV that holds the typo-corrected base products
base_file_location = "../../data/example_data/output/base_data/base_typo_corrected_products.csv"

# Load the CSV into a DataFrame
base_corrected_df = pd.read_csv(filepath_or_buffer=base_file_location)

# Render the loaded frame (pandas truncates the middle rows)
display(base_corrected_df)

Unnamed: 0,typo_corrected
0,transfer in spain
1,coach hire france
2,ecological packaging
3,laser ablation
4,plastic litter bins
...,...
32058,asbestos
32059,fruit jelly
32060,precision weights for scales
32061,manicure scissors


In [18]:
# Normalise the product text: lower-case it and strip one trailing full stop.
# The pattern is a raw string and `regex=True` is passed explicitly: since
# pandas 2.0 `Series.str.replace` treats the pattern as a literal by default,
# in which case the two-character-plus-dollar literal never matches and the
# trailing period is silently left in place.
base_corrected_df["typo_corrected"] = (
    base_corrected_df["typo_corrected"]
    .str.lower()
    .str.replace(r"\.$", "", regex=True)
)

  italy_corrected_df["typo_corrected"] = italy_corrected_df["typo_corrected"].str.lower().str.replace("\.$", "")


### new Italy data

In [20]:
# Write the base frame back to the same CSV it was loaded from (this overwrites
# the file); the row index is not written.
base_corrected_df.to_csv(path_or_buf=base_file_location, index=False)

In [14]:
# Relative path of the CSV that holds the typo-corrected new Italy products
italy_file_location = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

# Load the CSV into a DataFrame
italy_corrected_df = pd.read_csv(filepath_or_buffer=italy_file_location)

# Render the loaded frame (pandas truncates the middle rows)
display(italy_corrected_df)

Unnamed: 0,products_and_services,products_id,to_process,language (ISO-code),typo_corrected
0,rossoro,2b898f73-e699-4b52-926d-476b51dacd42,True,it,rossoro
1,commerciante,4d813cfb-e2ac-4458-a8c4-6ed785df7d5e,True,pt,commerciante
2,calabrian cuisine,f8592b25-a1ed-41dd-818e-b77b17a09589,True,nl,calabrian cuisine
3,sell bevarage,29a6fd6c-b361-467f-b5f4-8ee75e254ba6,True,hi,sell bevarage
4,grossista di fiori di canapa e derivati,ab9850c2-cd93-418e-a0b7-4e7a6a664f45,True,it,grossista di fiori di canapa e derivati
...,...,...,...,...,...
3430,enamels,77890bab-e5c2-40d1-bcf7-f11fb9ac8c09,False,,enamels
3431,breathable and mould resistant plastic coverings,d675d682-018e-4829-ac66-e61155d7a45e,False,,breathable and mould resistant plastic coverings
3432,quartz paints,e5f78c8c-b79e-4a12-8451-867349f09cc6,False,,quartz paints
3433,carpet,1f4814fc-759f-49b6-b2cc-9ba1c77582bd,False,,carpet


## 2.3 Translate dataframe

### Helper functions

In [5]:
def translate_Google(text):
    """
    Translate a single text into English using the module-level ``translator``.

    Args:
        text: The text to translate.
    Returns:
        The value returned by ``translator.translate(text)``, or ``np.nan``
        when the translation raises an error.
    """
    try:
        return translator.translate(text)
    except Exception:
        # Best-effort: a failed translation is reported as a missing value so a
        # bulk `apply` can continue. `Exception` (not a bare `except:`) is
        # caught on purpose so KeyboardInterrupt/SystemExit still propagate
        # and a long-running translation can be interrupted.
        return np.nan

### Google translator module

In [7]:
def translate_df(df):
    """
    Translate the non-English rows of a dataframe into English.

    Rows whose "language (ISO-code)" is present and different from "en" are
    passed through ``translate_Google``; every other row keeps its
    "typo_corrected" text, as does any row whose translation came back empty.

    The input must contain the columns "language (ISO-code)", "typo_corrected"
    and "to_process" (all three are read and/or dropped below). A
    "products_and_services" column, if present, is renamed to
    "raw_products_and_services".

    Args:
        df (pd.DataFrame): The dataframe to be translated. It is not modified.
    Returns:
        translated_df (pd.DataFrame): The translated dataframe with a fresh
            0..n-1 index. Rows that were not translated come first, followed
            by the translated rows; the resulting text is stored in
            "products_and_services".
    """
    print("Taking subset of non-english texts...")
    # Rows that need translation: a language code is present and it is not "en".
    language = df["language (ISO-code)"]
    needs_translation = language.notna() & (language != "en")

    # Split with the boolean mask itself rather than by index membership, so
    # rows that happen to share an index label are never dropped by mistake.
    non_english_df = df[needs_translation].copy()
    untouched_df = df[~needs_translation]

    # Translate only the rows that need it.
    print("Applying translation...")
    non_english_df["translated_text"] = non_english_df["typo_corrected"].progress_apply(translate_Google)

    # Put the untouched rows and the translated rows back into one frame.
    print("Merging the corrected english texts with the original df...\n")
    merged_df = pd.concat([untouched_df, non_english_df], ignore_index=True)

    # Fall back to the typo-corrected text wherever no translation is available
    # (rows that were not translated, or whose translation failed). Assign the
    # result back instead of using a chained `inplace=True`, which does not
    # update the frame under pandas Copy-on-Write.
    merged_df["translated_text"] = merged_df["translated_text"].fillna(merged_df["typo_corrected"])

    translated_df = merged_df.drop(
        columns=["typo_corrected", "language (ISO-code)", "to_process"]
    ).rename(
        columns={
            "products_and_services": "raw_products_and_services",
            "translated_text": "products_and_services",
        }
    )
    return translated_df

### Base Data

In [8]:
# Translate the base products. Long-running: the recorded run below took about
# 46 minutes for 2856 rows.
# NOTE(review): the base frame displayed earlier only shows `Unnamed: 0` and
# `typo_corrected`, while translate_df reads "language (ISO-code)" - confirm the
# loaded base file actually carries that column before re-running.
base_translated_df = translate_df(df=base_corrected_df)

Taking subset of non-english texts...
Applying translation...


  0%|          | 0/2856 [00:00<?, ?it/s]

100%|██████████| 2856/2856 [45:47<00:00,  1.04it/s] 

Merging the corrected english texts with the original df...






In [9]:
# Destination CSV for the translated base products
output_path_base_translated = "../../data/example_data/output/base_data/base_translated_products.csv"

# Save the translated frame; the row index is written as the first column.
# NOTE(review): that index column is read back as `Unnamed: 0` - consider
# `index=False` once downstream readers are confirmed not to rely on it.
base_translated_df.to_csv(path_or_buf=output_path_base_translated)

### new Italy Data

In [10]:
# Translate the new Italy products. The recorded run below took about
# 3 minutes for 229 rows.
italy_translated_df = translate_df(df=italy_corrected_df)

Taking subset of non-english texts...
Applying translation...


  0%|          | 0/229 [00:00<?, ?it/s]

100%|██████████| 229/229 [02:48<00:00,  1.36it/s]

Merging the corrected english texts with the original df...






In [11]:
# Destination CSV for the translated new Italy products
output_path_italy_translated = "../../data/example_data/output/italy_data/italy_translated_products.csv"

# Save the translated frame; the row index is written as the first column.
# NOTE(review): that index column is read back as `Unnamed: 0` - consider
# `index=False` once downstream readers are confirmed not to rely on it.
italy_translated_df.to_csv(path_or_buf=output_path_italy_translated)