# 2. Translate the input products

## 2.1 Import the required libraries

In [2]:
from deep_translator import GoogleTranslator # this package enables the translation of text into another language
from googletrans import Translator # this package enables the translation of text into another language
from transformers import pipeline
from langdetect import detect_langs, DetectorFactory
from textblob import TextBlob
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# Load the spaCy pipeline named 'en_core_web_sm'
nlp = spacy.load('en_core_web_sm') 
# Hugging Face text-classification pipeline backed by the XLM-RoBERTa language-detection model
language_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection") # this model is 1.1 gigabyte so it will take around 5 mins to download it
# Shared googletrans client; translate_Google below sends its requests through this object
translator = Translator()

# Fix the langdetect seed (presumably so repeated detections give the same result — confirm against langdetect docs)
DetectorFactory.seed = 0

## 2.2 Load in the dataset

### Base data

In [None]:
# Relative path to the typo-corrected base products CSV
base_file_location = "../../data/example_data/output/base_data/base_typo_corrected_products.csv"

# Load the CSV, then discard the 'Unnamed: 0' and 'products_and_services' columns
base_corrected_df = pd.read_csv(base_file_location)
base_corrected_df = base_corrected_df.drop(['Unnamed: 0', 'products_and_services'], axis=1)

# Render the resulting frame in the notebook
display(base_corrected_df)

### New Italy data

In [12]:
# Relative path to the typo-corrected Italy products CSV
italy_file_location = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

# Load the CSV, then discard the 'Unnamed: 0' column
italy_corrected_df = pd.read_csv(italy_file_location)
italy_corrected_df = italy_corrected_df.drop('Unnamed: 0', axis=1)

# Render the resulting frame in the notebook
display(italy_corrected_df)

Unnamed: 0,products_and_services,products_id,language (ISO-code),typo_corrected
0,persian blue salt,50abde66-58b7-4fb3-a007-1077fa41a010,ca,persian blue salt
1,rossoro,2b898f73-e699-4b52-926d-476b51dacd42,pt,rossoro
2,peas,a7849fec-bf97-48b0-b89b-cb910a8d6dc8,ro,peas
3,fuel,2a782696-0317-43ad-8f2d-0560e17acd8d,es,fuel
4,tomato paste,e815f6fd-7684-4f67-8cd3-7ca65edc43c6,it,tomato paste
...,...,...,...,...
3430,food supplies for catering industry,6a296ca9-9f97-42d8-8fc6-188e575990dd,en,food supplies for watering industry
3431,pleasure craft furnishings,f6ed20f1-2188-4cd1-8dc6-d873bca28c4a,en,pleasure craft furnishing
3432,"divans, armchairs and chairs repaired",3f969e86-452e-41b6-8835-97436674c186,en,"divan, armchairs and chairs repaired"
3433,breathable and mould resistant plastic coverings,d675d682-018e-4829-ac66-e61155d7a45e,en,breathable and mould resistant plastic coverings


## 2.3 Translate dataframe

### Helper functions

In [9]:
def translate_Google(text):
    """
    Translate a single text into English using the module-level googletrans ``translator``.

    Parameters
    ----------
    text : str
        The text to translate.

    Returns
    -------
    str or float
        The translated text, or ``np.nan`` when ``text`` is not a non-blank
        string or when the translation request fails. Callers can therefore
        fall back to the untranslated text with ``fillna``.
    """
    # Missing values (NaN/None) and blank strings cannot be translated; skip the request
    if not isinstance(text, str) or not text.strip():
        return np.nan
    try:
        # Target language is spelled out so the behaviour matches the docstring
        return translator.translate(text, dest="en").text
    except Exception:
        # Best-effort: any translation failure becomes a missing value.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running apply can still be interrupted.
        return np.nan

### Google translator module

In [10]:
def translate_df(df):
    """
    Translate the non-English rows of a product dataframe into English.

    Rows whose "language (ISO-code)" equals "en" are kept untranslated; every
    other row has its "typo_corrected" text passed through ``translate_Google``.
    Where translation yields a missing value, the typo-corrected text is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "language (ISO-code)" and "typo_corrected".
        A "products_and_services" column is optional and is discarded.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh index, English rows first and the translated
        rows after them. The "typo_corrected" and "language (ISO-code)" columns
        (and any incoming "products_and_services") are removed, and the final
        English text is stored under "products_and_services".
    """
    # split the rows into English texts and texts that need translation
    print("Taking subset of non-english texts...")
    is_english = df["language (ISO-code)"] == "en"
    english_df = df[is_english]
    # copy so that adding a column does not write into a view of the input
    non_english_df = df[~is_english].copy()

    # translate the typo-corrected text of the non-English rows
    print("Applying translation...")
    non_english_df["translated_text"] = non_english_df["typo_corrected"].progress_apply(translate_Google)

    # stack the untouched English rows and the translated rows back together
    print("Merging the corrected english texts with the original df...")
    merged_df = pd.concat([english_df, non_english_df], ignore_index=True)

    # English rows and failed translations have no translated text: fall back to
    # the typo-corrected text. Assign the result back instead of a chained
    # in-place fillna, which does not update the frame under Copy-on-Write.
    merged_df["translated_text"] = merged_df["translated_text"].fillna(merged_df["typo_corrected"])

    # errors="ignore": the input may already lack "products_and_services"
    translated_df = (
        merged_df
        .drop(columns=["typo_corrected", "language (ISO-code)", "products_and_services"], errors="ignore")
        .rename(columns={"translated_text": "products_and_services"})
    )
    return translated_df

### Base Data

In [None]:
# Translate the non-English base products; rows marked "en" keep their typo-corrected text
base_translated_df = translate_df(base_corrected_df)

In [None]:
# Destination of the translated base products CSV (relative to this notebook)
output_path_base_translated = "../../data/example_data/output/base_data/base_translated_products.csv"
# No index argument is passed, so pandas' default applies and the row index is written as the first, unnamed column
base_translated_df.to_csv(output_path_base_translated)

### New Italy data

In [13]:
# Translate the non-English Italy products; rows marked "en" keep their typo-corrected text
italy_translated_df = translate_df(italy_corrected_df)

Taking subset of non-english texts...
Applying translation...


100%|██████████| 1503/1503 [00:41<00:00, 36.13it/s]

Merging the corrected english texts with the original df...





In [14]:
# Destination of the translated Italy products CSV (relative to this notebook)
output_path_italy_translated = "../../data/example_data/output/italy_data/italy_translated_products.csv"
# No index argument is passed, so pandas' default applies and the row index is written as the first, unnamed column
italy_translated_df.to_csv(output_path_italy_translated)