# 2. Translate the input products

## 2.1 Import the required libraries

In [1]:
from deep_translator import GoogleTranslator # this package enables the translation of text into another language
from googletrans import Translator # this package enables the translation of text into another language
from transformers import pipeline
from langdetect import detect_langs, DetectorFactory
from textblob import TextBlob
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's pandas integration so `progress_apply` is available on Series/DataFrames
tqdm.pandas()

# Pin langdetect's seed so repeated runs give the same detection results
DetectorFactory.seed = 0

# Shared NLP resources used throughout the notebook
translator = Translator()
nlp = spacy.load('en_core_web_sm')

# NOTE: this model is about 1.1 gigabyte, so the first download takes around 5 minutes
language_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)




## 2.2 Load in the dataset

### Base data

In [None]:
# Relative path to the typo-corrected base products CSV
file_location = "../../data/example_data/output/corrected_products.csv"

# Load the CSV and discard the 'Unnamed: 0' column (presumably a saved index)
# together with the original, uncorrected product text
corrected_df = (
    pd.read_csv(file_location)
    .drop(columns=['Unnamed: 0', 'products_and_services'])
)

# Render the loaded dataframe
display(corrected_df)

### New Italy data

In [8]:
# Relative path to the typo-corrected Italy products CSV
italy_tilt_data_location = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

# Load the CSV and discard the 'Unnamed: 0' column (presumably a saved index)
italy_tilt_data = (
    pd.read_csv(italy_tilt_data_location)
    .drop(columns='Unnamed: 0')
)

# Render the loaded dataframe
display(italy_tilt_data)

Unnamed: 0,products_and_services,products_id,language (ISO-code),typo_corrected
0,agriculture - machines & equipment,941c038a-3547-4985-835d-6c82a56090f8,fr,agriculture - machines & equipment
1,agricultural systems and equipment,98d6e96c-3d4f-4d33-940e-3444d9f725c8,ca,agricultural systems and equipment
2,natural fertilizers,77f15f2e-24e8-473a-9944-a9a84d372d91,ro,natural fertilizers
3,organic manures,e7a9f076-34c9-4e31-9402-2bd81e35d62e,ro,organic manures
4,insecticides for agriculture,d4bee171-af70-47ad-8c7b-a9e1164e9243,ca,insecticides for agriculture
...,...,...,...,...
14572,breathable and mould resistant plastic coverings,50009802-62cf-41ba-bec1-c7b91b5fa5e7,en,breathable and mould resistant plastic coverings
14573,articles for painting,35bcd44c-b231-4540-9c60-7a425fa63251,en,articles for painting
14574,"wall-coverings, fabric",9c4c22f5-6c84-402e-91d3-b317896e8650,en,"wall-coverings, fabric"
14575,"wall-coverings, fabric",a3664d9f-c777-4193-8707-417b633d9a4e,en,"wall-coverings, fabric"


## 2.3 Translate dataframe

### Helper functions

In [9]:
def translate_Google(text):
    """
    Translate ``text`` using the notebook-level ``translator`` object.

    No source or target language is passed, so the translator's own defaults
    apply; the notebook relies on those defaults producing English.

    Parameters
    ----------
    text : str
        The text to translate.

    Returns
    -------
    str or float
        The translated text, or ``np.nan`` if the translation call fails for
        any reason (e.g. a network error or an unexpected response).
    """
    try:
        return translator.translate(text).text
    except Exception:
        # Best-effort: one failed lookup must not abort a long batch run, so
        # the failure is reported as NaN. Catching `Exception` instead of using
        # a bare `except` lets KeyboardInterrupt / SystemExit propagate, so the
        # run can still be stopped by the user.
        return np.nan

### Google translator module

In [25]:
def translate_df(df):
    """
    Translate the non-English rows of ``df`` and return a two-stage-merged result.

    Rows whose ``"language (ISO-code)"`` is not ``"en"`` have their
    ``"typo_corrected"`` text passed through ``translate_Google``. Rows marked
    ``"en"`` are not translated. Wherever no translation is available (English
    rows, or rows where the translation returned a missing value) the
    ``"typo_corrected"`` text is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"language (ISO-code)"`` and
        ``"typo_corrected"``. A ``"products_and_services"`` column is optional;
        if present it is replaced by the translated text.

    Returns
    -------
    pandas.DataFrame
        A new dataframe (the input is not modified) without the
        ``"typo_corrected"`` and ``"language (ISO-code)"`` columns, in which
        ``"products_and_services"`` holds the translated text. English rows
        come first, followed by the translated rows, with a fresh 0..n-1 index.
    """
    language_col = "language (ISO-code)"

    # Split the rows by language; the copy keeps the caller's dataframe untouched
    print("Taking subset of non-english texts...")
    non_english_df = df[df[language_col] != "en"].copy()
    english_df = df[df[language_col] == "en"]

    # Translate the non-English texts. `progress_apply` only exists once
    # `tqdm.pandas()` has been called, so fall back to a plain `apply` otherwise.
    print("Applying translation...")
    texts = non_english_df["typo_corrected"]
    apply_with_progress = getattr(texts, "progress_apply", texts.apply)
    non_english_df["translated_text"] = apply_with_progress(translate_Google)

    # Recombine: English rows first, then the translated rows
    print("Merging the corrected english texts with the original df...")
    merged_df = pd.concat([english_df, non_english_df], ignore_index=True)

    # Fall back to the typo-corrected text where there is no translation.
    # Assigning the result back (instead of `fillna(..., inplace=True)` on a
    # column selection) keeps this working under pandas Copy-on-Write.
    merged_df["translated_text"] = merged_df["translated_text"].fillna(merged_df["typo_corrected"])

    # `errors="ignore"` lets this also handle inputs from which
    # "products_and_services" was already dropped at load time.
    translated_df = (
        merged_df
        .drop(columns=["typo_corrected", language_col, "products_and_services"], errors="ignore")
        .rename(columns={"translated_text": "products_and_services"})
    )
    return translated_df

### Base Data

### New Italy Data

In [26]:
# Translate the non-English rows of the Italy data; rows marked "en" keep their typo-corrected text
italy_translated_df = translate_df(italy_tilt_data)

Taking subset of non-english texts...
Applying translation...


100%|██████████| 5852/5852 [02:38<00:00, 36.85it/s]

Merging the corrected english texts with the original df...





In [29]:
# Destination for the translated Italy products
output_path_translated = "../../data/example_data/output/italy_data/italy_translated_products.csv"
# NOTE(review): no `index=False` is passed, so the dataframe index is presumably written as an
# unnamed first column - consistent with the 'Unnamed: 0' column dropped when CSVs are loaded above.
italy_translated_df.to_csv(output_path_translated)