# 1. Typo correction


## 1.1 Import the required libraries


In [2]:
from transformers import pipeline
from langdetect import detect_langs, DetectorFactory
from textblob import TextBlob
import pandas as pd
from tqdm import tqdm
# Register `progress_apply` on pandas objects so long-running applies show a progress bar
tqdm.pandas()

# Make langdetect deterministic (its detection is otherwise randomised between runs)
DetectorFactory.seed = 0

# Hugging Face language-identification model.
# This model is 1.1 gigabyte, so it will take around 5 mins to download on first use.
language_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
)

# Hugging Face English spelling-correction model.
# NOTE(review): `translator` is not referenced by any other cell visible in this notebook.
translator = pipeline(
    "text2text-generation",
    model="oliverguhr/spelling-correction-english-base",
    max_length=1000,
)

## 1.2 Load in the raw data

### Base Data

In [3]:
# Relative path to the unprocessed base products-and-services CSV
base_tilt_data_location = "../../data/example_data/input/tilt_base_products_and_services_unprocessed.csv"

# Read the raw CSV into a dataframe
base_tilt_data = pd.read_csv(filepath_or_buffer=base_tilt_data_location)

# Show the loaded dataframe
display(base_tilt_data)

Unnamed: 0,products_and_services,products_id
0,industrial lifting,49affd7c-7f90-4977-8f10-ae84fe6b3dcc
1,customised lifting solutions,14961ecc-341b-48b0-bc70-a3db16e0972b
2,vacuum lifting solution,1d111749-f816-419e-b45f-06bbe291f9be
3,concrete lifting solutions,c2986ffd-6fa6-4da0-94f2-b2da1c1d07e7
4,sheet metal handling,5720becf-9c80-485a-b0e4-97203117455b
...,...,...
20586,local public transport,38a4ec9f-9a2d-4628-99e5-7e8477571dd3
20587,passenger information,207854a8-a834-48a7-b397-294aed97ffb1
20588,"rescue apparatus, hydraulic",dc1ff0ac-9e44-47cf-9981-009f600d9ee0
20589,wood pellets wood briquette,9e3e3cfe-27c2-4519-b364-8deaea7f6474


### tilt Italy Data

In [3]:
# Relative path to the unprocessed Italy products-and-services CSV
italy_tilt_data_location = "../../data/example_data/input/tilt_italy_products_and_services_unprocessed.csv"

# Read the raw CSV into a dataframe
italy_tilt_data = pd.read_csv(filepath_or_buffer=italy_tilt_data_location)

# Show the loaded dataframe
display(italy_tilt_data)

Unnamed: 0,products_and_services,products_id
0,persian blue salt,50abde66-58b7-4fb3-a007-1077fa41a010
1,organic saffron bio,0ea739bc-c3fb-4476-b8d6-96ba5085aa00
2,rossoro,2b898f73-e699-4b52-926d-476b51dacd42
3,peas,a7849fec-bf97-48b0-b89b-cb910a8d6dc8
4,cauliflowers,e09840c2-df4a-4920-b91b-e23177b5f5b3
...,...,...
3430,enamels,77890bab-e5c2-40d1-bcf7-f11fb9ac8c09
3431,breathable and mould resistant plastic coverings,d675d682-018e-4829-ac66-e61155d7a45e
3432,quartz paints,e5f78c8c-b79e-4a12-8451-867349f09cc6
3433,carpet,1f4814fc-759f-49b6-b2cc-9ba1c77582bd


## 1.3 Apply typo correction module

### Helper functions

In [4]:
def conf_ld_detect_language(text, model="def"):
    """Language detection wrapper.

    Returns the ISO code of the language detected for ``text``. If detection
    fails for any reason (for example empty or non-string input), the string
    'ident_fail' is returned instead, so the result is always a plain string.

    Args:
        text (str): The string for which language shall be detected.
        model (str): The model to be used for language detection. "def" (the
            default) uses langdetect; any other value uses the Hugging Face
            ``language_detector`` pipeline.
    Returns:
        str: The detected language (ISO-code), or 'ident_fail' on failure.
    """
    try:
        if model == "def":
            # The first candidate is taken as the highest-confidence language.
            return detect_langs(text)[0].lang
        # The first pipeline result carries the predicted language under "label".
        return str(language_detector(text)[0]["label"])
    except Exception:
        # Return a single string (not a tuple) so the column built from this
        # function has one consistent type. `Exception` rather than a bare
        # `except` keeps Ctrl-C / kernel interrupts working during long applies.
        return "ident_fail"

In [5]:
def typo_correction(text=""):
    """Typo correction wrapper.

    Returns the corrected text. If the correction fails (for example because
    ``text`` is not a string), the input is returned unchanged.

    Args:
        text (str): The string to be corrected.
    Returns:
        str: The corrected string, or the original input on failure.
    """
    try:
        # TextBlob's spelling correction only works for English; the same holds
        # for contextualspellcheck and the Hugging Face pipeline. pyspellcheck
        # supports other languages but needs the text split into single words.
        return TextBlob(text).correct().string
    except Exception:
        # Best-effort: fall back to the untouched input. `Exception` rather than
        # a bare `except` so that Ctrl-C / kernel interrupts are not swallowed
        # while this runs row by row over a large dataframe.
        return text

### Typo correction module

In [6]:
def typo_correct_df(df):
    """Typo correction wrapper for dataframes.

    Detects the language of every entry in the "products_and_services" column,
    applies typo correction to the entries detected as English ("en"), and
    returns a new dataframe with two extra columns: "language (ISO-code)" and
    "typo_corrected". For non-English rows, and wherever the correction is
    missing, "typo_corrected" holds the original text.

    The input dataframe is not modified. Rows in the result are ordered with
    the non-English rows first, followed by the English rows, and the index is
    reset.

    Args:
        df (pd.DataFrame): The dataframe containing the text to be corrected.
            Must have a "products_and_services" column.
    Returns:
        pd.DataFrame: The dataframe with corrected text.
    """
    # Work on a copy so the caller's dataframe does not silently gain a column,
    # which would make re-running the calling cell behave differently.
    df = df.copy()

    # detect the language of the text
    print("Detecting the language of the text...")
    df["language (ISO-code)"] = df["products_and_services"].progress_apply(conf_ld_detect_language)

    # split into English texts and everything else
    print("Taking subset of English texts...")
    is_english = df["language (ISO-code)"] == "en"
    english_df = df[is_english].copy()
    other_df = df[~is_english]

    # apply typo correction to English texts only (the corrector is English-only)
    print("Applying typo correction...")
    english_df["typo_corrected"] = english_df["products_and_services"].progress_apply(typo_correction)

    # merge the corrected English texts back with the remaining rows
    print("Merging the corrected english texts with the original df...")
    merged_df = pd.concat([other_df, english_df], ignore_index=True)
    # Non-English rows have no correction yet: fall back to the original text.
    # Assign the result back instead of `fillna(..., inplace=True)` on a column
    # selection, which does not reliably update the parent dataframe.
    merged_df["typo_corrected"] = merged_df["typo_corrected"].fillna(merged_df["products_and_services"])
    return merged_df

### Base Data

In [7]:
# Long-running: in the recorded run below, language detection took ~7 min and typo correction ~33 min
base_typo_corrected_df = typo_correct_df(df=base_tilt_data)

Detecting the language of the text...


100%|██████████| 20591/20591 [06:44<00:00, 50.90it/s]


Taking subset of English texts...
Applying typo correction...


100%|██████████| 11380/11380 [32:36<00:00,  5.82it/s] 

Merging the corrected english texts with the original df...





### tilt Italy Data

In [9]:
# In the recorded run below, language detection took ~1 min and typo correction ~5.5 min
italy_typo_corrected_df = typo_correct_df(df=italy_tilt_data)

Detecting the language of the text...


100%|██████████| 3435/3435 [01:02<00:00, 54.54it/s]


Taking subset of English texts...
Applying typo correction...


100%|██████████| 1932/1932 [05:26<00:00,  5.92it/s]

Merging the corrected english texts with the original df...





## 1.4 Export the dataframe with the corrected text 

### Base Data

In [8]:
# Destination CSV for the typo-corrected base data
output_path_base_typo_corrected = "../../data/example_data/output/base_data/base_typo_corrected_products.csv"

# Persist the typo-corrected dataframe to that location
base_typo_corrected_df.to_csv(path_or_buf=output_path_base_typo_corrected)

### tilt Italy Data

In [10]:
# Destination CSV for the typo-corrected Italy data
output_path_italy_typo_corrected = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

# Persist the typo-corrected dataframe to that location
italy_typo_corrected_df.to_csv(path_or_buf=output_path_italy_typo_corrected)