# 1. Typo correction


## 1.1 Import the required libraries


In [2]:
import pandas as pd
import torch
import os

from langdetect import detect_langs, DetectorFactory
from pyspark.sql.functions import pandas_udf
from transformers import pipeline
from textblob import TextBlob
from tqdm import tqdm

tqdm.pandas()

device = True if torch.cuda.is_available() else False
print("GPU availability:{}".format(device))

language_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection") # this model is 1.1 gigabyte so it will take around 5 mins to download it
typo_corrector = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base", max_length=1000)

DetectorFactory.seed = 0

## 1.2 Load in the raw data

In [None]:
if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
    # Determine the location of the dataframe containing the translated text
    base_tilt_data_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/base_data/base_flagged_products.csv"
    # Determine the location of the dataframe containing the translated text
    italy_tilt_data_location = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/italy_data/italy_flagged_products.csv"

    # use raw_df
    base_tilt_data = spark.read.option("header", "true").csv(base_tilt_data_location).toPandas()
    # use raw_df
    italy_tilt_data = spark.read.option("header", "true").csv(italy_tilt_data_location).toPandas()
else:
    # Determine the location of the dataframe containing the translated text
    base_tilt_data_location = "../../data/example_data/output/base_data/base_flagged_products.csv"
    # Determine the location of the dataframe containing the translated text
    italy_tilt_data_location = "../../data/example_data/output/italy_data/italy_flagged_products.csv"
    
    # use raw_df
    base_tilt_data = pd.read_csv(base_tilt_data_location)
    # use raw_df
    italy_tilt_data = pd.read_csv(italy_tilt_data_location)

## 1.3 Apply typo correction module

### Helper functions

In [5]:
def conf_ld_detect_language(text, model="def"):
    """Language detection wrapper.
    
    Returns detected language (ISO-code) and confidence of detection. In case of 
    failure of detection string 'ident_fail' and a pd.NA value for confidence is 
    returned.
    
    Args:
        text (str): The string for which language shall be detected.
        model (str): The model to be used for language detection. Defaults to langdetect model.
    Returns:
        str: The detected language (ISO-code).
    """
    try:
        if model == "def":
            highest_conf = detect_langs(text)[0]
            return highest_conf.lang
        elif model == "huggingface":
            result = language_detector(text)[0]
            return str(result["label"])
    except:   
        return "ident_fail", pd.NA

In [6]:
def typo_correction(text="", model="def"):
    """Typo correction wrapper.
    
    Returns corrected text. In case of failure of correction the original text 
    is returned. 
    
    Args:
        text (str): The string to be corrected.
        model (str): The model to be used for typo correction. Defaults to textblob model.
    Returns:
        str: The corrected string.
    """
    try:
        if model == "def":
            return(TextBlob(text).correct().string)
        elif model == "huggingface":
            return(typo_corrector(text)[0]["generated_text"])
    except:
        return text

### Typo correction module

In [7]:
def typo_correct_df(df):
    """Typo correction wrapper for dataframes.
    
    Returns dataframe with corrected text. In case of failure of correction the 
    original text is returned. 
    
    Args:
        df (pd.DataFrame): The dataframe containing the text to be corrected.
    Returns:
        pd.DataFrame: The dataframe with corrected text.
    """
    # detect the language of the text but only for the rows that do not have a value in the automatic_processed_products_and_services column
    print("Detecting the language of the text...")
    # only take rows that have a True value in the to_process column
    to_process_df = df[df["to_process"] == True].copy()
    # exclude to_processed_df rows from df
    df = df[df["to_process"] == False].copy()
    to_process_df.loc[:, "language (ISO-code)"] = to_process_df["products_and_services"].progress_apply(lambda x: conf_ld_detect_language(x, model="huggingface"))

    # then take subset of english texts
    print("Taking subset of English texts...")
    english_df = to_process_df[to_process_df["language (ISO-code)"] == "en"]
    # exclude enlgish texts from the original df
    to_process_df = to_process_df[to_process_df["language (ISO-code)"] != "en"]

    # apply typo correction to english texts
    print("Applying typo correction...")
    english_df = english_df.copy()
    english_df.loc[:, "typo_corrected"] = english_df["products_and_services"].progress_apply(lambda x: typo_correction(x, model="huggingface"))

    # merge the corrected english texts with the original df
    print("Merging the corrected english texts with the original df...")
    df = pd.concat([to_process_df, english_df, df], ignore_index=True)
    # replace empty values in typo_corrected with the original text
    df["typo_corrected"].fillna(df["products_and_services"], inplace=True)
    # make typo_corrected lowercase and remove all dots at the end
    df["typo_corrected"] = df["typo_corrected"].str.lower().str.replace("\.$", "")
    return df

### Base Data

In [8]:
base_typo_corrected_df = typo_correct_df(base_tilt_data)

Detecting the language of the text...


  0%|          | 0/14378 [00:00<?, ?it/s]

100%|██████████| 14378/14378 [12:43<00:00, 18.82it/s]


Taking subset of English texts...
Applying typo correction...


100%|██████████| 11522/11522 [1:08:48<00:00,  2.79it/s]

Merging the corrected english texts with the original df...



  df["typo_corrected"] = df["typo_corrected"].str.lower().str.replace("\.$", "")


### tilt Italy Data

In [9]:
italy_typo_corrected_df = typo_correct_df(italy_tilt_data)

Detecting the language of the text...


100%|██████████| 838/838 [00:34<00:00, 24.04it/s]


Taking subset of English texts...
Applying typo correction...


100%|██████████| 609/609 [03:12<00:00,  3.17it/s]

Merging the corrected english texts with the original df...



  df["typo_corrected"] = df["typo_corrected"].str.lower().str.replace("\.$", "")


## 1.4 Export the dataframe with the corrected text 

In [10]:
if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
    # Define the path for the new dataframe
    output_path_base_typo_corrected = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/base_data/base_typo_corrected_products.csv"
    # Define the path for the new dataframe
    output_path_italy_typo_corrected = "abfss://preprocessing@storagetiltdevelop.dfs.core.windows.net/data/example_data/output/italy_data/italy_typo_corrected_products.csv"

    # Convert the pandas dataframe to a spark sql dataframe
    base_typo_corrected_spark = spark.createDataFrame(base_typo_corrected_df)
    # Convert the pandas dataframe to a spark sql dataframe
    italy_typo_corrected_spark = spark.createDataFrame(italy_typo_corrected_df)

    # Write the new dataframe to the path
    base_typo_corrected_spark.write.csv(output_path_base_typo_corrected, mode="overwrite", header=True)
    # Write the new dataframe to the path
    italy_typo_corrected_spark.write.csv(output_path_italy_typo_corrected, mode="overwrite", header=True)
else:
    # Define the path for the new dataframe
    output_path_base_typo_corrected = "../../data/example_data/output/base_data/base_typo_corrected_products.csv"
    # Define the path for the new dataframe
    output_path_italy_typo_corrected = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

    # Write the new dataframe to the path
    base_typo_corrected_df.to_csv(output_path_base_typo_corrected, index=False)
    # Write the new dataframe to the path
    italy_typo_corrected_df.to_csv(output_path_italy_typo_corrected, index=False)