# 1. Typo correction


## 1.1 Import the required libraries


In [7]:
import os
import uuid

import numpy as np
import pandas as pd
import torch
from lingua import Language, LanguageDetectorBuilder
from pandarallel import pandarallel
from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, BooleanType
from textblob import TextBlob
from tqdm import tqdm
from transformers import pipeline

# Namespace UUID used to derive deterministic uuid5 product ids (any UUID would do)
namespace = uuid.NAMESPACE_DNS

# Report whether torch can see a CUDA GPU.
# NOTE(review): `device` is only printed; it is not passed to the pipeline created below.
device = torch.cuda.is_available()
print("GPU availability:{}".format(device))

# Spark session and the Databricks utilities handle built on top of it.
# The Databricks cluster has to be up and running; restart the kernel to reflect cluster activity.
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# Enables `parallel_apply` on pandas objects, with a progress bar
pandarallel.initialize(progress_bar=True)

# Restrict language detection to the languages that can plausibly occur in the data
languages = [
    Language.ENGLISH,
    Language.FRENCH,
    Language.GERMAN,
    Language.SPANISH,
    Language.ITALIAN,
    Language.DUTCH,
    Language.ARABIC,
    Language.CHINESE,
]
language_detector = LanguageDetectorBuilder.from_languages(*languages).build()

# Hugging Face text2text pipeline used by the "huggingface" mode of typo_correction
typo_corrector = pipeline(
    "text2text-generation",
    model="oliverguhr/spelling-correction-english-base",
    max_length=1000,
)

GPU availability:False
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## 1.2 Load in the raw data

In [8]:
# Schema applied when reading the raw CSV files. Every column is read as a
# string; each pair below is (column name, nullable).
schema = StructType([
    StructField(column_name, StringType(), is_nullable)
    for column_name, is_nullable in (
        ('company_name', False),
        ('group', True),
        ('sector', True),
        ('subsector', True),
        ('main_activity', True),
        ('address', True),
        ('company_city', True),
        ('postcode', True),
        ('country', False),
        ('products_and_services', True),
        ('information', True),
        ('min_headcount', True),
        ('max_headcount', True),
        ('type_of_building_for_registered_address', True),
        ('verified_by_europages', True),
        ('year_established', True),
        ('websites', True),
        ('download_datetime', True),
        ('id', False),
        ('filename', False),
    )
])

In [11]:
def load_files():
    """Load the raw company files into a single pandas dataframe.

    Lists the input directory (which one depends on whether the notebook runs
    on Databricks), reads every file as a ';'-separated CSV with the
    module-level `schema`, and keeps only the `id` and
    `products_and_services` columns.

    Returns:
        pd.DataFrame: Concatenation of all files. The index is NOT reset, so
            index labels repeat across files.
    Raises:
        ValueError: If the input directory contains no files.
    """
    if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
        source_dir = "abfss://landingzone@storagetiltdevelop.dfs.core.windows.net/tiltEP/"
    else:
        # NOTE(review): "abfss:/mnt/..." is an unusual URI form -- confirm this path is intended
        source_dir = "abfss:/mnt/indicatorBefore/tiltEP/"

    # first field of every listing entry is the full path of the file
    files = [entry[0] for entry in dbutils.fs.ls(source_dir)]
    if not files:
        # fail with a clear message instead of pandas' "No objects to concatenate"
        raise ValueError("No input files found in {}".format(source_dir))

    # read every file with spark, keep only the two needed columns, convert to pandas
    df_list = [
        spark.read.csv(path, header=True, schema=schema, sep=";")
        .select("id", "products_and_services")
        .toPandas()
        for path in tqdm(files)
    ]
    # concatenate all the dataframes in the list into one dataframe
    return pd.concat(df_list)

def create_product_id(dataframe, output_path="../../data/example_data/output/tilt_company_products.csv"):
    """Add a deterministic `products_id` column and export the company/product mapping.

    The id is a uuid5 of the product name under the module-level `namespace`,
    so the same product name always yields the same id.

    Note: the column is added to `dataframe` in place; the same object is returned.

    Args:
        dataframe (pd.DataFrame): Needs the columns `id` and
            `products_and_services`; the latter must contain strings.
        output_path (str | None): Where the `id` / `products_id` mapping is
            written as CSV (the index is included). Pass None to skip the
            export. Defaults to the path the notebook has always used.
    Returns:
        pd.DataFrame: `dataframe` with the additional `products_id` column.
    """
    # Use uuid5 to generate a UUID based on the product name
    dataframe['products_id'] = dataframe['products_and_services'].apply(
        lambda name: uuid.uuid5(namespace, name)
    )
    # extract company id and products_id and export them
    if output_path is not None:
        dataframe[["id", "products_id"]].to_csv(output_path)
    return dataframe

def split_delimit(dataframe):
    """Turn '|'-delimited product strings into one row per product.

    The input is not modified. Each product is stripped of surrounding
    whitespace BEFORE empty entries are removed, so tokens that consist only
    of whitespace (e.g. the middle token of "a | |b") are dropped instead of
    surviving as empty strings.

    Args:
        dataframe (pd.DataFrame): Needs a string column `products_and_services`.
    Returns:
        pd.DataFrame: Same columns as the input, one row per non-empty
            product, without duplicate rows. The index is reset before
            duplicates are dropped, so it may contain gaps.
    """
    exploded = dataframe.copy()
    # split the delimited string into a list and give every element its own row
    exploded["products_and_services"] = exploded["products_and_services"].str.split("|")
    exploded = exploded.explode("products_and_services")

    # remove surrounding whitespace first so that blank tokens become empty strings
    exploded["products_and_services"] = exploded["products_and_services"].str.strip()

    # keep only rows that still carry a product name (neither null nor empty)
    product = exploded["products_and_services"]
    exploded = exploded[product.notna() & product.ne("")].reset_index(drop=True)

    return exploded.drop_duplicates()

def explode_products():
    """Build the table of unique products from the raw files.

    Loads the raw files, splits the delimited product strings into one row
    per product, attaches a product id (which also exports the company/product
    mapping) and returns the distinct `products_id` / `products_and_services`
    pairs.

    Returns:
        pd.DataFrame: Columns `products_id` and `products_and_services`,
            without duplicate rows.
    """
    raw_products = load_files()
    single_products = split_delimit(raw_products)
    products_with_id = create_product_id(single_products)
    unique_products = products_with_id[["products_id", "products_and_services"]]
    return unique_products.drop_duplicates()

In [12]:
# Load all raw files and build the deduplicated product table
# (columns: products_id, products_and_services).
x = explode_products()

100%|██████████| 1337/1337 [10:02<00:00,  2.22it/s] 


### Helper functions

In [13]:
def conf_ld_detect_language(text):
    """Language detection wrapper.

    Detects the language of `text` with the module-level `language_detector`
    and returns its lower-cased ISO 639-1 code. If no language can be
    determined, or detection raises, the string 'ident_fail' is returned, so
    the result is always a plain string.

    Args:
        text (str): The string for which the language shall be detected.
    Returns:
        str: The detected language (ISO 639-1 code, e.g. 'en') or 'ident_fail'.
    """
    try:
        detected_language = language_detector.detect_language_of(text)
        # no reliable detection -> report failure explicitly
        if detected_language is None:
            return "ident_fail"
        return detected_language.iso_code_639_1.name.lower()
    except Exception:
        # best effort: a single undetectable value must not abort the whole apply
        return "ident_fail"

In [15]:
def typo_correction(text="", model="default"):
    """Typo correction wrapper.

    Returns the corrected text. If the correction itself fails, the original
    text is returned unchanged.

    Args:
        text (str): The string to be corrected.
        model (str): The model to be used for typo correction: "default"
            (TextBlob) or "huggingface" (the module-level `typo_corrector`
            pipeline).
    Returns:
        str: The corrected string, or `text` if the correction failed.
    Raises:
        ValueError: If `model` is not one of the supported values.
    """
    # validate outside the try block so a misconfiguration is not swallowed
    # (previously an unknown model silently returned None)
    if model not in ("default", "huggingface"):
        raise ValueError("Unknown typo correction model: {!r}".format(model))

    try:
        if model == "default":
            return TextBlob(text).correct().string
        return typo_corrector(text)[0]["generated_text"]
    except Exception:
        # best effort: keep the original text when the corrector fails
        return text

### Typo correction module

In [16]:
def typo_correct_df(df):
    """Typo correction wrapper for dataframes.

    Detects the language of every `products_and_services` value, runs typo
    correction on the English rows only, and adds a `typo_corrected` column.
    Rows that are not English keep their original text in that column. The
    result is lower-cased and trailing dots are removed.

    The input dataframe is not modified. In the result the non-English rows
    come first, followed by the English rows, with a fresh index.

    Args:
        df (pd.DataFrame): The dataframe containing the text to be corrected
            in the column `products_and_services`.
    Returns:
        pd.DataFrame: Copy of the input with the additional columns
            `language (ISO-code)` and `typo_corrected`.
    """
    # detect the language of every product text
    print("Detecting the language of the text...")
    to_process_df = df.copy()
    to_process_df["language (ISO-code)"] = to_process_df["products_and_services"].parallel_apply(
        conf_ld_detect_language
    )

    # split into the English rows (to be corrected) and all remaining rows
    print("Taking subset of English texts...")
    is_english = to_process_df["language (ISO-code)"] == "en"
    english_df = to_process_df[is_english].copy()
    other_df = to_process_df[~is_english]

    # apply typo correction to English texts
    print("Applying typo correction...")
    english_df["typo_corrected"] = english_df["products_and_services"].parallel_apply(typo_correction)

    # merge the corrected English texts with the remaining rows
    print("Merging the corrected english texts with the original df...")
    merged_df = pd.concat([other_df, english_df], ignore_index=True)
    # rows without a correction fall back to the original text; assign the result
    # back instead of an in-place fillna on a column selection, which does not
    # update the frame under pandas copy-on-write
    merged_df["typo_corrected"] = merged_df["typo_corrected"].fillna(merged_df["products_and_services"])
    # lowercase and remove all dots at the end; regex=True is required because
    # str.replace treats the pattern as a literal string by default in pandas >= 2.0
    merged_df["typo_corrected"] = (
        merged_df["typo_corrected"].str.lower().str.replace(r"\.+$", "", regex=True)
    )
    return merged_df

### Apply typo correction to the provided input data

In [17]:
# Run language detection and typo correction on the product table built above.
corrected_df = typo_correct_df(x)

Detecting the language of the text...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5453), Label(value='0 / 5453'))), …

Taking subset of English texts...
Applying typo correction...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4583), Label(value='0 / 4583'))), …

Merging the corrected english texts with the original df...


## 1.4 Export the dataframe with the corrected text 

In [18]:
# The CSV export only happens outside Databricks; on Databricks this cell writes nothing.
if 'DATABRICKS_RUNTIME_VERSION' not in os.environ:
    # Target path of the typo-corrected product table
    output_path_typo_corrected = "../../data/example_data/output/tilt_products_typo_corrected.csv"
    # Write the dataframe without its index
    corrected_df.to_csv(output_path_typo_corrected, index=False)