# 1. Typo correction


## 1.1 Import the required libraries


In [1]:
from transformers import pipeline
from langdetect import detect_langs, DetectorFactory
from textblob import TextBlob
import pandas as pd
import spacy
from tqdm import tqdm
# Register tqdm's pandas integration so that Series.progress_apply (used in
# typo_correct_df) is available and shows a progress bar.
tqdm.pandas()

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced in the visible cells of this notebook —
# confirm it is still needed before paying the load cost.
nlp = spacy.load('en_core_web_sm') 
# Hugging Face text-classification pipeline used as the alternative language
# detector in conf_ld_detect_language (any `model` value other than "def").
# The model is about 1.1 GB, so the first download takes around 5 minutes.
language_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

# Fix langdetect's seed so repeated runs yield the same detection results.
DetectorFactory.seed = 0




## 1.2 Load in the raw data

### Base Data

In [None]:
# Relative path to the unprocessed base products-and-services CSV.
# NOTE: this cell must be executed before the "Base Data" cell in section 1.3,
# which reads `base_tilt_data`; skipping it leads to a NameError there.
base_tilt_data_location = "../../data/example_data/input/products_and_services_unprocessed.csv"

# Read the raw base data into a dataframe
base_tilt_data = pd.read_csv(base_tilt_data_location)

# Show the loaded dataframe as notebook output
display(base_tilt_data)

Unnamed: 0,products_id,products_and_services
0,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen"
1,b0d3c55743b1b858ec2843c8870116bb8af543fd,drilling and test boring - equipment
2,b14c038972e6a52bfbf3ffbe77def57a62c5b9cf,well-management services
3,abadc2542b4b5c1ecfe41c22afb2347b1d9b65af,electronic data processing - software
4,60c58ad2ef34d96fae028f1039fab03dec9eb9a2,communication
...,...,...
32058,a56bfdd9971ddba76de33e5dd394faab63d2c58c,trading in non-ferrous products
32059,d16685f9db86a7e446d5a4c763a17016ffdfa613,precision weights for scales
32060,37c8e6d302d907a76f49d45a91949c86dd5fcc03,weights and masses - measurement and verificat...
32061,4aa756effa61af41058cf80f475a03b439232cfe,manicure scissors


### tilt Italy Data

In [2]:
# Relative path to the unprocessed tilt Italy products-and-services CSV
italy_tilt_data_location = "../../data/example_data/input/tilt_italy_products_and_services_unprocessed.csv"

# Read the raw tilt Italy data into a dataframe
italy_tilt_data = pd.read_csv(italy_tilt_data_location)

# Show the loaded dataframe as notebook output
display(italy_tilt_data)

Unnamed: 0,products_and_services,products_id
0,organic farming,996c2256-502c-4821-bd87-b822405a10eb
1,crop services,c35b0904-4747-4038-a0c7-d391af1f944e
2,agriculture - machines & equipment,941c038a-3547-4985-835d-6c82a56090f8
3,agricultural systems and equipment,98d6e96c-3d4f-4d33-940e-3444d9f725c8
4,organic fertilizers,546529b7-9e7b-4b9e-92d7-0c532ef6260f
...,...,...
14572,decorative paints,d87ded84-6b5d-4ef1-9cd0-5b1f4ea26910
14573,"wall-coverings, fabric",9c4c22f5-6c84-402e-91d3-b317896e8650
14574,"wall-coverings, fabric",a3664d9f-c777-4193-8707-417b633d9a4e
14575,carpet,f78e7f1a-8fe1-4346-b487-a54d9fa94a38


## 1.3 Apply typo correction module

### Helper functions

In [4]:
def conf_ld_detect_language(text, model="def"):
    """Language detection wrapper.

    Returns the label of the top-ranked detection. Only the language label is
    returned (no confidence value), so the result can be stored directly in a
    single dataframe column.

    Args:
        text (str): The string for which language shall be detected.
        model (str): The model to be used for language detection. "def" (the
            default) uses langdetect's ``detect_langs``; any other value uses
            the notebook-level ``language_detector`` pipeline.
    Returns:
        str: The detected language (ISO-code), or the sentinel string
        'ident_fail' when ``text`` is not a non-blank string or the detector
        raises an error.
    """
    # Missing values (None / NaN) and blank strings carry no language signal,
    # so skip the detector entirely and report a failed identification.
    if not isinstance(text, str) or not text.strip():
        return "ident_fail"

    try:
        if model == "def":
            # The first candidate is taken as the highest-confidence detection.
            return detect_langs(text)[0].lang
        return str(language_detector(text)[0]["label"])
    except Exception:
        # Always return a plain string so the caller's column keeps a single
        # value type; `Exception` (instead of a bare except) lets
        # KeyboardInterrupt stop a long-running apply.
        return "ident_fail"

In [5]:
def typo_correction(text=""):
    """Typo correction wrapper.

    Returns corrected text. In case of failure of correction the original text
    is returned unchanged.

    Args:
        text (str): The string to be corrected.
    Returns:
        str: The corrected string. Inputs that are not strings (e.g. None or
        NaN) and empty strings are returned as-is.
    """
    # Nothing to correct for missing values or empty strings; hand them back
    # without invoking the spell checker.
    if not isinstance(text, str) or not text:
        return text

    try:
        # TextBlob (like contextualspellcheck and the huggingface pipeline)
        # only works for English. pyspellcheck supports other languages but
        # requires single words as input (i.e. pre-split text).
        return TextBlob(text).correct().string
    except Exception:
        # Best effort: keep the original text when correction fails.
        # `Exception` (instead of a bare except) lets KeyboardInterrupt stop a
        # long-running apply.
        return text

### Typo correction module

In [6]:
def typo_correct_df(df):
    """Typo correction wrapper for dataframes.

    Detects the language of every entry in the "products_and_services" column,
    applies typo correction to the rows detected as English ("en") and copies
    the original text for all other rows. The input dataframe is left
    untouched.

    Args:
        df (pd.DataFrame): The dataframe containing the text to be corrected.
            Must have a "products_and_services" column.
    Returns:
        pd.DataFrame: A new dataframe with the additional columns
        "language (ISO-code)" and "typo_corrected". Non-English rows come
        first, followed by the English rows, with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's dataframe does not silently gain columns.
    working_df = df.copy()

    # detect the language of the text
    print("Detecting the languae of the text...")
    working_df["language (ISO-code)"] = working_df["products_and_services"].progress_apply(conf_ld_detect_language)

    # split into English rows (to be corrected) and everything else
    print("Taking subset of english texts...")
    is_english = working_df["language (ISO-code)"] == "en"
    # Explicit copy: assigning a new column to a bare slice triggers pandas'
    # SettingWithCopyWarning.
    english_df = working_df[is_english].copy()
    other_df = working_df[~is_english]

    # apply typo correction to english texts only (the corrector is English-only)
    print("Applying typo correction...")
    english_df["typo_corrected"] = english_df["products_and_services"].progress_apply(typo_correction)

    # stack the untouched rows and the corrected English rows back together
    print("Merging the corrected english texts with the original df...")
    corrected_df = pd.concat([other_df, english_df], ignore_index=True)
    # Rows that were not corrected fall back to their original text. Assign the
    # result instead of using a chained `inplace=True` fillna, which is
    # deprecated and has no effect under pandas copy-on-write.
    corrected_df["typo_corrected"] = corrected_df["typo_corrected"].fillna(corrected_df["products_and_services"])
    return corrected_df

### Base Data

In [7]:
# Run language detection and typo correction on the base data.
# Requires the "Base Data" load cell in section 1.2 to have been executed
# first; otherwise `base_tilt_data` is undefined and this raises NameError.
base_typo_corrected_df = typo_correct_df(base_tilt_data)

NameError: name 'base_tilt_data' is not defined

### tilt Italy Data

In [8]:
# Run language detection and typo correction on the tilt Italy data.
# Long-running: the recorded run took about 4 minutes for detection and about
# 22 minutes for correction.
italy_typo_corrected_df = typo_correct_df(italy_tilt_data)

Detecting the languae of the text...


100%|██████████| 14577/14577 [04:17<00:00, 56.61it/s]


Taking subset of english texts...
Applying typo correction...


100%|██████████| 8725/8725 [22:37<00:00,  6.43it/s] 

Merging the corrected english texts with the original df...



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_df["typo_corrected"] = english_df["products_and_services"].progress_apply(lambda x: typo_correction(x))


## 1.4 Export the dataframe with the corrected text 

### Base Data

### tilt Italy Data

In [9]:
# Relative destination path for the typo-corrected tilt Italy data
output_path_italy_typo_corrected = "../../data/example_data/output/italy_data/italy_typo_corrected_products.csv"

# Write the corrected dataframe to CSV.
# NOTE(review): the dataframe index is written as well (no index=False) while
# the loaded data already carries an "Unnamed: 0" column — confirm downstream
# readers expect both.
italy_typo_corrected_df.to_csv(output_path_italy_typo_corrected)