# Pipeline to manage anomalies processing

Before using any dataset, it is necessary to clean the data to obtain a clear, formatted, and complete dataset. This tedious step is the first step in data analysis.

In this notebook, there are building blocks (functions) that can be used later to perform the identified preprocessing steps.

- Remove missing values
- Remove duplicates
- Remove special characters
- Convert numbers to letters
- Identify the language of the review and translate it if necessary
- Correct spelling errors

In [14]:
import polars as pl
import re

## Remove missing values

In [2]:
def clean_missing_values(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """
    Remove rows from a DataFrame where the specified column has missing values,
    and return the cleaned DataFrame.
    
    Args:
        df (pl.DataFrame): The DataFrame to clean.
        column_name (str): Column to check for missing values.
        
    Returns:
        pl.DataFrame: Cleaned DataFrame.
    """
    # Drop rows where the specified column is null
    df_clean = df.drop_nulls(subset=[column_name])
    
    return df_clean

In [3]:
# Example of use
df = pl.read_csv("../data/processed/all_reviews.csv")
df_clean_missing_values = clean_missing_values(df, "review")
print(df.shape)
print(df_clean_missing_values.shape)

(3363667, 5)
(2835129, 5)


## Remove duplicates

In [8]:
def remove_duplicates(df: pl.DataFrame, subset_columns: list) -> pl.DataFrame:
    """
    Remove duplicate rows from a DataFrame based on specified columns,
    and return the cleaned DataFrame.
    
    Args:
        df (pl.DataFrame): The DataFrame to clean.
        subset_columns (list): List of columns to consider for duplicates.
        
    Returns:
        pl.DataFrame: DataFrame with duplicates removed.
    """
    # Drop duplicates based on the subset of columns
    df_clean = df.unique(subset=subset_columns)
    
    return df_clean

In [9]:
# Example of use
df_clean_duplicates = remove_duplicates(df_clean_missing_values, "review")
print(df_clean_missing_values.shape)
print(df_clean_duplicates.shape)

(2835129, 5)
(2811720, 5)


## Remove spacial characters

In [15]:
def remove_special_characters(df: pl.DataFrame, column_name: str, keep: str = "") -> pl.DataFrame:
    """
    Remove special characters from a specified text column using regex.

    Args:
        df (pl.DataFrame): Input Polars DataFrame.
        column_name (str): Name of the text column to clean.
        keep (str): Optional string of characters to preserve (e.g., ".," to keep dots and commas).

    Returns:
        pl.DataFrame: New DataFrame with cleaned text in the specified column.
    """
    # Build regex dynamically: allow alphanumeric, space, underscore, and chosen extra characters
    pattern = rf"[^\w\s{re.escape(keep)}]"

    def clean_text(text: str) -> str:
        if not isinstance(text, str) or not text.strip():
            return text  # ignore empty or non-string
        return re.sub(pattern, "", text)

    df_cleaned = df.with_columns(
        pl.col(column_name).map_elements(clean_text).alias(column_name)
    )

    return df_cleaned

In [21]:
df_no_special_character = remove_special_characters(df_clean_duplicates, "review", ".,!?")
print(df_no_special_character.head())

shape: (5, 5)
┌───────────┬────────────────────────────┬───────────────────────────┬─────────────┬───────────────┐
│ id_review ┆ review                     ┆ original_dataset          ┆ original_id ┆ service_type  │
│ ---       ┆ ---                        ┆ ---                       ┆ ---         ┆ ---           │
│ i64       ┆ str                        ┆ str                       ┆ i64         ┆ str           │
╞═══════════╪════════════════════════════╪═══════════════════════════╪═════════════╪═══════════════╡
│ 671744    ┆ We are still currently     ┆ data_tripadvisor_hotel_re ┆ 73068024    ┆ accommodation │
│           ┆ finishi…                   ┆ views                     ┆             ┆               │
│ 768293    ┆ Abbiamo soggiornato per 3  ┆ data_tripadvisor_hotel_re ┆ 117117861   ┆ accommodation │
│           ┆ gior…                      ┆ views                     ┆             ┆               │
│ 542541    ┆ After the glowing report I ┆ data_tripadvisor_hotel_re ┆ 232486

# Convert numbers to letters

In [22]:
from num2words import num2words

def numbers_to_words(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """
    Convert all numbers in a text column into words using num2words.

    Args:
        df (pl.DataFrame): Input DataFrame.
        column_name (str): Name of the text column to process.
        lang (str): Language code (e.g., 'en' or 'fr').

    Returns:
        pl.DataFrame: New DataFrame with numbers replaced by words.
    """
    def convert_numbers(text: str) -> str:
        if not isinstance(text, str) or not text.strip():
            return text
        # Replace every number with its text version
        return re.sub(r'\b\d+\b', lambda m: num2words(int(m.group())), text)

    df_converted = df.with_columns(
        pl.col(column_name).map_elements(convert_numbers).alias(column_name)
    )

    return df_converted

In [26]:
# Example of use
df_without_number = numbers_to_words(df_no_special_character, "review")
print(df_without_number.head())

shape: (5, 5)
┌───────────┬────────────────────────────┬───────────────────────────┬─────────────┬───────────────┐
│ id_review ┆ review                     ┆ original_dataset          ┆ original_id ┆ service_type  │
│ ---       ┆ ---                        ┆ ---                       ┆ ---         ┆ ---           │
│ i64       ┆ str                        ┆ str                       ┆ i64         ┆ str           │
╞═══════════╪════════════════════════════╪═══════════════════════════╪═════════════╪═══════════════╡
│ 671744    ┆ We are still currently     ┆ data_tripadvisor_hotel_re ┆ 73068024    ┆ accommodation │
│           ┆ finishi…                   ┆ views                     ┆             ┆               │
│ 768293    ┆ Abbiamo soggiornato per    ┆ data_tripadvisor_hotel_re ┆ 117117861   ┆ accommodation │
│           ┆ three …                    ┆ views                     ┆             ┆               │
│ 542541    ┆ After the glowing report I ┆ data_tripadvisor_hotel_re ┆ 232486

## Correction of spelling error

### Tests/ Expertimentation

Two methods with two libraries are available. Spellchecker seems to perform slightly better.

In [27]:
from textblob import TextBlob

def textblob_correct(text):
    blob = TextBlob(text)
    return str(blob.correct())

print(textblob_correct("I liek eat appels"))

I like eat appeals


In [4]:
from spellchecker import SpellChecker

spelling = SpellChecker()

def spelling_checks(text):
    correct_result = []
    typo_words = spelling.unknown(text.split())
    for word in text.split():
        if word in typo_words:
            correct_result.append(spelling.correction(word))
        else:
            correct_result.append(word)
    return " ".join(correct_result)

print(spelling_checks("I liek eat appels"))

I like eat apples


### Generalization (test/ ne fonctionne pas encore)

In [5]:
def correct_spelling(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """
    Apply spell correction on a specified text column of a Polars DataFrame.

    Args:
        df (pl.DataFrame): Input DataFrame.
        column_name (str): Name of the column containing text to correct.

    Returns:
        pl.DataFrame: New DataFrame with corrected text in the specified column.
    """
    spell = SpellChecker()

    def fix_text(text: str) -> str:
        if not isinstance(text, str) or not text.strip():
            return text  # ignore empty or non-string values

        words = text.split()
        typo_words = spell.unknown(words)
        corrected = [
            spell.correction(w) if w in typo_words else w
            for w in words
        ]
        return " ".join(corrected)

    # Apply the correction to each row of the target column
    df_corrected = df.with_columns(
        pl.col(column_name).map_elements(fix_text).alias(column_name)
    )

    return df_corrected

In [25]:
import polars as pl
from spellchecker import SpellChecker
from tqdm import tqdm  # progress bar

def correct_spelling_optimized(df: pl.DataFrame, column_name: str, batch_size: int = 10000) -> pl.DataFrame:
    """
    Apply fast spell correction on a text column of a large Polars DataFrame.
    Uses caching and batch processing for performance.

    Args:
        df (pl.DataFrame): Input Polars DataFrame.
        column_name (str): Column containing text to correct.
        batch_size (int): Number of rows to process per batch (default=10,000).

    Returns:
        pl.DataFrame: DataFrame with corrected text in the specified column.
    """
    spell = SpellChecker()
    cache = {}

    def fix_word(word: str) -> str:
        """Return cached or corrected version of a single word."""
        if word in cache:
            return cache[word]
        if word in spell:
            cache[word] = word
        else:
            cache[word] = spell.correction(word)
        return cache[word]

    def fix_text(text: str) -> str:
        """Apply correction to an entire review."""
        if not isinstance(text, str) or not text.strip():
            return text
        words = text.split()
        return " ".join(fix_word(w) for w in words)

    # Batch processing
    corrected_reviews = []
    num_rows = len(df)

    for i in tqdm(range(0, num_rows, batch_size), desc="Spell-checking in batches"):
        batch = df.slice(i, batch_size)
        corrected_batch = [fix_text(t) for t in batch[column_name]]
        corrected_reviews.extend(corrected_batch)

    # Return new DataFrame with corrected text
    df_corrected = df.with_columns(
        pl.Series(name=column_name, values=corrected_reviews)
    )

    return df_corrected


In [27]:
# Example of use
df_correct_spelling = correct_spelling_optimized(df_without_number.head(10), "review")
print(df_correct_spelling.head())

Spell-checking in batches:   0%|          | 0/1 [00:01<?, ?it/s]


TypeError: sequence item 25: expected str instance, NoneType found