# Pipeline preprocessing

Once you have managed the anomalies and created a clean dataset, you now need to create a pipeline that allows you to extract three datasets based on content from a total dataset:
- pets dataset
- children dataset
- disability dataset

To do this, several steps must be carried out:
- stop word removal
- tokenize the text
- lemmatize the text
- extract keywords

In [1]:
import nltk
import polars as pl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from tqdm import tqdm
from typing import Dict, List

In [2]:
df = pl.read_csv("../data/processed/all_reviews.csv")

## Stop word removal andd tokenization

In [3]:
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)

def remove_stopwords(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """
    Remove English stopwords from a given text column in a Polars DataFrame.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame containing text data.
    column_name : str
        Name of the column containing the text to process.

    Returns
    -------
    pl.DataFrame
        A new DataFrame with stopwords removed from the specified column.
    """
    stop_words = set(stopwords.words("english"))

    def clean_text(text: str) -> str:
        if not isinstance(text, str):
            return text
        tokens = word_tokenize(text.lower())
        filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
        return " ".join(filtered)

    return df.with_columns(
        pl.col(column_name).map_elements(clean_text, return_dtype=pl.Utf8).alias(column_name)
    )


In [4]:
## Example of use
#df_clean = remove_stopwords(df, "review")

## Lemmatization

In [5]:
# Charger SpaCy (désactiver les composants inutiles pour plus de vitesse)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def lemmatize_texts(
    texts: List[str], 
    batch_size: int = 1000, 
    n_process: int = 4
) -> List[str]:
    """
    Lemmatize a list of texts using spaCy with multiprocessing.
    """
    clean_texts = [(t if isinstance(t, str) else "") for t in texts]
    lemmatized = []
    for doc in nlp.pipe(clean_texts, batch_size=batch_size, n_process=n_process):
        lemmatized.append(" ".join([token.lemma_ for token in doc]))
    return lemmatized


def lemmatize_column_fast(
    df: pl.DataFrame, 
    col_name: str, 
    new_col_name: str = None, 
    chunk_size: int = 5000, 
    n_process: int = 4
) -> pl.DataFrame:
    """
    Lemmatize a Polars DataFrame column efficiently in batches with multiprocessing.
    """
    new_col_name = new_col_name or f"{col_name}_lemmatized"
    texts = df.select(col_name).to_series().to_list()
    lemmatized_chunks = []

    for i in tqdm(range(0, len(texts), chunk_size), desc=f"Lemmatizing {col_name}"):
        chunk = texts[i:i + chunk_size]
        lemmatized_chunks.extend(lemmatize_texts(chunk, n_process=n_process))

    return df.with_columns(pl.Series(name=new_col_name, values=lemmatized_chunks))


def lemmatize_categories(
    categories: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """
    Lemmatize all keywords in category dictionary.
    """
    return {
        category: lemmatize_texts(keywords, batch_size=100, n_process=1)
        for category, keywords in categories.items()
    }

## Keywords extraction

In [None]:
import re
import concurrent.futures
from tqdm import tqdm
import polars as pl
from typing import Dict, List


def extract_all_categories(
    df: pl.DataFrame,
    col_name: str,
    categories: Dict[str, List[str]],
    n_process: int = 4
) -> pl.DataFrame:
    """
    Extract reviews matching any category keywords using regex filtering.
    Runs extraction for all categories in parallel.
    
    Parameters
    ----------
    df : pl.DataFrame
        The input dataframe.
    col_name : str
        Column name containing the (lemmatized) text.
    categories : dict[str, list[str]]
        Dictionary of category → list of keywords.
    n_process : int
        Number of threads to use.
    
    Returns
    -------
    pl.DataFrame
        DataFrame with columns ["text", "keywords_found", "category"].
    """

    texts = df.select(col_name).to_series().to_list()

    # --- Fonction interne pour traiter UNE catégorie ---
    def process_category(category, keywords):
        regex = r"\b(" + "|".join(
            re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+") 
            for kw in keywords
        ) + r")\b"

        results = []
        for text in texts:
            if not isinstance(text, str):
                continue
            if re.search(regex, text, flags=re.IGNORECASE):
                found = [
                    kw for kw in keywords 
                    if re.search(r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b", text, flags=re.IGNORECASE)
                ]
                results.append((text, ", ".join(found), category))
        return results

    # --- Lancer en parallèle sur toutes les catégories ---
    all_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_process) as executor:
        futures = {
            executor.submit(process_category, cat, kws): cat
            for cat, kws in categories.items()
        }
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Keyword extraction"):
            cat_results = fut.result()
            all_results.extend(cat_results)

    if not all_results:
        print("No matches found for any category.")
        return pl.DataFrame(schema={"text": pl.Utf8, "keywords_found": pl.Utf8, "category": pl.Utf8})

    # --- Convertir en DataFrame Polars ---
    df_filtered = pl.DataFrame(
        {
            "text": [r[0] for r in all_results],
            "keywords_found": [r[1] for r in all_results],
            "category": [r[2] for r in all_results],
        }
    )

    print(f"Extracted {df_filtered.shape[0]} matching reviews across {len(categories)} categories.")
    return df_filtered


## Main

In [15]:
if __name__ == "__main__":

    df = pl.read_csv("../data/processed/all_reviews.csv")
    df = df.head(1000)
    nb_process = 4
    name_column = "review"

    categories = {
        "handicap": [
            "handicap", "wheelchair", "accessible", "braille", "ramp", "lift", "elevator",
            "disabled", "barrier-free", "accessible toilet", "toilet accessible",
            "mobility aid", "adapted", "hearing aid", "visual impairment", "accessible entrance"
        ],
        "pets": [
            "dog", "cat", "pet", "animal", "rabbit", "hamster", "ferret", "bird",
            "pet-friendly", "animals allowed", "dog-friendly", "cat-friendly",
            "pet welcome", "pup", "dog bowl"
        ],
        "children": [
            "child", "baby", "kid", "stroller", "son", "daughter", "toddler",
            "infant", "playground", "high chair", "changing table", "family-friendly",
            "childcare", "kids menu", "baby seat", "family","baby bed", "cot", "crib"
        ]
    }

    df_clean = remove_stopwords(df, name_column)

    lemmatized_categories = lemmatize_categories(categories)
    print("\n=== Lemmatized Categories ===")

    df_lem = lemmatize_column_fast(df_clean, name_column, n_process=nb_process)
    print("\n=== DataFrame with Lemmatized Texts ===")

    df_keywords = extract_all_categories(
        df_lem, 
        col_name = f"{name_column}_lemmatized",
        categories=lemmatized_categories,
        n_process=nb_process  
    )

    print("\n=== Filtered Reviews ===")
    print(df_keywords)



=== Lemmatized Categories ===


Lemmatizing review: 100%|██████████| 1/1 [00:36<00:00, 36.00s/it]



=== DataFrame with Lemmatized Texts ===


Keyword extraction: 100%|██████████| 3/3 [00:00<00:00, 1389.46it/s]


✅ Extracted 116 matching reviews across 3 categories.

=== Filtered Reviews ===
shape: (116, 3)
┌─────────────────────────────────┬────────────────┬──────────┐
│ text                            ┆ keywords_found ┆ category │
│ ---                             ┆ ---            ┆ ---      │
│ str                             ┆ str            ┆ str      │
╞═════════════════════════════════╪════════════════╪══════════╡
│ walk dog go pueta vallarta rea… ┆ dog            ┆ pets     │
│ awful buy year old birthday th… ┆ animal         ┆ pets     │
│ loud loud loud take listen buy… ┆ dog            ┆ pets     │
│ drink hot chocolate watch snow… ┆ dog            ┆ pets     │
│ gentle leader great product fi… ┆ dog            ┆ pets     │
│ …                               ┆ …              ┆ …        │
│ eat pizza ice cream family      ┆ family         ┆ children │
│ lid fold back way lid lift imm… ┆ lift           ┆ handicap │
│ soccer ride bike weight lift    ┆ lift           ┆ handicap │
│ still 