# Pipeline preprocessing test

Once the anomalies have been handled and a clean dataset has been produced, the next step is to build a pipeline that extracts three content-based datasets from the full dataset:
- pets dataset
- children dataset
- disability dataset

To do this, several steps must be carried out:
- remove stop words
- tokenize the text
- lemmatize the text
- extract keywords

In [85]:
import nltk
import polars as pl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import spacy
from tqdm import tqdm
from typing import Dict, List
import concurrent.futures

In [86]:
# Global Variables

# Worker count used by the main cell, which forwards it as `n_process`
# to the keyword-extraction step.
NUM_THREADS = 4

## Stop word removal and tokenization

In [None]:


# NLTK resources needed by remove_stopwords: the English stop-word list and
# the Punkt sentence models that word_tokenize relies on. Recent NLTK
# releases load "punkt_tab" instead of the pickled "punkt" models, so both
# are requested; with quiet=True a failed download does not raise.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# Negation words that are kept by remove_stopwords even when they appear in
# the stop-word list, so that the polarity of a review is not lost.
NEGATIONS = {
    "not", "no", "never", "none", "cannot", "can't", "don't",
    "doesn't", "isn't", "wasn't", "weren't", "wouldn't", "shouldn't", "couldn't",
}

def remove_stopwords(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """
    Lower-case, tokenize and strip English stop words from a text column.

    Only tokens made of ASCII letters and hyphens are kept. Negation words
    listed in NEGATIONS are preserved even if they are stop words, and the
    contraction suffix "n't" produced by the tokenizer is rewritten to "not"
    so that contracted negations ("don't", "isn't", ...) are not lost.

    Args:
        df (pl.DataFrame): Input dataframe.
        column_name (str): Name of the text column to clean.

    Returns:
        pl.DataFrame: Copy of ``df`` where ``column_name`` holds the cleaned,
        space-joined tokens. Non-string values are returned unchanged.
    """
    stop_words = set(stopwords.words("english"))
    # Compiled once instead of once per token.
    word_pattern = re.compile(r"^[A-Za-z-]+$")

    def clean_text(text: str) -> str:
        if not isinstance(text, str):
            return text

        kept = []
        for token in word_tokenize(text.lower()):
            # word_tokenize splits contractions ("don't" -> "do" + "n't").
            # "n't" can never pass the letters-only filter below, so it is
            # normalized to "not" here to keep the negation.
            if token == "n't":
                kept.append("not")
                continue
            # Keep words made of letters or hyphens, and preserve negations.
            if word_pattern.match(token) and (token not in stop_words or token in NEGATIONS):
                kept.append(token)
        return " ".join(kept)

    return df.with_columns(
        pl.col(column_name).map_elements(clean_text, return_dtype=pl.Utf8).alias(column_name)
    )


## Lemmatization

In [88]:
import spacy
import polars as pl
from typing import List, Dict
from tqdm import tqdm

# Load the spaCy "en_core_web_sm" pipeline once for the whole notebook;
# lemmatize_and_clean_texts streams texts through it with nlp.pipe.
# The "ner" and "parser" components are disabled for faster processing.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def lemmatize_and_clean_texts(
    texts: List[str],
    batch_size: int = 2000,
    n_process: int = 4
) -> List[str]:
    """
    Return the lemmatized form of every text, in input order.

    Each text is run through the module-level spaCy pipeline (``nlp.pipe``),
    its token lemmas are joined with single spaces, and the result is
    post-processed for keyword matching:

    - " - " is collapsed to "-" so hyphenated keywords such as
      "pet-friendly" are kept as one unit;
    - leading and trailing whitespace is stripped.

    Entries that are not strings are processed as empty strings.

    Args:
        texts (List[str]): Texts to lemmatize.
        batch_size (int): Batch size forwarded to ``nlp.pipe``.
        n_process (int): Number of processes forwarded to ``nlp.pipe``.

    Returns:
        List[str]: One lemmatized string per input text.
    """
    prepared = ["" if not isinstance(item, str) else item for item in texts]
    docs = nlp.pipe(prepared, batch_size=batch_size, n_process=n_process)
    return [
        " ".join([tok.lemma_ for tok in doc]).replace(" - ", "-").strip()
        for doc in docs
    ]


def lemmatize_column_fast(
    df: pl.DataFrame, 
    col_name: str, 
    new_col_name: str = None, 
    chunk_size: int = 5000, 
    n_process: int = 4
) -> pl.DataFrame:
    """
    Add a lemmatized copy of a text column to the dataframe.

    The column is processed in slices of ``chunk_size`` rows, each slice
    being handed to ``lemmatize_and_clean_texts``; a tqdm bar reports
    progress per slice.

    Args:
        df (pl.DataFrame): Input dataframe.
        col_name (str): Name of the text column to lemmatize.
        new_col_name (str): Name of the output column; defaults to
            "<col_name>_lemmatized" when not given.
        chunk_size (int): Number of rows lemmatized per slice.
        n_process (int): Number of processes passed to the lemmatizer.

    Returns:
        pl.DataFrame: ``df`` with the lemmatized column added.
    """
    target_name = new_col_name if new_col_name else f"{col_name}_lemmatized"
    values = df.select(col_name).to_series().to_list()

    lemmas = []
    slice_starts = range(0, len(values), chunk_size)
    for start in tqdm(slice_starts, desc=f"Lemmatizing {col_name}"):
        stop = start + chunk_size
        lemmas += lemmatize_and_clean_texts(values[start:stop], n_process=n_process)

    lemma_series = pl.Series(name=target_name, values=lemmas)
    return df.with_columns(lemma_series)


def lemmatize_categories(
    categories: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """
    Lemmatize every keyword list of a category dictionary.

    Each list is passed to ``lemmatize_and_clean_texts`` with a small batch
    size and a single process, so keywords receive the same lemmatization
    and cleaning as the review texts.

    Args:
        categories (Dict[str, List[str]]): {category: [keywords]}.

    Returns:
        Dict[str, List[str]]: {category: [lemmatized keywords]}, with the
        same keys and key order as the input.
    """
    lemmatized: Dict[str, List[str]] = {}
    for name, words in categories.items():
        lemmatized[name] = lemmatize_and_clean_texts(words, batch_size=100, n_process=1)
    return lemmatized


## Keywords extraction

In [89]:
import re
import concurrent.futures
from typing import Dict, List
import polars as pl
from tqdm import tqdm

def _compile_keyword_pattern(keyword: str) -> "re.Pattern[str]":
    """Compile the case-insensitive, whole-word regex for one keyword or phrase."""
    normalized = keyword.strip().replace(" - ", "-")
    body = re.escape(normalized)
    # A hyphen in the keyword may appear as a hyphen or as whitespace in the
    # text, and a space may be any run of whitespace.
    body = body.replace("\\-", "[-\\s]").replace("\\ ", "\\s+")
    # Lookarounds act as word boundaries for single words AND phrases, so
    # "hot dog" no longer matches inside "shot dogma".
    return re.compile(r"(?<!\w)" + body + r"(?!\w)", flags=re.IGNORECASE)


def extract_all_categories(
    df: pl.DataFrame,
    col_name: str,
    categories: Dict[str, List[str]],
    exclusions: Dict[str, List[str]] = None,
    n_process: int = 4,
    id_col: str = "id"
) -> pl.DataFrame:
    """
    Extract reviews matching category keywords, keeping a review if at least one
    keyword remains after temporarily removing the category's exclusion phrases.

    Keywords and exclusion phrases are matched case-insensitively as whole
    words/phrases. Blank keywords and blank exclusion phrases are ignored.
    A review can appear once per category it matches.

    Args:
        df (pl.DataFrame): Input dataframe.
        col_name (str): Column containing the text.
        categories (Dict[str, List[str]]): {category: [lemmatized keywords]}.
        exclusions (Dict[str, List[str]]): {category: [phrases to exclude]}.
        n_process (int): Number of worker threads (one task per category).
        id_col (str): Column containing unique IDs.

    Returns:
        pl.DataFrame: DataFrame with columns [<id_col>, review, keywords_found,
        category]. The id column keeps the dtype it has in ``df``; rows are
        grouped by category in the order of ``categories`` and follow the
        input row order within a category.
    """
    exclusions = exclusions or {}

    # Materialize the (id, text) pairs once; worker threads only read them.
    rows = df.select([id_col, col_name]).rows()

    def process_category(category: str, keywords: List[str], excluded_phrases: List[str]):
        # Compile every pattern once per category instead of once per row.
        keyword_patterns = [
            (kw, _compile_keyword_pattern(kw))
            for kw in keywords
            if isinstance(kw, str) and kw.strip()
        ]
        exclusion_patterns = [
            _compile_keyword_pattern(ex)
            for ex in excluded_phrases
            if isinstance(ex, str) and ex.strip()
        ]

        results = []
        for review_id, text in rows:
            if not isinstance(text, str):
                continue

            # Temporarily remove the exclusion phrases from a working copy.
            temp_text = text
            for pattern in exclusion_patterns:
                temp_text = pattern.sub(" ", temp_text)

            # Keep the review if at least one keyword still matches.
            matched_keywords = [kw for kw, pattern in keyword_patterns if pattern.search(temp_text)]

            if matched_keywords:
                results.append((review_id, text, ", ".join(matched_keywords), category))

        return results

    # Parallel processing: one task per category.
    results_by_category = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_process) as executor:
        futures = {
            executor.submit(process_category, cat, kws, exclusions.get(cat) or []): cat
            for cat, kws in categories.items()
        }
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Keyword extraction"):
            results_by_category[futures[fut]] = fut.result()

    # Reassemble in the order of `categories` so the output does not depend
    # on which thread happened to finish first.
    all_results = [row for cat in categories for row in results_by_category[cat]]

    # A single construction path covers both the empty and non-empty case and
    # keeps the id dtype of the source column instead of assuming Int64.
    df_filtered = pl.DataFrame(
        {
            id_col: [r[0] for r in all_results],
            "review": [r[1] for r in all_results],
            "keywords_found": [r[2] for r in all_results],
            "category": [r[3] for r in all_results],
        },
        schema={
            id_col: df.schema[id_col],
            "review": pl.Utf8,
            "keywords_found": pl.Utf8,
            "category": pl.Utf8,
        },
    )

    if all_results:
        print(f"Extracted {df_filtered.shape[0]} matching reviews across {len(categories)} categories.")
    return df_filtered


## Main

In [90]:
import polars as pl

# Hand-written sample reviews used to build a small test dataframe. They are
# grouped by category, each with sentences expected to match and sentences
# that should be excluded.
data = [
    # PET - true positives
    "I love traveling with my dog",
    "I really like travel with my cat and my child",
    "Our cat always comes with us on trips",
    "Pet-friendly hotels make our vacation easier",
    "We brought our hamster along",
    "Birds are not allowed, but our parrot joined",
    "The hot dog was amazing but my dog was very tired so we decided to go home",
    
    # PET - false positives to exclude
    "I ate a hot dog at the festival",
    "Investing time in my pet project is fun",
    "The doctor recommended a PET scan",
    "The patient had a cat scan yesterday",
    "Dogecoin prices are rising fast",
    
    # CHILD - true positives
    "We need a crib for our baby",
    "Childcare services at the hotel are great",
    "Kids menu available at the restaurant",
    "High chair provided in our room",
    "Playground nearby is perfect for toddlers",
    
    # CHILD - false positives to exclude
    "I remember my childhood fondly",
    "Use the child lock on the door",
    "Childproof cabinets are essential",
    "Childhood memories last forever",
    "Child's play area is empty",
    
    # HANDICAP - true positives
    "Wheelchair access is very important",
    "Elevator and ramps help disabled travelers",
    "Accessible toilet in the lobby",
    "Hearing aid support available",
    "Visual impairment guide for tourists",
    
    # HANDICAP - false positives to exclude
    "Handicap parking spots are limited",
    "Join the handicap sports club",
    "Handicap insurance coverage is sufficient",
    "Disabled access forum was informative",
    "Handicap rating system is confusing",
    " Sport and weight lift"
]

# One row per sample review, with a 1-based integer id.
df_test = pl.DataFrame({
    "id": range(1, len(data) + 1),
    "review": data
})

print(df_test)


shape: (33, 2)
┌─────┬─────────────────────────────────┐
│ id  ┆ review                          │
│ --- ┆ ---                             │
│ i64 ┆ str                             │
╞═════╪═════════════════════════════════╡
│ 1   ┆ I love traveling with my dog    │
│ 2   ┆ I really like travel with my c… │
│ 3   ┆ Our cat always comes with us o… │
│ 4   ┆ Pet-friendly hotels make our v… │
│ 5   ┆ We brought our hamster along    │
│ …   ┆ …                               │
│ 29  ┆ Join the handicap sports club   │
│ 30  ┆ Handicap insurance coverage is… │
│ 31  ┆ Disabled access forum was info… │
│ 32  ┆ Handicap rating system is conf… │
│ 33  ┆  Sport and weight lift          │
└─────┴─────────────────────────────────┘


In [91]:
if __name__ == "__main__":

    # Work on the first 5000 rows of the booking dataset only.
    df = pl.read_csv("../../data/original/dataset/data_booking.csv").head(5000)
    nb_process = NUM_THREADS
    name_column = "review"
    output_path = "../../data/processed/booking_test_key_word_test_a_supprimer.csv"

    # Keywords that assign a review to a category.
    categories = {
        "handicap": [
            "handicap", "wheelchair", "accessible", "braille", "ramp", "lift", "elevator",
            "disabled", "barrier-free", "accessible toilet", "toilet accessible",
            "mobility aid", "adapted", "hearing aid", "visual impairment", "accessible entrance"
        ],
        "pet": [
            "dog", "cat", "pet", "animal", "rabbit", "hamster", "ferret", "bird",
            "pet-friendly", "animals allowed", "dog-friendly", "cat-friendly",
            "pet welcome", "pup", "dog bowl"
        ],
        "child": [
            "child", "baby", "kid", "stroller", "son", "daughter", "toddler",
            "infant", "playground", "high chair", "changing table", "family-friendly",
            "childcare", "kids menu", "baby seat", "family","baby bed", "cot", "crib"
        ]
    }

    # Phrases removed from a review before keyword matching, so that their
    # words do not count as keyword hits for the category.
    exclusions = {
        "pet": [
            "hot dog",
            "dogecoin",
            "corn dog",
            "dog day",
            "pet project",
            "cat scan",
            "pet scan",
        ],
        "child": [
            "childhood",
        ],
        "handicap": [
            "weight lift",
        ]
    }

    # Step 1: tokenize and remove stop words from the review column.
    df_clean = remove_stopwords(df, name_column)

    # Step 2: lemmatize keywords and exclusion phrases the same way as the
    # reviews, so they are compared in the same form.
    lemmatized_categories = lemmatize_categories(categories)
    print("\n=== Lemmatized Categories ===")

    lemmatized_exclusions = lemmatize_categories(exclusions)
    print("\n=== Lemmatized Exclusions ===")

    # Step 3: lemmatize the reviews. Uses the configured worker count rather
    # than a second hard-coded value.
    df_lem = lemmatize_column_fast(df_clean, name_column, n_process=nb_process)
    print("\n=== DataFrame with Lemmatized Texts ===")

    # Step 4: keep the reviews matching at least one category keyword.
    df_keywords = extract_all_categories(
        df_lem,
        col_name=f"{name_column}_lemmatized",
        categories=lemmatized_categories,
        exclusions=lemmatized_exclusions,
        n_process=nb_process
    )

    print("\n=== Filtered Reviews ===")
    print(df_keywords.head(10))

    # Save the result
    df_keywords.write_csv(output_path)
    print(f"DataFrame saved to {output_path}")



=== Lemmatized Categories ===

=== Lemmatized Exclusions ===


Lemmatizing review: 100%|██████████| 1/1 [00:23<00:00, 23.62s/it]



=== DataFrame with Lemmatized Texts ===


Keyword extraction: 100%|██████████| 3/3 [00:01<00:00,  2.41it/s]

Extracted 431 matching reviews across 3 categories.

=== Filtered Reviews ===
shape: (10, 4)
┌─────┬─────────────────────────────────┬────────────────┬──────────┐
│ id  ┆ review                          ┆ keywords_found ┆ category │
│ --- ┆ ---                             ┆ ---            ┆ ---      │
│ i64 ┆ str                             ┆ str            ┆ str      │
╞═════╪═════════════════════════════════╪════════════════╪══════════╡
│ 59  ┆ location good accessible many … ┆ accessible     ┆ handicap │
│ 158 ┆ room superb enjoy living room … ┆ elevator       ┆ handicap │
│ 170 ┆ conveniently locate beautiful … ┆ accessible     ┆ handicap │
│ 181 ┆ excellent location quiet still… ┆ elevator       ┆ handicap │
│ 226 ┆ excellent location bar open am… ┆ lift           ┆ handicap │
│ 284 ┆ clean compact double bed room … ┆ accessible     ┆ handicap │
│ 332 ┆ location cute design theme roo… ┆ lift           ┆ handicap │
│ 337 ┆ apartment close gold coast con… ┆ elevator       ┆ handicap




In [92]:
# Inspect the lemmatized keyword lists produced by the main cell.
print(lemmatized_categories)

{'handicap': ['handicap', 'wheelchair', 'accessible', 'braille', 'ramp', 'lift', 'elevator', 'disable', 'barrier-free', 'accessible toilet', 'toilet accessible', 'mobility aid', 'adapt', 'hear aid', 'visual impairment', 'accessible entrance'], 'pet': ['dog', 'cat', 'pet', 'animal', 'rabbit', 'hamster', 'ferret', 'bird', 'pet-friendly', 'animal allow', 'dog-friendly', 'cat-friendly', 'pet welcome', 'pup', 'dog bowl'], 'child': ['child', 'baby', 'kid', 'stroller', 'son', 'daughter', 'toddler', 'infant', 'playground', 'high chair', 'change table', 'family-friendly', 'childcare', 'kids menu', 'baby seat', 'family', 'baby bed', 'cot', 'crib']}


In [93]:
# Inspect the dataframe that now includes the lemmatized review column.
print(df_lem)

shape: (5_000, 18)
┌────────────┬────────────┬────────────┬────────────┬───┬───────────┬──────┬───────────┬───────────┐
│ review_tit ┆ review_sco ┆ review_hel ┆ guest_type ┆ … ┆ location_ ┆ id   ┆ review    ┆ review_le │
│ le         ┆ re         ┆ pful_votes ┆ ---        ┆   ┆ is_city_c ┆ ---  ┆ ---       ┆ mmatized  │
│ ---        ┆ ---        ┆ ---        ┆ str        ┆   ┆ enter     ┆ i64  ┆ str       ┆ ---       │
│ str        ┆ f64        ┆ i64        ┆            ┆   ┆ ---       ┆      ┆           ┆ str       │
│            ┆            ┆            ┆            ┆   ┆ i64       ┆      ┆           ┆           │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════════╪══════╪═══════════╪═══════════╡
│ null       ┆ 7.0        ┆ 0          ┆ Couple     ┆ … ┆ 1         ┆ 1    ┆ beautiful ┆ beautiful │
│            ┆            ┆            ┆            ┆   ┆           ┆      ┆ bathroom  ┆ bathroom  │
│            ┆            ┆            ┆            ┆   ┆           ┆   

In [94]:
# Show the first 15 rows after stop-word removal, converted to pandas for display.
print(df_clean.head(15).to_pandas())

                                         review_title  review_score  \
0                                                None           7.0   
1                     Fantastic location and  service           8.0   
2                              Great central location           7.0   
3                     Great place in a great location           8.0   
4                                      Fantastic find          10.0   
5                                                None           8.0   
6                                                None           7.0   
7                                                None          10.0   
8                                        Perfect stay          10.0   
9                              Annual happy vacation.           9.0   
10  Would definitely recommend for families. And w...          10.0   
11                                               None          10.0   
12   Book this apartment in Palermo:)It is worth it:)          10.0   
13  Gr