# Pipeline preprocessing

Once the anomalies have been handled and a clean dataset has been produced, the next step is to build a pipeline that extracts three content-based datasets from the full dataset:
- pets dataset
- children dataset
- disability dataset

To do this, several steps must be carried out:
- remove stop words
- tokenize the text
- lemmatize the text
- extract keywords

In [11]:
import nltk
import polars as pl
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import re
import spacy
from tqdm import tqdm
from typing import Dict, List
import concurrent.futures
import os
from dotenv import load_dotenv
import logging
import json

In [12]:
# Simple logger for pipeline execution: INFO level, each message prefixed
# with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [13]:
# Global variables
# Read settings from the .env file located one directory above the notebook.
load_dotenv(dotenv_path="../.env")
# Worker count handed to the pipeline. When NUM_THREADS is missing or empty,
# int(None) / int("") would raise an opaque error, so fall back to 4, the
# same default the pipeline functions use for their n_process parameter.
NUM_THREAD = int(os.environ.get("NUM_THREADS") or 4)
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-11-20 23:50:17,676 - INFO - NUM_THREAD fixed to 8


In [14]:
# Download the NLTK stop word corpus and build the English stop word set.
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))
# Print the set for a quick visual inspection.
print(stop_words)

{'only', 'under', 'during', 't', "he'll", 'more', 'into', "he'd", 'those', 'be', 'our', "wasn't", 'his', "won't", 'wouldn', 'down', 'their', 'did', "they'll", "you're", 'needn', 'myself', "we'd", 'had', 'why', 'o', 'ain', 'before', 'itself', 'when', 'on', 'not', 'd', 'don', 'were', 'himself', 's', 'have', 'theirs', 'yourselves', 'below', 'haven', 'its', 'an', 'own', "shouldn't", 'your', 'yours', 'was', 'will', "we've", 'isn', 'weren', 'once', "you'd", 'just', "hadn't", 'now', 'other', 'all', "don't", 'from', 'ours', 've', 'above', 'that', 'ma', 'through', 'while', "i've", 'after', 'because', 'too', "weren't", 'which', "he's", 'such', 'wasn', 'being', 'shan', "you'll", 'whom', "it'd", "i'm", 'again', 'hasn', 'further', "they're", 'she', 'over', "doesn't", 'here', 'them', "it's", 'any', 'where', 'until', 'a', 'it', 'with', 'her', 'been', 'hers', 'there', 'doesn', 'm', 'by', 'does', 'nor', "we're", "haven't", 'he', 'if', "i'd", 'or', "needn't", 'very', 'my', 'having', 'most', "they'd", 'm

## Stop word removal and tokenization

In [15]:
nltk.download("stopwords", quiet=True)

# Tokenizer that does not split contractions
# (preserve_case=False is passed; the token filter in this cell only accepts a-z)
tokenizer = TweetTokenizer(preserve_case=False)

# Stop words that must be kept anyway: negations, auxiliaries,
# personal pronouns and common contractions.
exceptions_to_keep = {
    "not","no","nor","aren't","can't","couldn't","didn't","doesn't","don't",
    "hadn't","hasn't","haven't","isn't","mightn't","mustn't","shouldn't",
    "weren't","won't","wouldn't",
    "am","is","are","was","were","have","has","had",
    "i","we","you","he","she","they",
    "i'm","i've","you're","he's","she's","it's","they're","they've",
    "we're","we've","who's","what's","where's","that's","there's",
}
# Lowercase every entry (the literals above are already lowercase; safeguard only).
exceptions_to_keep = {w.lower() for w in exceptions_to_keep}


def is_valid_token(token: str) -> bool:
    """Tell whether *token* is a lowercase word that should be kept.

    A token is accepted when it consists of runs of ``a-z`` letters,
    optionally joined by single hyphens or apostrophes (``don't``,
    ``state-of-the-art``). Anything else (digits, punctuation, uppercase,
    empty strings) is rejected.
    """
    # Stray clitics produced by tokenization, such as 's, 're or 'm
    is_stray_clitic = token.startswith("'")
    if is_stray_clitic:
        return False

    # Letters, with inner apostrophes (contractions) or hyphens allowed
    word_shape = re.match(r"^[a-z]+(?:[-'][a-z]+)*$", token)
    return word_shape is not None


def remove_stopwords(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """Return *df* with ``column_name`` rewritten without English stop words.

    Each text is tokenized with the module-level ``tokenizer``; a token is
    kept when ``is_valid_token`` accepts it and it is either not an NLTK
    English stop word or listed in ``exceptions_to_keep``. The surviving
    tokens are joined back with single spaces. The column is replaced in the
    returned frame under the same name.
    """
    english_stops = set(stopwords.words("english"))

    def _strip_stops(value: str) -> str:
        # Anything that is not a string is handed back untouched.
        if not isinstance(value, str):
            return value

        # Normalise curly apostrophes to straight ones
        normalised = value.replace("’", "'")

        kept = []
        for word in tokenizer.tokenize(normalised):
            if not is_valid_token(word):
                continue
            # Drop stop words unless they are explicitly protected
            if word in english_stops and word not in exceptions_to_keep:
                continue
            kept.append(word)

        return " ".join(kept)

    cleaned_column = pl.col(column_name).map_elements(_strip_stops, return_dtype=pl.Utf8)
    return df.with_columns(cleaned_column)


def remove_stopwords_categories(dico: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """
    Remove English stopwords from every list of keywords in a dictionary.

    Each keyword goes through the same cleaning as the review texts
    (tokenization, ``is_valid_token`` filter, stop word removal with the
    ``exceptions_to_keep`` safeguard). Keywords that end up empty after
    cleaning are dropped and reported with a warning: an empty keyword would
    otherwise be turned into a pattern that matches every text during
    keyword extraction.

    Parameters
    ----------
    dico : Dict[str, List[str]]
        Input dictionary containing lists of keywords. Values that are not
        lists are copied as they are; list items that are not strings are
        kept unchanged.

    Returns
    -------
    Dict[str, List[str]]
        A new dictionary with stopwords removed from each list.
    """

    stop_words = set(stopwords.words("english"))

    def clean_text(text: str) -> str:
        if not isinstance(text, str):
            return text

        # Normalise curly apostrophes to straight ones
        text = text.replace("’", "'")

        tokens = tokenizer.tokenize(text)

        filtered = [
            tok for tok in tokens
            if is_valid_token(tok)
            and (tok not in stop_words or tok in exceptions_to_keep)
        ]

        return " ".join(filtered)

    new_dico = {}

    for key, lst in dico.items():
        if not isinstance(lst, list):
            new_dico[key] = lst
            continue

        kept = []
        for item in lst:
            cleaned = clean_text(item)
            # A keyword made only of stop words / invalid tokens becomes "";
            # keeping it would make it match any text downstream.
            if isinstance(cleaned, str) and not cleaned:
                logger.warning("Keyword %r of %r is empty after cleaning and was dropped", item, key)
                continue
            kept.append(cleaned)
        new_dico[key] = kept

    return new_dico


In [16]:
df_test = pl.DataFrame({
    "text": [
        # 1 — Simple stop words
        "I am going to the store and you are coming with me.",
        
        # 2 — Check of the exceptions (negations)
        "I do not like this movie because it is not good.",
        
        # 3 — Important contractions to keep
        "I’m happy because you’re here and that’s wonderful.",
        
        # 4 — Other contractions + stop words
        "They’re doing what they’ve planned, and it’s going well.",
        
        # 5 — Stop words that are verbs to keep (am, is, are…)
        "She is a doctor and he is an engineer but they are tired.",
        
        # 6 — Upper and lower case
        "I Think That You Shouldn’t Do This Because It Isn’t Safe.",
        
        # 7 — Mix of valid / invalid tokens (punctuation)
        "Well, this is — obviously — not what I expected!",
        
        # 8 — Non-alphabetic tokens to remove
        "I can't believe it, they’re 100% sure it's happening!!!",
        
        # 9 — Short sentence with exceptions
        "No, I don’t think so.",
        
        # 10 — Sentence with compound words (tests the hyphen handling of the token regex)
        "This state-of-the-art system isn’t working as expected.",
        
        # 11 — Edge case: empty string
        "",
        
        # 12 — Edge case: non-string value
        None
    ]
})

# --- Test of the function ---
df_cleaned = remove_stopwords(df_test, "text")
# Print every row
for i, row in enumerate(df_cleaned["text"]):
    print(f"{i}: {row}")

# Load categories
with open("../data/categories.json", "r", encoding="utf-8") as f:
    categories = json.load(f)

# Load exclusions
with open("../data/exclusions.json", "r", encoding="utf-8") as f:
    exclusions = json.load(f)

# Apply the same stop word cleaning to the exclusion phrases and the keywords
exclusions_cleaned= remove_stopwords_categories(exclusions)
categories_cleaned = remove_stopwords_categories(categories)

print(exclusions_cleaned)
print(categories_cleaned)

0: i am going store you are coming
1: i not like movie is not good
2: i'm happy you're that's wonderful
3: they're they've planned it's going well
4: she is doctor he is engineer they are tired
5: i think you shouldn't isn't safe
6: well is obviously not i expected
7: i can't believe they're sure it's happening
8: no i don't think
9: state-of-the-art system isn't working expected
10: 
11: None
{'handicap': ['weight lift', 'lift ticket', 'ski lift', 'elevator pitch', 'lift-off', 'lift bridge', 'blind eye', 'window blinds', 'window blind', 'blinds', 'blind date', 'blind mask'], 'pet': ['hot dog', 'hotdog', 'hotdogs', 'dogecoin', 'corn dog', 'dog day', 'pet project', 'cat scan', 'pet scan', 'pet peeve', 'pet hate', 'pet bottle', 'bulldog clip', 'dog-ear', 'dog-iron', 'dog bark', 'dog barking', 'dogs barking', 'bird displays', 'early bird', 'early birds', 'party animal', 'screech birds', 'screech bird', 'bird-eye', 'bird bbq', 'half bird', 'chicken bird', 'street dog', 'data dog', 'sneak d

## Lemmatization

In [17]:
# Load SpaCy (disable unnecessary components for faster performance).
# The resulting `nlp` object is the module-level pipeline read by
# lemmatize_and_clean_texts.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def lemmatize_and_clean_texts(
    texts: List[str],
    batch_size: int = 2000,
    n_process: int = 4
) -> List[str]:
    """
    Lemmatize a list of texts with the module-level spaCy pipeline and
    tidy the result for keyword matching.

    Entries that are not strings are treated as empty strings, so the
    returned list has one item per input item.

    Post-processing applied to every lemmatized text:
    - " - " is collapsed to "-" so hyphenated keywords such as
      "pet-friendly" stay in one piece.
    - Leading and trailing whitespace is stripped.
    """
    safe_inputs = ["" if not isinstance(item, str) else item for item in texts]
    docs = nlp.pipe(safe_inputs, batch_size=batch_size, n_process=n_process)
    return [
        " ".join([tok.lemma_ for tok in doc]).replace(" - ", "-").strip()
        for doc in docs
    ]


def lemmatize_column_fast(
    df: pl.DataFrame, 
    col_name: str, 
    new_col_name: str = None, 
    chunk_size: int = 50000, 
    n_process: int = 4
) -> pl.DataFrame:
    """
    Add a lemmatized copy of a text column to a Polars DataFrame.

    The column is read into a list and sent to ``lemmatize_and_clean_texts``
    in slices of ``chunk_size`` rows, with a progress bar over the slices.
    The result is attached as ``new_col_name`` (``"<col_name>_lemmatized"``
    when no name is given).
    """
    target_name = new_col_name or f"{col_name}_lemmatized"
    all_texts = df.select(col_name).to_series().to_list()

    lemmatized_texts = []
    slice_starts = range(0, len(all_texts), chunk_size)
    for start in tqdm(slice_starts, desc=f"Lemmatizing {col_name}"):
        current_slice = all_texts[start:start + chunk_size]
        lemmatized_texts += lemmatize_and_clean_texts(current_slice, n_process=n_process)

    lemmatized_series = pl.Series(name=target_name, values=lemmatized_texts)
    return df.with_columns(lemmatized_series)


def lemmatize_categories(
    categories: Dict[str, List[str]]
) -> Dict[str, List[str]]:
    """
    Lemmatize all keywords in category dictionary.

    Keywords that collapse to the same lemmatized form are de-duplicated.
    The first occurrence is kept and the original keyword order is
    preserved, so the result is identical from one run to the next (going
    through a ``set`` would give an order that changes between interpreter
    runs).

    Args:
        categories (Dict[str, List[str]]): {category: [keywords]}.

    Returns:
        Dict[str, List[str]]: {category: [unique lemmatized keywords]}.
    """
    return {
        category: list(dict.fromkeys(
            lemmatize_and_clean_texts(keywords, batch_size=100, n_process=1)
        ))
        for category, keywords in categories.items()
    }



## Keywords extraction

In [18]:
import re
import concurrent.futures
from typing import Dict, List
import polars as pl
from tqdm import tqdm

def extract_all_categories(
    df: pl.DataFrame,
    col_name: str,
    categories: Dict[str, List[str]],
    exclusions: Dict[str, List[str]] = None,
    n_process: int = 4,
    id_col: str = "id"
) -> pl.DataFrame:
    """
    Extract reviews matching category keywords, keeping reviews if at least one keyword remains
    after temporarily removing exclusion phrases.

    Matching is case-insensitive. Single-word keywords are matched on word
    boundaries; keywords containing a space or a hyphen are matched with any
    whitespace run (or a hyphen/whitespace character) between their parts.
    Blank keywords and blank exclusion phrases are ignored, because an empty
    pattern would match every text. Rows whose text is not a string are skipped.

    Args:
        df (pl.DataFrame): Input dataframe.
        col_name (str): Column containing the text.
        categories (Dict[str, List[str]]): {category: [lemmatized keywords]}.
        exclusions (Dict[str, List[str]]): {category: [phrases to exclude]}.
        n_process (int): Number of threads.
        id_col (str): Column containing unique IDs.

    Returns:
        pl.DataFrame: DataFrame with columns [id, review, keywords_found, category].
        One row per (review, category) match; ``keywords_found`` is the
        comma-separated list of matched keywords. The id column keeps the
        dtype it has in ``df``.
    """

    exclusions = exclusions or {}

    # Pull the two needed columns out once as plain Python lists; every
    # category scans the same rows.
    review_ids = df.select(id_col).to_series().to_list()
    review_texts = df.select(col_name).to_series().to_list()

    def normalize_keyword(kw: str) -> str:
        return kw.strip().replace(" - ", "-")

    def make_regex(kw: str) -> str:
        kw = normalize_keyword(kw)
        if " " in kw or "-" in kw:
            # NOTE(review): multi-word patterns carry no \b anchors, so they can
            # also match inside longer words (e.g. a plural) — confirm this is intended.
            return re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+")
        return r"\b" + re.escape(kw) + r"\b"

    def compile_phrases(phrases: List[str]):
        """Compile each non-blank phrase once, paired with its original spelling."""
        compiled = []
        for phrase in phrases:
            if not normalize_keyword(phrase):
                continue  # an empty pattern would match everything
            compiled.append((phrase, re.compile(make_regex(phrase), flags=re.IGNORECASE)))
        return compiled

    def process_category(category: str, keywords: List[str], excluded_phrases: List[str]):
        # Patterns are built once per category instead of once per row.
        keyword_patterns = compile_phrases(keywords)
        exclusion_patterns = [pattern for _, pattern in compile_phrases(excluded_phrases or [])]

        results = []
        for review_id, text in zip(review_ids, review_texts):
            if not isinstance(text, str):
                continue

            # Temporarily remove the exclusion phrases
            temp_text = text
            for pattern in exclusion_patterns:
                temp_text = pattern.sub(" ", temp_text)

            # Check if at least one keyword remains
            matched_keywords = [kw for kw, pattern in keyword_patterns if pattern.search(temp_text)]

            if matched_keywords:
                # The original (unmodified) text is what gets reported
                results.append((review_id, text, ", ".join(matched_keywords), category))

        return results

    # Parallel processing: one task per category.
    # NOTE(review): these are threads; the regex scanning is plain Python work,
    # so the speed-up from n_process is probably limited — confirm by measuring.
    all_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_process) as executor:
        futures = {executor.submit(process_category, cat, kws, exclusions.get(cat, [])): cat
                   for cat, kws in categories.items()}
        for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Keyword extraction"):
            all_results.extend(fut.result())

    # Same schema whether or not anything matched; the id column keeps the
    # dtype of the input instead of assuming integers.
    output_schema = {
        id_col: df.schema[id_col],
        "review": pl.Utf8,
        "keywords_found": pl.Utf8,
        "category": pl.Utf8
    }

    if not all_results:
        return pl.DataFrame(schema=output_schema)

    df_filtered = pl.DataFrame(
        {
            id_col: [r[0] for r in all_results],
            "review": [r[1] for r in all_results],
            "keywords_found": [r[2] for r in all_results],
            "category": [r[3] for r in all_results]
        },
        schema=output_schema
    )

    logger.info(f"Extracted {df_filtered.shape[0]} matching reviews across {len(categories)} categories.")
    return df_filtered

## Main

In [19]:
def process_pipeline(
    input_csv: str,
    column_name: str,
    output_csv: str,
    nb_process: int,
    categories_path: str = "../data/categories.json",
    exclusions_path: str = "../data/exclusions.json",
) -> None:
    """
    Run the keyword extraction pipeline on one CSV file.

    Steps: load the CSV, load the category keywords and exclusion phrases,
    remove stop words from the text column and from the keywords/exclusions,
    extract the matching reviews and write them to ``output_csv``.

    Args:
        input_csv (str): Path of the CSV file to process.
        column_name (str): Name of the column containing the text.
        output_csv (str): Path of the CSV file to write.
        nb_process (int): Worker count passed to the keyword extraction.
        categories_path (str): JSON file holding {category: [keywords]}.
        exclusions_path (str): JSON file holding {category: [phrases to exclude]}.
    """

    df = pl.read_csv(input_csv)
    logger.info(f"DataFrame {os.path.splitext(os.path.basename(input_csv))[0]} loaded : {df.shape[0]} rows x {df.shape[1]} columns")

    # Load categories
    with open(categories_path, "r", encoding="utf-8") as f:
        categories = json.load(f)

    # Load exclusions
    with open(exclusions_path, "r", encoding="utf-8") as f:
        exclusions = json.load(f)

    logger.info("Keywords and excluded words loaded")

    df_clean = remove_stopwords(df, column_name)
    logger.info("Stop words have been removed from reviews")

    # Lemmatization step, currently disabled:
    #lemmatized_categories = lemmatize_categories(categories)
    #lemmatized_exclusions = lemmatize_categories(exclusions)
    #logger.info("Keywords and excluded words have been lemmatized")

    #df_lem = lemmatize_column_fast(df_clean, column_name, n_process=NUM_THREAD)
    #logger.info("DataFrame has been lemmatized")

    # Keywords and exclusions get the same cleaning as the reviews so that
    # both sides of the match are in the same form.
    exclusions_cleaned = remove_stopwords_categories(exclusions)
    categories_cleaned = remove_stopwords_categories(categories)
    logger.info("Stop words have been removed from keywords")

    df_keywords = extract_all_categories(
        df_clean,
        col_name=column_name,
        categories=categories_cleaned,
        exclusions=exclusions_cleaned,
        n_process=nb_process
    )
    logger.info("Keywords extraction finished")

    # Save
    df_keywords.write_csv(output_csv)
    logger.info(f"DataFrame saved to {output_csv}")


In [20]:
# if __name__ == "__main__":

#     input_path="../data/processed/data_clean/data_yelp_reviews_cleaned.csv"
#     name_column = "text"
#     output_path = "../data/processed/data_categorized/key_words_data_yelp_reviews.csv"
#     logger.info("Beginning of the pipeline:")
#     process_pipeline(input_path,name_column,output_path,NUM_THREAD)
#     logger.info("End of the pipeline")

In [None]:
# Run the pipeline on the cleaned Yelp reviews (text column "text").
input_path="../data/processed/data_clean/data_yelp_reviews_cleaned.csv"
name_column = "text"
output_path = "../data/processed/data_categorized/key_words_data_yelp_reviews.csv"
logger.info("Beginning of the pipeline:")
process_pipeline(input_path,name_column,output_path,NUM_THREAD)
logger.info("End of the pipeline")

import os
import time

# Waiting time in seconds (e.g. 1 hour = 3600 seconds)
cooldown_time = 60
time.sleep(cooldown_time)

# Shut down the PC once the run is over.
# WARNING: running this cell powers the machine off.
# NOTE(review): `shutdown /s /t 0` looks like the Windows form of the command — confirm before running elsewhere.
os.system("shutdown /s /t 0")

2025-11-20 23:50:18,286 - INFO - Beginning of the pipeline:
2025-11-20 23:50:23,445 - INFO - DataFrame data_yelp_reviews_cleaned loaded : 6974127 rows x 10 columns
2025-11-20 23:50:23,445 - INFO - Keywords and excluded words loaded
2025-11-21 02:12:27,048 - INFO - Stop words have been removed from reviews
2025-11-21 02:12:27,050 - INFO - Stop words have been removed from keywords
Keyword extraction: 100%|██████████| 3/3 [2:57:36<00:00, 3552.23s/it]  
2025-11-21 05:10:11,493 - INFO - Extracted 949472 matching reviews across 3 categories.
2025-11-21 05:10:12,119 - INFO - Keywords extraction finished
2025-11-21 05:10:12,544 - INFO - DataFrame saved to ../data/processed/data_categorized/key_words_data_yelp_reviews.csv
2025-11-21 05:10:12,551 - INFO - End of the pipeline


In [None]:
# from pathlib import Path

# # Dossiers
# input_folder = Path("../data/processed/data_clean")
# output_folder = Path("../data/processed/data_categorized")
# output_folder.mkdir(parents=True, exist_ok=True)  # créer le dossier si besoin

# # Lancer le pipeline sur tous les fichiers CSV
# for file_path in input_folder.glob("*.csv"):
#     stem = file_path.stem  # ex: data_european_restaurant_reviews_cleaned

#     # Ignorer certains fichiers
#     if "accessiblego" in stem.lower():
#         logger.info(f"Skipping file {file_path.name} (contains 'accessiblego')")
#         continue

#     # Choix de la colonne à traiter
#     column_name = "review"
#     if "yelp" in stem.lower():
#         column_name = "text"

#     output_file = output_folder / f"key_words_{stem}.csv"

#     logger.info(f"Processing file {file_path.name}...")
#     process_pipeline(str(file_path), column_name, str(output_file), NUM_THREAD)
#     logger.info(f"Finished processing {file_path.name}")

# import os
# import time

# # Temps d'attente en secondes (ex: 1 heure = 3600 secondes)
# cooldown_time = 450
# time.sleep(cooldown_time)

# # Éteindre le PC
# os.system("shutdown /s /t 0")
