# Humans with special needs reviews extraction 

The objective is to extract relevant reviews to understand the profiles of travelers with special needs such as wheelchair, handicap, baby or children.

### Word regex extraction

The keywords list is not exhaustive but tend to extract the main part of the targeted reviews.

In [1]:
import polars as pl
import spacy
import re

In [2]:
# import dataset
df_booking = pl.read_csv('../data/processed/data_totale_booking.csv')
df_yelp = pl.read_ndjson('../data/original/yelp_dataset/yelp_academic_dataset_review.json')

In [3]:
# key words
categories = {
    "handicap": [
        "handicap", "wheelchair", "accessible", "braille", "ramp", "lift", "elevator",
        "disabled", "barrier-free", "accessible toilet", "toilet accessible",
        "mobility aid", "adapted", "hearing aid", "visual impairment", "accessible entrance"
    ],
    "children": [
        "child", "baby", "kid", "stroller", "son", "daughter", "toddler",
        "infant", "playground", "high chair", "changing table", "family-friendly",
        "childcare", "kids menu", "baby seat", "family","baby bed", "cot", "crib"
    ]
}

In [5]:
###################### Lemmatization ###################### 

# Load the english model of SpaCy
# spacy.prefer_gpu()            # To uncomment to run on GPU
nlp = spacy.load("en_core_web_sm")

# Lemmatization via SpaCy
def lemmatize_text(text):
    doc = nlp(text.lower())  # minuscules + NLP
    return " ".join(token.lemma_ for token in doc)

In [6]:
# Lemmatize categories so that words can match each other.
lemmatized_categories = {
    category: [lemmatize_text(kw) for kw in keywords]
    for category, keywords in categories.items()
}

print(lemmatized_categories)

{'handicap': ['handicap', 'wheelchair', 'accessible', 'braille', 'ramp', 'lift', 'elevator', 'disable', 'barrier - free', 'accessible toilet', 'toilet accessible', 'mobility aid', 'adapt', 'hear aid', 'visual impairment', 'accessible entrance'], 'children': ['child', 'baby', 'kid', 'stroller', 'son', 'daughter', 'toddler', 'infant', 'playground', 'high chair', 'change table', 'family - friendly', 'childcare', 'kids menu', 'baby seat', 'family', 'baby bed', 'cot', 'crib']}


### Booking dataset

In [7]:
df_test = df_booking.head(10000)

df_lemmatized = df_test.with_columns([
    pl.col("review_positive").map_elements(lemmatize_text).alias("review_positive"),
    pl.col("review_negative").map_elements(lemmatize_text).alias("review_negative")
])

In [8]:
# Concatenate both review columns into a single text column for counting/filtering
df_lemmatized = df_lemmatized.with_columns(
    (pl.col("review_positive") + " " + pl.col("review_negative")).alias("text")
)

# Dictionnaires pour les comptages
keyword_summary = {}
category_summary = {}

# Dictionnaire pour stocker les DataFrames filtrés par catégorie
filtered_dfs = {}

for category, keywords in lemmatized_categories.items():
    category_total = 0
    
    # Construire la regex de recherche
    regex = r"\b(" + "|".join(
        re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+")
        for kw in keywords
    ) + r")\b"

    # Filtrer les lignes contenant au moins un mot-clé
    matches = df_lemmatized.filter(pl.col("text").str.contains(regex))

    # Fonction pour identifier les mots-clés trouvés
    def find_keywords(text):
        found = [
            kw for kw in keywords
            if re.search(r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b", text)
        ]
        return ", ".join(found)

    # Ajouter la colonne des mots-clés trouvés
    matches = matches.with_columns(
        pl.col("text").map_elements(find_keywords).alias("keywords_found")
    )

    # Sauvegarder ce DataFrame dans un dictionnaire
    filtered_dfs[category] = matches

    # Comptage des mots-clés
    for kw in keywords:
        count = (
            df_lemmatized
            .select(pl.col("text").str.count_matches(
                r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b"
            ).sum().alias("total"))
            .item()
        )
        keyword_summary[kw] = int(count)
        category_total += count

    category_summary[category] = int(category_total)

# === Impression du résumé ===
print("\n=== Category Count ===\n")
for cat, count in category_summary.items():
    print(f"{cat}: {count}")

print("\n=== Keyword Details ===\n")
for kw, count in sorted(keyword_summary.items(), key=lambda x: (-x[1], x[0])):
    print(f"{kw}: {count}")




=== Category Count ===

handicap: 349
children: 602

=== Keyword Details ===

family: 288
lift: 146
elevator: 133
kid: 127
child: 72
accessible: 51
son: 37
baby: 24
daughter: 22
playground: 10
wheelchair: 9
cot: 7
toddler: 7
ramp: 6
stroller: 4
crib: 2
disable: 2
adapt: 1
handicap: 1
high chair: 1
infant: 1
accessible entrance: 0
accessible toilet: 0
baby bed: 0
baby seat: 0
barrier - free: 0
braille: 0
change table: 0
childcare: 0
family - friendly: 0
hear aid: 0
kids menu: 0
mobility aid: 0
toilet accessible: 0
visual impairment: 0


In [9]:
# To save the new datasets
for cat, df_cat in filtered_dfs.items():
    filename = f"../data/processed/data_{cat}_booking.csv"
    df_cat.select([c for c in df_cat.columns if c != "text"]).write_csv(filename)
    print(f"\nSave : {filename} ({df_cat.shape[0]} lignes)")


Save : ../data/processed/data_handicap_booking.csv (301 lignes)

Save : ../data/processed/data_children_booking.csv (455 lignes)


### Yelp dataset

In [10]:
df_test = df_yelp.head(10000)

df_lemmatized = df_test.with_columns(
    pl.col("text").map_elements(lemmatize_text).alias("text")
)

In [11]:
# Dictionnaires pour les comptages
keyword_summary = {}
category_summary = {}

# Dictionnaire pour stocker les DataFrames filtrés par catégorie
filtered_dfs = {}

for category, keywords in lemmatized_categories.items():
    category_total = 0
    
    # Construire la regex de recherche
    regex = r"\b(" + "|".join(
        re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+")
        for kw in keywords
    ) + r")\b"

    # Filtrer les lignes contenant au moins un mot-clé
    matches = df_lemmatized.filter(pl.col("text").str.contains(regex))

    # Fonction pour identifier les mots-clés trouvés
    def find_keywords(text):
        found = [
            kw for kw in keywords
            if re.search(r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b", text)
        ]
        return ", ".join(found)

    # Ajouter la colonne des mots-clés trouvés
    matches = matches.with_columns(
        pl.col("text").map_elements(find_keywords).alias("keywords_found")
    )

    # Sauvegarder ce DataFrame dans un dictionnaire
    filtered_dfs[category] = matches

    # Comptage des mots-clés
    for kw in keywords:
        count = (
            df_lemmatized
            .select(pl.col("text").str.count_matches(
                r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b"
            ).sum().alias("total"))
            .item()
        )
        keyword_summary[kw] = int(count)
        category_total += count

    category_summary[category] = int(category_total)

# === Impression du résumé ===
print("\n=== Category Count ===\n")
for cat, count in category_summary.items():
    print(f"{cat}: {count}")

print("\n=== Keyword Details ===\n")
for kw, count in sorted(keyword_summary.items(), key=lambda x: (-x[1], x[0])):
    print(f"{kw}: {count}")


=== Category Count ===

handicap: 109
children: 1517

=== Keyword Details ===

family: 522
kid: 428
son: 149
child: 142
daughter: 135
baby: 95
elevator: 51
lift: 20
accessible: 13
stroller: 13
toddler: 12
ramp: 9
playground: 8
wheelchair: 8
handicap: 5
infant: 5
high chair: 3
adapt: 2
cot: 2
crib: 2
change table: 1
disable: 1
accessible entrance: 0
accessible toilet: 0
baby bed: 0
baby seat: 0
barrier - free: 0
braille: 0
childcare: 0
family - friendly: 0
hear aid: 0
kids menu: 0
mobility aid: 0
toilet accessible: 0
visual impairment: 0


In [12]:
# To save the new datasets
for cat, df_cat in filtered_dfs.items():
    filename = f"../data/processed/data_{cat}_yelp.csv"
    df_cat.select([c for c in df_cat.columns if c != "text"]).write_csv(filename)
    print(f"\nSave : {filename} ({df_cat.shape[0]} lignes)")


Save : ../data/processed/data_handicap_yelp.csv (84 lignes)

Save : ../data/processed/data_children_yelp.csv (1004 lignes)
