# Pets reviews extraction 

The objective is to extract relevant reviews to understand the profiles of travelers with pets.

## Word regex extraction

The keyword list is not exhaustive, but it tends to capture most of the targeted reviews.

In [1]:
import polars as pl
import spacy
import re

In [2]:
# Import the datasets.
# Booking reviews: CSV file read into a Polars DataFrame.
df_booking = pl.read_csv('../data/processed/data_totale_booking.csv')
# Yelp reviews: newline-delimited JSON file read into a Polars DataFrame.
df_yelp = pl.read_ndjson('../data/original/yelp_dataset/yelp_academic_dataset_review.json')

In [15]:
# Keywords, grouped by category.
# Each category name maps to the list of raw (not yet lemmatized) terms to
# search for in the reviews; single words, multi-word expressions and
# hyphenated forms are all listed.
categories = {
    "pets": [
        "dog", "cat", "pet", "animal", "rabbit", "hamster", "ferret", "bird", "pet-friendly",
        "animals allowed", "dog-friendly", "cat-friendly", "pet welcome", "pup", "dog bowl"
    ]
}

In [16]:
# Lemmatization

# Load the English spaCy model.
# spacy.prefer_gpu()            # To uncomment to run on GPU
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return the lemmas of `text`, joined by single spaces.

    The text is lowercased before being passed through the spaCy pipeline
    `nlp`; every token of the resulting document contributes its lemma.
    """
    lemmas = [token.lemma_ for token in nlp(text.lower())]
    return " ".join(lemmas)

In [17]:
# Run every keyword through lemmatize_text so that keywords and lemmatized
# review text use the same word forms and can match each other.
lemmatized_categories = {
    name: list(map(lemmatize_text, terms))
    for name, terms in categories.items()
}

print(lemmatized_categories)

{'pets': ['dog', 'cat', 'pet', 'animal', 'rabbit', 'hamster', 'ferret', 'bird', 'pet - friendly', 'animal allow', 'dog - friendly', 'cat - friendly', 'pet welcome', 'pup', 'dog bowl']}


### Booking dataset

In [7]:
# Work on the first 100,000 Booking reviews only.
df_test = df_booking.head(100000)

# Lemmatize both review columns, keeping their names. The output dtype is
# declared explicitly so that Polars does not have to infer it from the first
# value returned by lemmatize_text.
df_lemmatized = df_test.with_columns([
    pl.col("review_positive")
    .map_elements(lemmatize_text, return_dtype=pl.Utf8)
    .alias("review_positive"),
    pl.col("review_negative")
    .map_elements(lemmatize_text, return_dtype=pl.Utf8)
    .alias("review_negative"),
])

In [20]:
import spacy
import polars as pl
from tqdm import tqdm

# --- 1. Load spaCy ---
# The "ner" and "parser" pipeline components are disabled when loading the model.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# --- 2. Lemmatize a list of texts ---
def lemmatize_texts(texts, batch_size=1000, n_process=4):
    """Lemmatize a collection of texts in one ``nlp.pipe`` call.

    Args:
        texts: Iterable of review texts. Entries that are not strings
            (e.g. ``None``) are replaced by an empty string, because the
            pipeline is only fed strings.
        batch_size: Forwarded to ``nlp.pipe``.
        n_process: Forwarded to ``nlp.pipe``.

    Returns:
        A list with one string per input entry, in input order: the lemmas of
        the lowercased text joined by single spaces.
    """
    # Lowercase before lemmatizing, exactly like lemmatize_text does: the
    # keywords are lemmatized from lowercase text and matched case-sensitively,
    # so capitalised lemmas would otherwise be missed.
    clean_texts = [t.lower() if isinstance(t, str) else "" for t in texts]
    docs = nlp.pipe(clean_texts, batch_size=batch_size, n_process=n_process)
    return [" ".join(token.lemma_ for token in doc) for doc in docs]

# --- 3. Lemmatize a Polars column ---
def lemmatize_column_fast(df, col_name, chunk_size=5000):
    """Lemmatize one column of `df` chunk by chunk, with a progress bar.

    Args:
        df: DataFrame holding the column.
        col_name: Name of the column to lemmatize.
        chunk_size: Number of texts handed to ``lemmatize_texts`` per call;
            also the granularity of the tqdm progress bar.

    Returns:
        A ``pl.Series`` named `col_name` containing one lemmatized string per
        row, in row order.
    """
    texts = df.select(col_name).to_series().to_list()

    lemmatized = []
    for start in tqdm(range(0, len(texts), chunk_size), desc=f"Lemmatizing {col_name}"):
        lemmatized.extend(lemmatize_texts(texts[start:start + chunk_size]))

    # Declare the dtype explicitly: with an empty input there is no value
    # from which a string dtype could be inferred.
    return pl.Series(name=col_name, values=lemmatized, dtype=pl.Utf8)

# --- 4. Example on the Booking review columns ---
# Work on the first 50,000 Booking reviews only.
df_test = df_booking.head(50000)

# Replace both review columns by their lemmatized version (positive first,
# then negative).
df_lemmatized = df_test.with_columns(
    [
        lemmatize_column_fast(df_test, column)
        for column in ("review_positive", "review_negative")
    ]
)


Lemmatizing review_positive:   0%|          | 0/10 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [18]:
# Concatenate both review columns into a single text column for counting/filtering.
# String concatenation propagates nulls in Polars: without fill_null, a review
# with only a positive (or only a negative) part would get a null "text" and be
# silently excluded from both the filter and the counts.
df_lemmatized = df_lemmatized.with_columns(
    (
        pl.col("review_positive").fill_null("")
        + " "
        + pl.col("review_negative").fill_null("")
    ).alias("text")
)

# Occurrence totals, per keyword and per category
keyword_summary = {}
category_summary = {}

# One filtered DataFrame per category
filtered_rows = []

for category, keywords in lemmatized_categories.items():
    # Whole-word pattern for each keyword (a space in the keyword matches any
    # run of whitespace). Built once per category instead of once per row.
    keyword_patterns = [
        r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b" for kw in keywords
    ]
    compiled_keywords = list(zip(keywords, map(re.compile, keyword_patterns)))

    # Category-level regex: any keyword, where a hyphen may also be whitespace
    regex = r"\b(" + "|".join(re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+") for kw in keywords) + r")\b"

    # Keep the rows containing at least one keyword
    matches = df_lemmatized.filter(pl.col("text").str.contains(regex))

    # List the keywords present in one comment. The compiled patterns are bound
    # as a default argument so the function keeps this category's keywords even
    # after the loop variable has moved on.
    def find_keywords(text, compiled=compiled_keywords):
        return ", ".join(kw for kw, pattern in compiled if pattern.search(text))

    # Add a column with the keywords found; the dtype is explicit so the column
    # is a string column even when no row matched.
    matches = matches.with_columns(
        pl.col("text").map_elements(find_keywords, return_dtype=pl.Utf8).alias("keywords_found")
    )

    filtered_rows.append(matches)

    # Count keyword occurrences over the whole (unfiltered) dataset
    category_total = 0
    for kw, pattern in zip(keywords, keyword_patterns):
        count = (
            df_lemmatized
            .select(pl.col("text").str.count_matches(pattern).sum().alias("total"))
            .item()
        )
        keyword_summary[kw] = int(count)
        category_total += count
    category_summary[category] = int(category_total)

# Concatenate all filtered rows into a new DataFrame
df_filtered = pl.concat(filtered_rows)

# Print summary statistics
print("\n=== Category Count ===\n")
for cat, count in category_summary.items():
    print(f"{cat}: {count}")

print("\n=== Keyword Details ===\n")
# Most frequent keywords first; ties broken alphabetically
for kw, count in sorted(keyword_summary.items(), key=lambda x: (-x[1], x[0])):
    print(f"{kw}: {count}")

print("\n=== Preview of Filtered DataFrame ===")
print("Size of dataset", df_filtered.shape[0])


=== Category Count ===

pets: 1953

=== Keyword Details ===

dog: 1048
pet: 303
bird: 233
animal: 161
cat: 159
rabbit: 16
pet - friendly: 10
pup: 9
dog - friendly: 8
dog bowl: 5
pet welcome: 1
animal allow: 0
cat - friendly: 0
ferret: 0
hamster: 0

=== Preview of Filtered DataFrame ===
Size of dataset 1465


In [12]:
# Save the filtered Booking reviews (df_filtered) as a CSV file.
df_filtered.write_csv("../data/processed/data_pet_booking.csv")

### Yelp dataset

In [27]:
# Work on the first 10,000 Yelp reviews only.
df_test = df_yelp.head(10000)

# Lemmatize the review text, keeping the column name. The output dtype is
# declared explicitly so that Polars does not have to infer it from the first
# value returned by lemmatize_text.
df_lemmatized = df_test.with_columns(
    pl.col("text").map_elements(lemmatize_text, return_dtype=pl.Utf8).alias("text")
)

In [28]:
# Occurrence totals, per keyword and per category
keyword_summary = {}
category_summary = {}

# One filtered DataFrame per category
filtered_rows = []

for category, keywords in lemmatized_categories.items():
    # Whole-word pattern for each keyword (a space in the keyword matches any
    # run of whitespace). Built once per category instead of once per row.
    keyword_patterns = [
        r"\b" + re.escape(kw).replace("\\ ", "\\s+") + r"\b" for kw in keywords
    ]
    compiled_keywords = list(zip(keywords, map(re.compile, keyword_patterns)))

    # Category-level regex: any keyword, where a hyphen may also be whitespace
    regex = r"\b(" + "|".join(re.escape(kw).replace("\\-", "[-\\s]").replace("\\ ", "\\s+") for kw in keywords) + r")\b"

    # Keep the rows containing at least one keyword
    matches = df_lemmatized.filter(pl.col("text").str.contains(regex))

    # List the keywords present in one comment. The compiled patterns are bound
    # as a default argument so the function keeps this category's keywords even
    # after the loop variable has moved on.
    def find_keywords(text, compiled=compiled_keywords):
        return ", ".join(kw for kw, pattern in compiled if pattern.search(text))

    # Add a column with the keywords found; the dtype is explicit so the column
    # is a string column even when no row matched.
    matches = matches.with_columns(
        pl.col("text").map_elements(find_keywords, return_dtype=pl.Utf8).alias("keywords_found")
    )

    filtered_rows.append(matches)

    # Count keyword occurrences over the whole (unfiltered) dataset
    category_total = 0
    for kw, pattern in zip(keywords, keyword_patterns):
        count = (
            df_lemmatized
            .select(pl.col("text").str.count_matches(pattern).sum().alias("total"))
            .item()
        )
        keyword_summary[kw] = int(count)
        category_total += count
    category_summary[category] = int(category_total)

# Concatenate all filtered rows into a new DataFrame
df_filtered = pl.concat(filtered_rows)

# Print summary statistics
print("\n=== Category Count ===\n")
for cat, count in category_summary.items():
    print(f"{cat}: {count}")

print("\n=== Keyword Details ===\n")
# Most frequent keywords first; ties broken alphabetically
for kw, count in sorted(keyword_summary.items(), key=lambda x: (-x[1], x[0])):
    print(f"{kw}: {count}")

print("\n=== Preview of Filtered DataFrame ===")
print("Size of dataset", df_filtered.shape[0])


=== Category Count ===

pets: 511

=== Keyword Details ===

dog: 287
pet: 56
bird: 47
cat: 46
animal: 34
rabbit: 27
pup: 11
dog - friendly: 3
animal allow: 0
cat - friendly: 0
dog bowl: 0
ferret: 0
hamster: 0
pet - friendly: 0
pet welcome: 0

=== Preview of Filtered DataFrame ===
Size of dataset 277


In [29]:
# Save the filtered Yelp reviews (df_filtered) as a CSV file.
df_filtered.write_csv("../data/processed/data_pet_yelp.csv")