# Pets reviews extraction 

The objective is to extract relevant reviews to understand the profiles of travelers with animals.

### Word regex extraction

The keyword list is not exhaustive, but it tends to capture most of the targeted reviews.

In [1]:
import polars as pl
import spacy
import re

In [2]:
# Load the processed Booking reviews dataset into a Polars DataFrame
booking_csv_path = '../data/processed/data_totale_booking.csv'
df_booking = pl.read_csv(booking_csv_path)

In [19]:
# Keywords to look for, grouped by category.
# Single-word keywords come first, followed by hyphenated / multi-word phrases.
pet_words = [
    "dog",
    "cat",
    "pet",
    "animal",
    "rabbit",
    "hamster",
    "ferret",
    "bird",
]
pet_phrases = [
    "pet-friendly",
    "animals allowed",
    "dog-friendly",
    "cat-friendly",
    "pet welcome",
]
categories = {"pets": pet_words + pet_phrases}

# Load the small English spaCy pipeline.
# To run on GPU, uncomment the following line:
# spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

# Work on the first rows only to keep the per-row spaCy processing fast.
sample_size = 1000
df_test = df_booking.head(sample_size)

# Lemmatization via spaCy
def lemmatize_text(text):
    """Return *text* lowercased and lemmatized.

    The text is lowercased, processed by the module-level spaCy pipeline
    ``nlp``, and the lemma of every token is joined back with single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text.lower())]
    return " ".join(lemmas)

# Replace both review columns with their lemmatized version (column names are kept).
# return_dtype is given explicitly so Polars does not have to infer the output dtype
# from the first non-null value returned by lemmatize_text.
df_lemmatized = df_test.with_columns([
    pl.col(column)
    .map_elements(lemmatize_text, return_dtype=pl.String)
    .alias(column)
    for column in ("review_positive", "review_negative")
])

In [None]:
# Concatenate both review columns into a single text column for counting/filtering.
# Nulls are replaced by empty strings first: in Polars, null + str evaluates to null,
# so a review with only a positive (or only a negative) part would otherwise get a
# null "text" and silently drop out of both the filtering and the keyword counts.
df_lemmatized = df_lemmatized.with_columns(
    (
        pl.col("review_positive").fill_null("")
        + " "
        + pl.col("review_negative").fill_null("")
    ).alias("text")
)

# Initialize dictionaries for counts
keyword_summary = {}   # keyword  -> total number of occurrences in the "text" column
category_summary = {}  # category -> sum of the counts of its keywords

# List to store filtered rows (one DataFrame per category)
filtered_rows = []


def _keyword_pattern(keyword):
    """Return a word-bounded regex matching *keyword* in the lemmatized text.

    The "text" column holds lemmatized tokens re-joined with single spaces
    (see ``lemmatize_text``), so the keyword goes through the same
    normalization before being turned into a pattern. Without this, a keyword
    such as "animals allowed" can never match its lemmatized form, and a
    hyphenated keyword such as "dog-friendly" can never match when the
    tokenizer splits it around the hyphen (typically "dog - friendly").

    The words of the keyword may be separated by any run of whitespace and/or
    hyphens, so "dog-friendly", "dog - friendly" and "dog friendly" all match.
    """
    normalized = lemmatize_text(keyword)
    words = [re.escape(word) for word in re.split(r"[\s\-]+", normalized) if word]
    return r"\b" + r"[\s\-]+".join(words) + r"\b"


for category, keywords in categories.items():
    # Build each keyword pattern once; the same pattern is used for filtering,
    # for tagging rows and for counting, so the three can never disagree.
    patterns = {kw: _keyword_pattern(kw) for kw in keywords}
    compiled = {kw: re.compile(pattern) for kw, pattern in patterns.items()}

    # Regex for the whole category: alternation of the per-keyword patterns
    regex = "|".join(f"(?:{patterns[kw]})" for kw in keywords)

    # Filter rows containing at least one keyword
    matches = df_lemmatized.filter(pl.col("text").str.contains(regex))

    # Function to identify which keywords are found in each comment.
    # It reads ``keywords``/``compiled`` of the current iteration and is applied
    # immediately below, before the loop moves on to the next category.
    def find_keywords(text):
        """Return the keywords present in *text* as a comma-separated string."""
        return ", ".join(kw for kw in keywords if compiled[kw].search(text))

    # Add a new column with the keywords found
    matches = matches.with_columns(
        pl.col("text")
        .map_elements(find_keywords, return_dtype=pl.String)
        .alias("keywords_found")
    )

    # Append to the list of filtered rows
    filtered_rows.append(matches)

    # Count keyword occurrences over the whole (unfiltered) DataFrame.
    # NOTE: a longer keyword that contains a shorter one (e.g. "pet-friendly" and
    # "pet") increments both counters, so the category total counts it twice.
    category_total = 0
    for kw in keywords:
        total = (
            df_lemmatized
            .select(pl.col("text").str.count_matches(patterns[kw]).sum().alias("total"))
            .item()
        )
        # The sum may come back as null (None) when there is nothing to add up
        count = int(total or 0)
        keyword_summary[kw] = count
        category_total += count
    category_summary[category] = category_total

# Concatenate all filtered rows into a new DataFrame
df_filtered = pl.concat(filtered_rows)

# Report the totals per category, then per keyword, then a sample of the matches
print("\n=== Category Count ===\n")
for category_name, category_count in category_summary.items():
    print(f"{category_name}: {category_count}")

# Keywords ordered by decreasing count; ties are broken alphabetically
ranked_keywords = sorted(
    keyword_summary.items(),
    key=lambda item: (-item[1], item[0]),
)
print("\n=== Keyword Details ===\n")
for keyword, keyword_count in ranked_keywords:
    print(f"{keyword}: {keyword_count}")

print("\n=== Preview of Filtered DataFrame ===")
preview = df_filtered.head(5)
print(preview)


=== Category Count ===

pets: 23

=== Keyword Details ===

dog: 16
cat: 3
animal: 2
pet: 2
animals allowed: 0
bird: 0
cat-friendly: 0
dog-friendly: 0
ferret: 0
hamster: 0
pet welcome: 0
pet-friendly: 0
rabbit: 0

=== Preview of Filtered DataFrame ===
shape: (5, 19)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ review_ti ┆ review_po ┆ review_ne ┆ review_sc ┆ … ┆ location_ ┆ location_ ┆ text      ┆ keywords │
│ tle       ┆ sitive    ┆ gative    ┆ ore       ┆   ┆ is_beach  ┆ is_city_c ┆ ---       ┆ _found   │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ enter     ┆ str       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ f64       ┆   ┆ i64       ┆ ---       ┆           ┆ str      │
│           ┆           ┆           ┆           ┆   ┆           ┆ i64       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Great     ┆ room be   ┆ 