# Dynamic filters

## Import Packages

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

## Import Data

In [3]:
# Load the raw travel dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/TravelDataset.csv")

In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

In [5]:
# Convert 'ideal_durations' to individual one-hot encoded columns
import ast

# The comment in the original cell says the lists may be stored as strings.
# Parse them with ast.literal_eval, which only accepts Python literals,
# instead of eval, which would execute any expression found in the data file.
# Values that are already lists are passed through untouched.
df["ideal_durations"] = df["ideal_durations"].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# explode() yields one row per duration; get_dummies() one-hot encodes them;
# groupby(level=0).max() folds the exploded rows back to one row per destination.
duration_ohe = df["ideal_durations"].explode().str.get_dummies().groupby(level=0).max()
df = df.join(duration_ohe).drop(columns=["ideal_durations"])

In [6]:
# Print column dtypes and non-null counts after the one-hot encoding step.
df.info()

In [7]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Persist the one-hot encoded frame in the same ../data directory the raw CSV
# was read from (the original path "..data/..." was missing the slash).
df.to_csv("../data/structured_data.csv")

# Generate Dynamic Filters

## Get Top Informative Features

In [95]:
# Simulate selecting a subset of destinations
# A fixed seed keeps the simulated selection, and every ranking derived from
# it, identical across notebook re-runs.
df["label"] = 0
df.loc[df.sample(frac=0.8, random_state=42).index, "label"] = 1  # simulate "visible" destinations


In [96]:
# Summary statistics again, now including the simulated `label` column.
df.describe()

In [97]:
# Keep only the rows flagged as visible (label == 1); the bare name on the
# last line makes the notebook render the resulting subset.
selected_destinations = df.loc[df["label"].eq(1)]
selected_destinations

In [None]:
def get_dynamic_filters(dataframe, top_n=5):
    """Rank candidate filter features by mutual information with ``label``.

    Args:
        dataframe: Destinations frame containing ``budget_level``, ``region``,
            a ``label`` column and every column named in ``feature_cols``
            below. It should contain rows for *both* label values; with a
            single class every score is 0 and the ranking is meaningless.
        top_n: Number of highest-scoring features to return.

    Returns:
        DataFrame with columns ``feature``, ``info_gain`` and
        ``unique_values`` (the values each feature takes among rows with
        ``label == 1``, decoded back to the original strings for the
        label-encoded columns), sorted by ``info_gain`` descending and
        truncated to ``top_n`` rows.

    Raises:
        KeyError: If a required column is missing from ``dataframe``.
    """
    import warnings  # local import keeps this notebook cell self-contained

    df = dataframe.copy()

    # Integer-encode the two string-valued columns so they can be scored.
    le_budget = LabelEncoder()
    le_region = LabelEncoder()
    df["budget_level_enc"] = le_budget.fit_transform(df["budget_level"])
    df["region_enc"] = le_region.fit_transform(df["region"])

    # Features to evaluate
    feature_cols = [
        "Day trip", "Long trip", "Short trip", "One week", "Weekend",
        "culture", "adventure", "nature", "beaches", "nightlife",
        "cuisine", "wellness", "urban", "seclusion",
        "region_enc", "budget_level_enc",
    ]

    # Encoded column -> encoder, used to decode values back to readable text.
    label_encoders = {
        "budget_level_enc": le_budget,
        "region_enc": le_region,
    }

    X = df[feature_cols]
    y = df["label"]
    selected = df[y == 1]

    if y.nunique() < 2:
        warnings.warn(
            "'label' contains a single class, so every information gain is 0 "
            "and the ranking is arbitrary; pass the full dataframe with both "
            "label values.",
            stacklevel=2,
        )

    # With dense input, 'auto' treats every column as continuous (noisy k-NN
    # estimate that varies between runs). The columns here are one-hot flags
    # and integer codes, so score them as discrete.
    # NOTE(review): assumes the interest columns ("culture" ... "seclusion")
    # hold integer levels -- confirm against the dataset.
    info_gains = mutual_info_classif(X, y, discrete_features=True)

    # Generate readable unique values (native Python scalars, sorted so the
    # output does not depend on row order).
    unique_values = []
    for col in feature_cols:
        if col in label_encoders:
            codes = selected[col].unique()
            decoded = label_encoders[col].inverse_transform(codes)
            unique_values.append(sorted(decoded.tolist()))
        else:
            # Keep raw values for binary or numeric features
            unique_values.append(sorted(selected[col].drop_duplicates().tolist()))

    # Build final result
    gain_df = pd.DataFrame({
        "feature": feature_cols,
        "info_gain": info_gains,
        "unique_values": unique_values,
    })

    gain_df = gain_df.sort_values(by="info_gain", ascending=False)
    return gain_df.head(top_n)


In [191]:
# Rank on the full labelled frame: get_dynamic_filters scores features against
# `label`, and `selected_destinations` only holds label == 1 rows, which would
# make every information gain 0.
top_features_df = get_dynamic_filters(df, 5)
print(top_features_df)

## Generate Filters

In [192]:
import os
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import openai 
from langchain.prompts import ChatPromptTemplate
from openai import OpenAI
from pydantic import BaseModel
from typing import List, Literal, Optional


In [193]:

# Load .env before building the client: OpenAI() reads OPENAI_API_KEY from the
# environment when it is constructed, so a key that lives only in .env must
# already be loaded at this point.
load_dotenv()
client = OpenAI()


In [194]:
# Load variables from a .env file into the process environment.
load_dotenv()
# Set the module-level key on the openai package; raises KeyError if
# OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

In [195]:
def format_features(top_features_df):
    """Describe each ranked feature as one line of text for the LLM prompt.

    Args:
        top_features_df: DataFrame with a ``feature`` and a ``unique_values``
            column; one output line is produced per row, in row order.

    Returns:
        List of strings, each naming a feature and listing its values.
    """
    return [
        f"Feature name:'{record['feature']}'. Feature values: {record['unique_values']}"
        for _, record in top_features_df.iterrows()
    ]

In [None]:
# Schema for a single LLM-generated filter. Documented with `#` comments
# rather than a docstring so the JSON schema pydantic derives from this class
# stays exactly as it was.
class FilterSuggestion(BaseModel):
    question: str  # natural-language question to show the user
    feature: str  # name of the dataset feature the filter applies to
    type: Literal["binary", "categorical"]  # only these two kinds are accepted
    values: List[str]  # the feature's selectable values, as strings
    # Mapping of value -> human-readable description. Typed Optional, so None
    # passes validation.
    value_meanings: Optional[dict[str, str]]

# Top-level structured-output schema: a single `filters` field wrapping the
# list of individual suggestions.
class FilterSuggestions(BaseModel):
    filters: List[FilterSuggestion]

In [219]:
# User-message template for the filter-generation request. `{feature_info}` is
# the only placeholder; any other literal brace would have to be doubled
# because the template is rendered with str.format(). The field list at the
# bottom mirrors the FilterSuggestion / FilterSuggestions schema so the prompt
# and the enforced output format do not contradict each other.
PROMPT_TEMPLATE = """
You are a travel assistant that creates filter questions for a travel destination recommendation system.

Each feature in the dataset has a list of unique values:
- Some features are binary (e.g., [0, 1]) and represent yes/no preferences.
- Others are categorical with multiple values (e.g., [1, 2, 3, 4, 5]), representing intensity or levels of interest.

Your task:
1. Identify the type of each feature (`binary` or `categorical`).
2. For each, create a meaningful **filter question** in natural language.
3. For binary: map 0 → No, 1 → Yes
4. For categorical: assume values range from 1 (low) to 5 (high) and explain meanings if possible.
5. Define the value_meanings in the following format: "1: 'Minimal seclusion', 2: 'Some seclusion', 3: 'Moderate seclusion', 4: 'High seclusion', 5: 'Very high seclusion'"
6. Return 5 filters.

Here are the features and their relevant values:
{feature_info}

You must output a single JSON object with one field, `filters`, holding a list of filter objects, one per feature.

Do not include any extra text, explanations, or commentary.

Each filter object must include all the following fields and **no field can be null or missing**:

- question (str): The filter question to show to the user.
- feature (str): The name of the feature, exactly as given above.
- type (str): Either `binary` or `categorical`.
- values (list[str]): The unique values of the feature, each written as a string.
- value_meanings (dict[str, str]): A mapping of each unique feature value to a human-readable description. You must provide a description for each value - they must never be 'None', empty, or partially filled.

Respond strictly with JSON matching this format.
"""


In [221]:
# Generate prompt and call the OpenAI Responses API with structured output
def generate_filters_via_openai(top_features_df, model="gpt-4o-mini", temperature=0.8):
    """Ask the model for filter suggestions based on the ranked features.

    Args:
        top_features_df: DataFrame with ``feature`` and ``unique_values``
            columns, as consumed by ``format_features``.
        model: Name of the OpenAI model to query.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``filters`` list of the parsed ``FilterSuggestions`` response.

    Raises:
        ValueError: If the response carries no parsed output (for example
            when the model refuses), instead of failing later with an
            opaque ``AttributeError`` on ``None``.
    """
    # One feature per line reads better in the prompt than a Python list repr.
    feature_info = "\n".join(format_features(top_features_df))

    messages = [
        {
            "role": "system",
            "content": "You are a travel assistant helping users select destinations. Based on the most informative features and their values from a filtered dataset, generate 5 dynamic filters in JSON format to help users refine their choices. Return exactly 5 filter suggestions in JSON format under a `filters` field.",
        },
        {
            "role": "user",
            "content": PROMPT_TEMPLATE.format(feature_info=feature_info),
        },
    ]

    # text_format makes the SDK validate the reply against FilterSuggestions.
    response = client.responses.parse(
        model=model,
        temperature=temperature,
        input=messages,
        text_format=FilterSuggestions,
    )

    parsed = response.output_parsed
    if parsed is None:
        raise ValueError("The model returned no parsed FilterSuggestions output.")

    return parsed.filters

# Usage: request filter suggestions for the features ranked above.
filters = generate_filters_via_openai(top_features_df)

In [222]:
# Display the parsed filter suggestions in the notebook output.
filters