# Dynamic filters

## Import Packages

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
import numpy as np


## Import Data

In [5]:
df = pd.read_csv("./data/TravelDataset.csv")

In [6]:
df.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,avg_temp_monthly,ideal_durations,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,"{""1"":{""avg"":3.7,""max"":7.8,""min"":0.4},""2"":{""avg...","[""Short trip"",""One week""]",Luxury,5,2,2,1,4,5,3,5,2
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,"{""1"":{""avg"":28,""max"":30.8,""min"":25.8},""2"":{""av...","[""Long trip"",""One week""]",Luxury,2,4,5,5,2,3,4,1,5
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,north_america,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,"{""1"":{""avg"":-2.5,""max"":0.4,""min"":-5.5},""2"":{""a...","[""Short trip"",""Weekend"",""One week""]",Luxury,3,5,5,2,3,3,4,2,4
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,north_america,Winding cobblestone streets and colorful facad...,20.9877,-101.0,"{""1"":{""avg"":15.5,""max"":22.8,""min"":8.7},""2"":{""a...","[""Weekend"",""One week"",""Short trip""]",Mid-range,5,3,3,1,3,4,3,4,2
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,"{""1"":{""avg"":28.1,""max"":32.5,""min"":25.5},""2"":{""...","[""Short trip"",""Weekend""]",Budget,4,3,3,2,3,4,3,4,2


In [7]:
# Convert 'ideal_durations' to individual one-hot encoded columns
df["ideal_durations"] = df["ideal_durations"].apply(eval)  # if it's stored as a string
duration_ohe = df["ideal_durations"].explode().str.get_dummies().groupby(level=0).max()
df = df.join(duration_ohe).drop(columns=["ideal_durations"])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 560 non-null    object 
 1   city               560 non-null    object 
 2   country            560 non-null    object 
 3   region             560 non-null    object 
 4   short_description  560 non-null    object 
 5   latitude           560 non-null    float64
 6   longitude          560 non-null    float64
 7   avg_temp_monthly   560 non-null    object 
 8   budget_level       560 non-null    object 
 9   culture            560 non-null    int64  
 10  adventure          560 non-null    int64  
 11  nature             560 non-null    int64  
 12  beaches            560 non-null    int64  
 13  nightlife          560 non-null    int64  
 14  cuisine            560 non-null    int64  
 15  wellness           560 non-null    int64  
 16  urban              560 non

In [9]:
df.describe()

Unnamed: 0,latitude,longitude,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,Day trip,Long trip,One week,Short trip,Weekend
count,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0
mean,22.502186,7.914665,3.85,3.178571,3.728571,2.380357,3.019643,3.792857,3.073214,3.146429,3.028571,0.035714,0.094643,0.719643,0.955357,0.751786
std,27.980022,78.813803,0.81291,0.79819,0.90392,1.435547,0.921599,0.679329,0.592134,1.018604,0.989699,0.185743,0.292983,0.449575,0.206703,0.432363
min,-54.807306,-175.201808,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,5.268054,-64.439118,3.0,3.0,3.0,1.0,2.0,3.0,3.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0
50%,31.793618,10.711854,4.0,3.0,4.0,2.0,3.0,4.0,3.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0
75%,43.673199,50.020162,4.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,0.0,0.0,1.0,1.0,1.0
max,78.719852,179.332896,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0


# Generate Dynamic Filters

## Get Top Informative Features

In [51]:
# Simulate selecting a subset of destinations
df["label"] = 0
df.loc[df.sample(frac=0.8).index, "label"] = 1  # simulate "visible" destinations


In [52]:
df.describe()

Unnamed: 0,latitude,longitude,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,Day trip,Long trip,One week,Short trip,Weekend,label
count,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0
mean,22.502186,7.914665,3.85,3.178571,3.728571,2.380357,3.019643,3.792857,3.073214,3.146429,3.028571,0.035714,0.094643,0.719643,0.955357,0.751786,0.8
std,27.980022,78.813803,0.81291,0.79819,0.90392,1.435547,0.921599,0.679329,0.592134,1.018604,0.989699,0.185743,0.292983,0.449575,0.206703,0.432363,0.400358
min,-54.807306,-175.201808,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.268054,-64.439118,3.0,3.0,3.0,1.0,2.0,3.0,3.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0
50%,31.793618,10.711854,4.0,3.0,4.0,2.0,3.0,4.0,3.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0
75%,43.673199,50.020162,4.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0
max,78.719852,179.332896,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
selected_destinations = df[df["label"]==1]
selected_destinations

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,avg_temp_monthly,budget_level,culture,...,cuisine,wellness,urban,seclusion,Day trip,Long trip,One week,Short trip,Weekend,label
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,"{""1"":{""avg"":3.7,""max"":7.8,""min"":0.4},""2"":{""avg...",Luxury,5,...,5,3,5,2,0,0,1,1,0,1
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,"{""1"":{""avg"":28,""max"":30.8,""min"":25.8},""2"":{""av...",Luxury,2,...,3,4,1,5,0,1,1,0,0,1
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,north_america,Winding cobblestone streets and colorful facad...,20.987700,-101.000000,"{""1"":{""avg"":15.5,""max"":22.8,""min"":8.7},""2"":{""a...",Mid-range,5,...,4,3,4,2,0,0,1,1,1,1
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,"{""1"":{""avg"":28.1,""max"":32.5,""min"":25.5},""2"":{""...",Budget,4,...,4,3,4,2,0,0,0,1,1,1
6,20f80ed8-ce7f-43ad-a34d-a0cb156f5d4e,Windhoek,Namibia,africa,"A blend of modernity and tradition, with bustl...",-22.533560,17.045478,"{""1"":{""avg"":23.9,""max"":32.3,""min"":17.8},""2"":{""...",Mid-range,3,...,3,3,3,4,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,f38b9348-e581-4de5-bafa-1efb5c28e458,Nelson,New Zealand,oceania,Sun-drenched coastal charm with vibrant market...,-41.271085,173.283676,"{""1"":{""avg"":18.6,""max"":22,""min"":14.8},""2"":{""av...",Mid-range,3,...,3,3,3,4,0,0,1,1,0,1
555,778d28df-a4fa-4328-896e-4a9f80216fda,Maun,Botswana,africa,"A gateway to the Okavango Delta, offering a se...",-19.986095,23.422435,"{""1"":{""avg"":26.6,""max"":32,""min"":21.2},""2"":{""av...",Mid-range,3,...,3,3,2,4,0,0,1,1,0,1
557,8c8c7203-2a45-44ba-9fb2-b5158104375e,Manchester,United Kingdom,europe,"Industrial heritage meets modern creativity, w...",53.479489,-2.245115,"{""1"":{""avg"":4.7,""max"":7.1,""min"":2},""2"":{""avg"":...",Mid-range,4,...,4,3,4,2,0,0,1,1,1,1
558,ba72b976-10f9-4415-a818-32cf17d8e649,Copenhagen,Denmark,europe,"Charming canals, vibrant neighborhoods, and a ...",55.686724,12.570072,"{""1"":{""avg"":2.6,""max"":4.2,""min"":0.6},""2"":{""avg...",Mid-range,5,...,4,3,5,2,0,0,1,1,1,1


In [54]:
def get_dynamic_filters(dataframe, top_n=5):
    df = dataframe.copy()
        
    # Convert categorical columns
    le_budget = LabelEncoder()
    le_region = LabelEncoder()
    le_country = LabelEncoder()
    le_city = LabelEncoder()
    df["budget_level_enc"] = le_budget.fit_transform(df["budget_level"])
    df["region_enc"] = le_region.fit_transform(df["region"])
    df["country_enc"] = le_country.fit_transform(df["country"])
    df["city_enc"] = le_city.fit_transform(df["city"])
    
    # use these features to analyse information gain  
    feature_cols = [
        "Day trip", "Long trip", "Short trip", "One week", "Weekend",
        "culture", "adventure", "nature", "beaches", "nightlife", "cuisine", "wellness", "urban", "seclusion", 
        "region_enc", "country_enc", "city_enc", "budget_level_enc"
    ]

    X = df[feature_cols]
    y = df["label"]

    # Compute mutual information (entropy-based score)
    info_gains = mutual_info_classif(X, y, discrete_features='auto')


    gain_df = pd.DataFrame({
        "feature": feature_cols,
        "info_gain": info_gains
    }).sort_values(by="info_gain", ascending=False)

    
    # Filter to rows where label == 1
    selected = df[df["label"] == 1]
    
    # Add column with unique values from rows where label == 1
    gain_df["unique_values_label_1"] = gain_df["feature"].apply(lambda col: selected[col].unique())

    # Sort and return
    gain_df = gain_df.sort_values(by="info_gain", ascending=False)
    return gain_df.head(top_n)

In [55]:
top_filters = get_dynamic_filters(selected_destinations,5)
print(top_filters)

      feature     info_gain unique_values_label_1
3    One week  2.232143e-03                [1, 0]
2  Short trip  1.116071e-03                [1, 0]
1   Long trip  3.330669e-16                [0, 1]
0    Day trip  3.330669e-16                [0, 1]
4     Weekend  3.330669e-16                [0, 1]


## Generate Filters