# Dynamic filters

## Import libraries

In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
import numpy as np


In [31]:
df = pd.read_csv("./data/TravelDataset.csv")

In [32]:
df.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,avg_temp_monthly,ideal_durations,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,"{""1"":{""avg"":3.7,""max"":7.8,""min"":0.4},""2"":{""avg...","[""Short trip"",""One week""]",Luxury,5,2,2,1,4,5,3,5,2
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,"{""1"":{""avg"":28,""max"":30.8,""min"":25.8},""2"":{""av...","[""Long trip"",""One week""]",Luxury,2,4,5,5,2,3,4,1,5
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,north_america,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,"{""1"":{""avg"":-2.5,""max"":0.4,""min"":-5.5},""2"":{""a...","[""Short trip"",""Weekend"",""One week""]",Luxury,3,5,5,2,3,3,4,2,4
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,north_america,Winding cobblestone streets and colorful facad...,20.9877,-101.0,"{""1"":{""avg"":15.5,""max"":22.8,""min"":8.7},""2"":{""a...","[""Weekend"",""One week"",""Short trip""]",Mid-range,5,3,3,1,3,4,3,4,2
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,"{""1"":{""avg"":28.1,""max"":32.5,""min"":25.5},""2"":{""...","[""Short trip"",""Weekend""]",Budget,4,3,3,2,3,4,3,4,2


In [None]:
# Convert 'ideal_durations' to individual one-hot encoded columns
df["ideal_durations"] = df["ideal_durations"].apply(eval)  # if it's stored as a string
duration_ohe = df["ideal_durations"].explode().str.get_dummies().groupby(level=0).max()
df = df.join(duration_ohe).drop(columns=["ideal_durations"])

In [34]:
df.describe()

Unnamed: 0,latitude,longitude,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,Day trip,Long trip,One week,Short trip,Weekend
count,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0,560.0
mean,22.502186,7.914665,3.85,3.178571,3.728571,2.380357,3.019643,3.792857,3.073214,3.146429,3.028571,0.035714,0.094643,0.719643,0.955357,0.751786
std,27.980022,78.813803,0.81291,0.79819,0.90392,1.435547,0.921599,0.679329,0.592134,1.018604,0.989699,0.185743,0.292983,0.449575,0.206703,0.432363
min,-54.807306,-175.201808,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,5.268054,-64.439118,3.0,3.0,3.0,1.0,2.0,3.0,3.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0
50%,31.793618,10.711854,4.0,3.0,4.0,2.0,3.0,4.0,3.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0
75%,43.673199,50.020162,4.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,0.0,0.0,1.0,1.0,1.0
max,78.719852,179.332896,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0


In [53]:
def get_dynamic_filters(dataframe, top_n=5):
    df = dataframe.copy()
        
    # Convert categorical columns
    le_budget = LabelEncoder()
    le_region = LabelEncoder()
    le_country = LabelEncoder()
    le_city = LabelEncoder()
    df["budget_level_enc"] = le_budget.fit_transform(df["budget_level"])
    df["region_enc"] = le_region.fit_transform(df["region"])
    df["country_enc"] = le_country.fit_transform(df["country"])
    df["city_enc"] = le_city.fit_transform(df["city"])
    
    # use these features to analyse information gain  
    feature_cols = [
        "Day trip", "Long trip", "Short trip", "One week", "Weekend",
        "culture", "adventure", "nature", "beaches", "nightlife", "cuisine", "wellness", "urban", "seclusion", 
        "region_enc", "country_enc", "city_enc", "budget_level_enc"
    ]

    # Simulate binary class: compare a subset of interest vs. rest
    df["label"] = 0
    df.loc[df.sample(frac=0.2).index, "label"] = 1  # simulate "visible" destinations

    X = df[feature_cols]
    y = df["label"]

    # Compute mutual information (entropy-based score)
    info_gains = mutual_info_classif(X, y, discrete_features='auto')
    gain_df = pd.DataFrame({
        "feature": feature_cols,
        "info_gain": info_gains
    }).sort_values(by="info_gain", ascending=False)

    return gain_df.head(top_n)


In [None]:
top_filters = get_dynamic_filters(df,5)
print(top_filters)

      feature  info_gain
10    cuisine   0.040319
1   Long trip   0.024832
8     beaches   0.009223
12      urban   0.007847
11   wellness   0.005795
