In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Load data ---------------------------------------------------------------
# Rows lacking either image path are excluded before feature selection.
df = pd.read_csv("datasets/vibe_full_features_with_scraped_new_updated.csv")
df = df.dropna(subset=['sentinel_jpg_path', 'osm_image_path'])

# --- Candidate features ------------------------------------------------------
numeric_features = ['dist_to_park', 'road_density', 'road_length', 'populartimes_peak_avg']
categorical_features = ['traffic_level', 'time_of_day']
target_col = 'vibe_class'

# --- Preprocessing -----------------------------------------------------------
# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first level dropped (avoids the dummy-variable trap).
# sparse_threshold=0 forces a dense ndarray: with the default threshold the
# ColumnTransformer may return a SciPy sparse matrix when the one-hot block
# dominates, and this cell passes the result straight to BorutaPy as an array.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ],
    sparse_threshold=0,
)

X = preprocessor.fit_transform(df[numeric_features + categorical_features])
y = df[target_col].values  # original note: already label-encoded (not verified here)

# --- Feature names, in the same order as the columns of X --------------------
# The transformer emits the 'num' block first and the 'cat' block second,
# matching the order in which the two steps are declared above.
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_features)
all_feature_names = numeric_features + list(cat_feature_names)

# zip() below would silently truncate if names and columns ever drifted apart,
# so check the widths before spending time on the Boruta fit.
assert len(all_feature_names) == X.shape[1], (
    f"{len(all_feature_names)} feature names for {X.shape[1]} encoded columns"
)

# --- Run Boruta --------------------------------------------------------------
# Shallow, class-balanced random forest as the base estimator; both the forest
# and Boruta are seeded so the selection is repeatable.
forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
boruta = BorutaPy(estimator=forest, n_estimators='auto', random_state=42)
boruta.fit(X, y)

# --- Selected features -------------------------------------------------------
# boruta.support_ is a boolean mask over the columns of X.
selected = [name for name, keep in zip(all_feature_names, boruta.support_) if keep]
print("✅ Selected features by Boruta:", selected)


✅ Selected features by Boruta: ['dist_to_park', 'road_density', 'road_length', 'traffic_level_low', 'traffic_level_medium']


In [15]:
import pandas as pd

# === CONFIG ===
INPUT_CSV = "datasets/vibe_full_features_with_scraped_new_updated.csv"
# `selected` is produced by the Boruta feature-selection cell; that cell must
# have been run in this kernel before this one.
SELECTED_FEATURES = selected
CATEGORICAL_FEATURES = ['traffic_level', 'time_of_day']
# NOTE(review): 'populartimes_peak_avg' is a Boruta candidate in the selection
# cell but is not listed here. If it is ever selected, the check below stops
# the cell with an explicit error - confirm whether it should be added.
NUMERIC_FEATURES = ['dist_to_park', 'road_density', 'road_length']
ESSENTIAL_COLUMNS = ['vibe_class', 'sentinel_jpg_path', 'osm_image_path']
OUTPUT_CSV = "datasets/vibe_features_selected.csv"

# === Load Data ===
# NOTE(review): unlike the selection cell, rows with missing image paths are
# NOT dropped here, so they end up in the exported CSV - confirm this is intended.
df = pd.read_csv(INPUT_CSV)

# === Reapply Encoding ===
# drop_first=False keeps every dummy level; the selected one-hot names are then
# looked up by column label, so the extra levels are simply not exported.
encoded_df = pd.get_dummies(df[CATEGORICAL_FEATURES], drop_first=False)
numeric_df = df[NUMERIC_FEATURES]
combined_df = pd.concat([numeric_df, encoded_df], axis=1)

# === Sanity check for missing features ===
# Fail here with a message that names the problem, instead of printing a
# warning and then hitting an opaque KeyError on the column lookup below.
missing = [col for col in SELECTED_FEATURES if col not in combined_df.columns]
if missing:
    raise KeyError(
        f"Missing selected features: {missing}. "
        "Check NUMERIC_FEATURES / CATEGORICAL_FEATURES against the selection cell."
    )

# === Filter selected features only ===
final_features = combined_df[SELECTED_FEATURES]

# === Add back essential columns ===
# Both frames derive from `df`, so the column-wise concat aligns on its index.
final_df = pd.concat([final_features, df[ESSENTIAL_COLUMNS]], axis=1)

# === Save Final CSV ===
final_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved: {OUTPUT_CSV}")


✅ Saved: datasets/vibe_features_selected.csv


In [None]:
# Preview the label and image-path columns (first rows only, not the whole frame).
# NOTE(review): the KeyError recorded under this cell names 'sentinel_png_path',
# which is not in ESSENTIAL_COLUMNS as defined in the export cell - the output
# looks stale; re-run after Restart & Run All and clear it.
df[ESSENTIAL_COLUMNS].head()

KeyError: "['sentinel_png_path'] not in index"