In [3]:
!pip install geopandas





In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from shapash.explainer.smart_explainer import SmartExplainer
from sklearn.preprocessing import OneHotEncoder

# --- Load the data ---------------------------------------------------------
# The path is relative to the notebook's working directory.
file_path = 'reduced_air_quality.csv'
data = pd.read_csv(file_path)

# --- Features and target ---------------------------------------------------
# 'AQHI' is the regression target; the other listed columns are left out of
# the model inputs.
features = data.drop(columns=['AQHI', 'GlobalID', 'OBJECTID', 'IN_DASHBOARD', 'QUERY_FIELD'])
label = data['AQHI']

# --- One-hot encode the categorical (object-dtype) columns -----------------
categorical_cols = features.select_dtypes(include=['object']).columns

# drop='first' omits one level per categorical column from the encoded output.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical_data = pd.DataFrame(
    encoder.fit_transform(features[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse the source index so the encoded rows line up with `features` and
    # `label` by construction instead of relying on positional resets.
    index=features.index,
)

# Swap the raw categorical columns for their encoded counterparts.
features = pd.concat(
    [features.drop(columns=categorical_cols), encoded_categorical_data],
    axis=1,
)

# --- Train / test split ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42
)

# --- Fit the model ---------------------------------------------------------
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Build the Shapash explainer -------------------------------------------
explainer = SmartExplainer(model=model)

# Predictions on the held-out rows, kept as an (n, 1) column array.
y_pred = model.predict(X_test)
if y_pred.ndim == 1:
    y_pred = y_pred.reshape(-1, 1)

# Shapash expects predictions as a one-column pandas object that shares the
# index of `x`; previously the predictions were computed but never handed over.
y_pred_df = pd.DataFrame(y_pred, index=X_test.index, columns=['y_pred'])

explainer.compile(x=X_test, y_pred=y_pred_df, y_target=y_test)

# Launch the interactive Shapash web app (served locally from a background thread).
explainer.run_app()


INFO:root:Your Shapash application run on http://MacBook-Pro-de-Javier.local:8050/
INFO:root:Use the method .kill() to down your app.


INFO: Shap explainer type - <shap.explainers._tree.TreeExplainer object at 0x16b18afc0>


<CustomThread(Thread-7 (<lambda>), started 6277722112)>

In [5]:
# Count how many rows carry each distinct LOCATION value (most frequent first).
location_column = data.loc[:, 'LOCATION']
location_column.value_counts()

LOCATION
Suddaby             80
Smithson            76
J. F. Carmichael    72
St. Bernadette      72
Victoria Park       68
Name: count, dtype: int64