# Clustering of Cities - interactive demo
v1.0

Authors: Maria Ricci, Manuel Löhnertz (space4environment), Mohamed-Bachir Belaid (NILU)

---
### Scope
Run clustering analysis of ~700 European cities based on land, climate and socioeconomic features. Tune the different parameters to get the optimal result.
### Data
The collection of features is available in the `data/city_features_collection` folder in `geojson` format.
### Requirements
Required packages for analysis: `scikit-learn` (imported as `sklearn`), `numpy`, `pandas`.

Required packages for visualization: `ipywidgets`, `ipyleaflet`, `plotly`, `geopandas`.

Additionally install the repository library `src` with `pip install -e .` from the root folder.


In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

# for notebook interaction
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import ipywidgets as widgets
# from ipywidgets import interact, fixed, interactive
# from IPython.display import display

## Load data

Get data from database. 

Requirements: 
- `src` package installed in your conda environment. To install it, run `pip install -e .` from the root folder
- `database.ini` configuration file to connect to the database. Alternatively, set up your own data source

In [2]:
import pkg_resources
import os
import sqlalchemy

# Load the city feature table into `df`, but only when the repository package
# `src` is installed (it provides the database connection helper).
required = {'src'}
# Keys of every distribution visible in the current environment.
# NOTE(review): pkg_resources is reported as deprecated by recent setuptools
# releases — confirm and consider importlib.metadata.
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed
if missing:
    # Nothing is loaded in this case, so `df` stays undefined for later cells.
    print('src package is not installed')
else:
    from src import db_connect
    # os.environ.get returns None when HOME is unset, which would produce the
    # literal path "None/uc1-urban-climate/database.ini".
    home_dir = os.environ.get('HOME') # set here your base directory/root folder
    engine_postgresql = db_connect.create_engine(db_config = f"{home_dir}/uc1-urban-climate/database.ini")

    # begin() opens a transaction-scoped connection that is released when the
    # `with` block exits.
    with engine_postgresql.begin() as conn:
        # Column order matters: later cells slice the frame by position
        # (identifier/coordinate columns first, ez_code fifth).
        query = sqlalchemy.text("""SELECT 
                    urau_code, urau_name, _wgs84x, _wgs84y, ez_code, city_area_ha, dem_mean,
                    imd_percent_2018, treecover_percent_2018,
                    class_11100, class_11210, class_11220, class_11230,
                    class_11240, class_11300, class_12100, class_12210,
                    class_12220, class_12230, class_12300, class_12400,
                    class_13100, class_13300, class_13400, class_14100,
                    class_14200, class_21000, class_22000, class_23000,
                    class_24000, class_25000, class_31000, class_32000,
                    class_33000, class_40000, class_50000, urban_blue_percent,
                    urban_green_percent, avg_2m_temp_kelvin_2018,
                    number_of_summer_days_2018, number_of_tropical_nights_2018,
                    utci_heat_nights_2018, coastal_city, de1001v_2018, de1028v_2018,
                    de1055v_2018, ec1174v_2018, ec1010v_2018, ec1020i_2018,
                    ec3040v_2018, sa2013v_2018, de1028i_2018, de1055i_2018
                    FROM public.city_2018_demo_view;""")
        df = pd.read_sql_query(query, conn)

You can use the FAIRiCUBE Data Dashboards to explore the dataset:
- [City fact sheet](https://grafana.space4environment.com:3000/d/b662e3cd-7399-44ab-a127-fd7c263c35c3/city-fact-sheet?orgId=1&var-country=LU&var-city=Luxembourg&var-latitude=49.6135785281&var-longitude=6.1264388014&from=1514764800000&to=1546214400000)
- [Indicators dashboard](https://grafana.space4environment.com:3000/d/f44d39f0-eba5-41ea-a3c1-262ee1be9521/indicators-dashboard?orgId=1)

In [3]:
# Preview a handful of columns as an interactive table.
# Maps the header shown in the table to the DataFrame column that feeds it.
preview_columns = {
    "City name": "urau_name",
    "area_ha": "city_area_ha",
    "dem_mean": "dem_mean",
    "ez_code": "ez_code",
    "imd_percent_2018": "imd_percent_2018",
    "treecover_percent_2018": "treecover_percent_2018",
    "avg_2m_temp_kelvin_2018": "avg_2m_temp_kelvin_2018",
    "class_50000": "class_50000",
}
preview_table = go.Table(
    header={"values": list(preview_columns)},
    cells={"values": [df[column] for column in preview_columns.values()]},
)
fig = go.Figure(data=[preview_table])

n_cities, n_features = df.shape
print(f'There are {n_cities} cities and {n_features} features')
fig.show()

There are 729 cities and 53 features


## Data preparation
First, check how many missing values each feature has.

In [4]:
# Per-feature data-quality summary: NoData count, mean and variance.
df_gaps = df.isna().sum()

# The first five columns (city code, city name, two coordinates, ez_code) are
# not listed in the table.
feature_columns = df.columns[5:]

# Align the statistics to the listed columns by *label*. The previous version
# sliced the numeric-only results with a second positional offset ([2:]), which
# only lines up while exactly two of the skipped columns are numeric and every
# listed column is numeric. Non-numeric columns now simply show NaN.
feature_means = df.mean(axis=0, numeric_only=True).reindex(feature_columns)
feature_variances = df.var(axis=0, numeric_only=True).reindex(feature_columns)

fig = go.Figure(data=[go.Table(
    header=dict(values=["Attribute", "Count of NoData", "Mean", "Variance"]),
    cells=dict(values=[feature_columns, df_gaps.iloc[5:], feature_means, feature_variances]))
])

fig.show()

### Treat missing values
There are two methods to treat missing values:
- `remove` the cities with missing values
- `impute` the missing values with a computed value, e.g. the average

If a feature has too many missing values, imputation is unreliable and removing the affected rows shrinks the dataset significantly. In this case, we remove the feature instead.


### Treat categorical features
There are three methods to deal with categorical features:
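- `label_encoding` assigns a numerical value to each category. Pro: the feature stays a single column. Contra: the numbers impose an arbitrary order and distance between categories, which a distance-based method such as k-means treats as meaningful.
- `binary_encoding` turns each category into a new binary feature (one-hot encoding). Pro: no artificial ordering between categories. Contra: it adds one column per category, which increases the dimensionality of the dataset.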
- `remove` the categorical feature altogether. May be used later for result interpretation

### Correlation analysis
Simple correlation analysis of the numerical features. When two features are highly correlated, there is no need to keep both for the cluster analysis. By default, we remove one feature of every pair whose absolute correlation exceeds 0.9 (i.e. correlation > 0.9 or < -0.9); the threshold can be adjusted with the slider below.

In [8]:
# correlation analysis
# https://plotly.com/python/figurewidget-app/
def remove_features(df, min_cities = 100):
    """Drop the columns that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    min_cities : int
        Despite its name, the largest number of missing values a column may
        have and still be kept: columns with strictly more are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns. The names of the dropped
        columns are printed.
    """
    missing_per_column = df.isna().sum()
    columns_to_drop = [
        column
        for column, n_missing in missing_per_column.items()
        if n_missing > min_cities
    ]
    print("Dropping the following features", columns_to_drop)
    return df.drop(columns=columns_to_drop)

def treat_missing_values(df, method='remove'):
    """Handle rows that still contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    method : str
        ``'remove'`` drops every row with at least one missing value and
        prints how many rows were removed. Any other value fills missing
        values with the mean of the respective numeric column.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of ``df``.
    """
    if method != 'remove':
        # Imputation: column means are computed on numeric columns only.
        return df.fillna(df.mean(numeric_only=True))

    complete_rows = df.dropna(axis='index')
    print(f"Removed {len(df)-len(complete_rows)} cities")
    return complete_rows

def treat_cat_features(df, method='binary_encoding', feature='ez_code'):
    """Encode or remove one categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is never modified, whichever method is chosen.
    method : str
        ``'binary_encoding'`` replaces the column with one indicator column
        per category (``pd.get_dummies``); ``'label_encoding'`` replaces the
        values with integer codes from ``LabelEncoder``; any other value
        drops the column.
    feature : str
        Name of the categorical column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the column encoded or removed.
    """
    if method == 'binary_encoding':
        return pd.get_dummies(df, columns=[feature])

    if method == 'label_encoding':
        le = LabelEncoder()
        le.fit(df[feature])
        # Encode into a copy: assigning into `df` itself would overwrite the
        # caller's categorical column with integer codes.
        new_df = df.copy()
        new_df[feature] = le.transform(df[feature])
        return new_df

    return df.drop(columns=[feature])

def get_correlated_pairs(df, threshold=0.9):
    """Find highly correlated numeric features and drop one of each pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified. Only numeric columns take part in
        the correlation analysis.
    threshold : float
        A pair counts as correlated when the absolute value of its
        correlation coefficient is strictly greater than this value.

    Returns
    -------
    correlation_matrix : pandas.DataFrame
        Full correlation matrix of the numeric columns.
    mask : numpy.ndarray
        Boolean array, True on the diagonal and above it (the entries that
        are ignored when looking for pairs).
    correlated_pairs : list of tuple
        ``(row_feature, column_feature)`` labels of the correlated pairs
        found below the diagonal.
    new_df : pandas.DataFrame
        Copy of ``df`` without the ``row_feature`` of every pair.
    """
    correlation_matrix = df.corr(numeric_only=True)

    # Hide the diagonal and the upper triangle so that every pair is looked
    # at once (in the lower triangle) and self-correlations are ignored.
    mask = np.zeros(correlation_matrix.shape, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    cm_abs = correlation_matrix.mask(mask).abs()

    # Drop the NaN entries explicitly instead of relying on stack() doing it:
    # the newer pandas stack implementation keeps NaN rows, which would turn
    # every feature pair into a "correlated" one.
    correlated_attrs = cm_abs.where(cm_abs > threshold).stack().dropna()
    correlated_pairs = correlated_attrs.index.to_list()

    # De-duplicate while preserving order so the printed list is the same on
    # every run (a set of strings has no stable order across sessions).
    val1 = list(dict.fromkeys(row_feature for row_feature, _ in correlated_pairs))
    print("Removed the following correlated feautures", val1)
    print("--------------")
    new_df = df.drop(columns=val1)
    return correlation_matrix, mask, correlated_pairs, new_df

def data_preparation(df, min_cities, missing_val_method, cat_features_method, cat_feature_name, threshold):
    """Run the four cleaning steps in sequence.

    The steps are: drop sparse features (``remove_features``), treat the
    remaining missing values (``treat_missing_values``), encode or remove the
    categorical feature (``treat_cat_features``) and drop highly correlated
    features (``get_correlated_pairs``).

    Returns the result of ``get_correlated_pairs``: the correlation matrix,
    the triangle mask, the correlated pairs and the cleaned DataFrame.
    """
    without_sparse_features = remove_features(df, min_cities)
    without_gaps = treat_missing_values(without_sparse_features, missing_val_method)
    encoded = treat_cat_features(without_gaps, cat_features_method, cat_feature_name)
    return get_correlated_pairs(encoded, threshold)

# --- controllers for the data-preparation step ---
style = {'description_width': 'initial'}
remove_features_controller = widgets.IntSlider(
    description="Min cities",
    value=100,
    min=50,
    max=400,
    step=50,
    continuous_update=False,
    style=style,
)
missing_val_controller = widgets.Dropdown(
    description='Treat missing values in rows',
    options=['remove', 'impute'],
    value='remove',
    disabled=False,
    style=style,
)
categorical_controller = widgets.Dropdown(
    description='Treat categorical features',
    options=['label_encoding', 'binary_encoding', 'remove'],
    value='binary_encoding',
    disabled=False,
    style=style,
)
correlation_features_controller = widgets.FloatSlider(
    description="Threshold",
    value=0.9,
    min=0.8,
    max=0.99,
    step=0.01,
)
controller_container = widgets.VBox([
    remove_features_controller,
    missing_val_controller,
    categorical_controller,
    correlation_features_controller,
])

# --- initial figure ---
# First run of the pipeline, using the same settings the controllers start with.
correlation_matrix, mask, correlated_pairs, new_df = data_preparation(
    df, 100, 'remove', 'binary_encoding', 'ez_code', 0.9
)
pio.templates.default = "plotly_white"
layout = go.Layout(
    autosize=False,
    width=1000,
    height=1000,
    margin=go.layout.Margin(l=50, r=50, b=100, t=100, pad=4),
)
# Correlation heatmap; colour range fixed to [-1, 1] so that 0 sits in the
# middle of the diverging colour scale.
correlation_heatmap = go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)
fig = go.FigureWidget(data=correlation_heatmap, layout=layout)

# bind controllers and figure
# Keep a dedicated reference to the correlation heatmap. The generic name `fig`
# is reassigned by later cells, and the callback below resolves global names
# when it runs, not when it is defined; without this alias the sliders would
# start updating whichever figure `fig` points to at that moment.
correlation_fig = fig
# History of cleaned DataFrames; the last entry reflects the current settings.
new_df_list = [new_df]


def response(change):
    """Re-run the data preparation with the current controller values.

    Refreshes the correlation heatmap and appends the newly cleaned
    DataFrame to ``new_df_list``.

    Parameters
    ----------
    change
        Notification object supplied by ``observe``; not used.
    """
    print("updating")
    correlation_matrix, _, _, new_df = data_preparation(
        df,
        remove_features_controller.value,
        missing_val_controller.value,
        categorical_controller.value,
        'ez_code',
        correlation_features_controller.value,
    )
    # Apply the three property changes as a single update of the widget.
    with correlation_fig.batch_update():
        correlation_fig.data[0].z = correlation_matrix
        correlation_fig.data[0].x = correlation_matrix.columns
        correlation_fig.data[0].y = correlation_matrix.columns
    new_df_list.append(new_df)


remove_features_controller.observe(response, names="value")
missing_val_controller.observe(response, names="value")
categorical_controller.observe(response, names="value")
correlation_features_controller.observe(response, names="value")
figure1 = widgets.VBox([controller_container, correlation_fig])

Dropping the following features ['ec1174v_2018', 'ec1020i_2018', 'ec3040v_2018', 'sa2013v_2018']
Removed 69 cities
Removed the following correlated feautures ['de1028v_2018', 'ec1010v_2018', 'de1055v_2018', 'class_12220', 'class_12100', 'de1001v_2018']
--------------


updating
Dropping the following features ['ec1174v_2018', 'ec1020i_2018', 'ec3040v_2018', 'sa2013v_2018']
Removed 69 cities
Removed the following correlated feautures ['de1028v_2018', 'ec1010v_2018', 'de1055v_2018', 'class_12220', 'class_12100', 'de1001v_2018']
--------------


In [9]:
figure1

VBox(children=(VBox(children=(IntSlider(value=100, continuous_update=False, description='Min cities', max=400,…

In [11]:
print(new_df_list[-1].shape)
print(new_df_list[-1].columns)

(660, 42)
Index(['urau_code', 'urau_name', '_wgs84x', '_wgs84y', 'city_area_ha',
       'dem_mean', 'imd_percent_2018', 'treecover_percent_2018', 'class_11100',
       'class_11210', 'class_11220', 'class_11230', 'class_11240',
       'class_11300', 'class_12210', 'class_12230', 'class_12300',
       'class_12400', 'class_13100', 'class_13300', 'class_13400',
       'class_14100', 'class_14200', 'class_21000', 'class_22000',
       'class_23000', 'class_24000', 'class_25000', 'class_31000',
       'class_32000', 'class_33000', 'class_40000', 'class_50000',
       'urban_blue_percent', 'urban_green_percent', 'avg_2m_temp_kelvin_2018',
       'number_of_summer_days_2018', 'number_of_tropical_nights_2018',
       'utci_heat_nights_2018', 'coastal_city', 'de1028i_2018',
       'de1055i_2018'],
      dtype='object')


## Clustering
Once the data have been cleaned, we can start the clustering. Tune the different clustering settings to get the optimal result.
It is possible to change the following parameters:
- Feature normalization: choose between `min_max` and `standard` scaling.
- Clustering method: choose between classic k-means clustering (`normal`) and weighted k-means clustering (`weighted`). In the latter case, you must define the weights for each feature. The weights are used when updating the cluster centroids.
- Optimal number of clusters: use the elbow method to determine the optimal number of clusters. The figure below plots the cluster inertia against the number of clusters. Inertia always decreases as clusters are added, so the optimal number is found at the "elbow" of the curve, where adding further clusters no longer reduces the inertia substantially. The red dot on the graph indicates the current choice of number of clusters (`k`).

In [170]:
# Select the columns used for clustering.
# The first four columns (positions 0 to 3: city code, city name and the two
# coordinates) describe the city and are not used as clustering features.
# Comment a name in or out below to change the feature set; a name can only be
# enabled if the data-preparation step above has kept that column.
# print(new_df.columns)
new_df = new_df_list[-1]
final_df = new_df[
    ['urau_code', 'urau_name', '_wgs84x', '_wgs84y', 'city_area_ha',
       'dem_mean', 'imd_percent_2018', 'treecover_percent_2018', 
    #    'class_11100',
    #    'class_11210', 'class_11220', 'class_11230', 'class_11240',
    #    'class_11300', 'class_12100', 'class_12210', 'class_12220',
    #    'class_12230', 'class_12300', 'class_12400', 'class_13100',
    #    'class_13300', 'class_13400', 'class_14100', 'class_14200',
    #    'class_21000', 'class_22000', 'class_23000', 'class_24000',
    #    'class_25000', 'class_31000', 'class_32000', 'class_33000',
    #    'class_40000', 'class_50000', 
       'urban_blue_percent', 'urban_green_percent', 'avg_2m_temp_kelvin_2018',
       'number_of_summer_days_2018', 'number_of_tropical_nights_2018',
       'utci_heat_nights_2018', 
      #  'coastal_city', 
    #    'de1001v_2018', 'ec1010v_2018','ec1020i_2018',
        'de1028i_2018', 'de1055i_2018']]
# Keep everything from position 4 onwards, i.e. drop the city code, city name
# and coordinate columns; the remaining columns are the clustering features.
features = final_df.iloc[:, 4:]

In [185]:
# Convert codes to names
# Human-readable names for the feature codes, used to relabel columns in the
# result tables and plots. Codes without an entry keep their original name.
code_to_name = {
    "ez_code": "environmental zone code",
    "imd_percent_2018": "share of sealed area",
    "treecover_percent_2018": "share of area covered by trees",
    # land-use classes
    "class_11100": "Continuous Urban Fabric (S.L. > 80%)",
    "class_11210": "Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)",
    "class_11220": "Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)",
    "class_11230": "Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)",
    "class_11240": "Discontinuous Very Low Density Urban Fabric (S.L. < 10%)",
    "class_11300": "Isolated Structures",
    "class_12100": "Industrial; commercial; public military and private units",
    "class_12210": "Fast transit roads and associated land",
    "class_12220": "Other roads and associated land",
    "class_12230": "Railways and associated land",
    "class_12300": "Port areas",
    "class_12400": "Airports",
    "class_13100": "Mineral extraction and dump sites",
    "class_13300": "Construction sites",
    "class_13400": "Land without current use",
    "class_14100": "Green urban areas",
    "class_14200": "Sports and leisure facilities",
    "class_21000": "Arable land (annual crops)",
    "class_22000": "Permanent crops (vineyards; fruit trees; olive groves)",
    "class_23000": "Pastures",
    "class_24000": "Complex and mixed cultivation patterns",
    "class_25000": "Orchards",
    "class_31000": "Forests",
    "class_32000": "Herbaceous vegetation association",
    "class_33000": "Open spaces with little or no vegetation",
    "class_40000": "Wetland",
    "class_50000": "Water bodies",
    # socioeconomic indicators
    "de1001v_2018": "Population on the 1st of January, total ",
    "de1028v_2018": "Population on the 1st of January, 65-74 years, total ",
    "de1055v_2018": "Population on the 1st of January, 75 years and over, total ",
    "ec1174v_2018": "Economically active population, 20-64, total ",
    "ec1010v_2018": "Persons unemployed, total ",
    "ec1020i_2018": "Unemployment rate ",
    "ec3040v_2018": "Average disposable annual household income - EUR ",
    "sa2013v_2018": "Number of deaths per year under 65 due to diseases of the circulatory or respiratory systems ",
    "de1028i_2018": "Proportion of population aged 65-74 years ",
    "de1055i_2018": "Proportion of population aged 75 years and over",
}
# Relabel the selected columns in place.
final_df.rename(columns=code_to_name, inplace=True)

In [173]:
print(final_df.shape)
print(features.shape)

(660, 16)
(660, 12)


### Weighted k-means: a revised k-means algorithm that takes feature weights into account
The weights are applied when the cluster centroids are updated.

In [188]:
from src.weighted_kmeans import WeightedKMeans
# Feature weights for the weighted k-means: one weight per clustering feature.
# Start with equal weights ...
feature_weights = np.ones(len(features.columns))
# ... raise the weight of the selected feature ...
weight = 1.5
idx = features.columns.get_loc('treecover_percent_2018')
feature_weights[idx] = weight
# ... and normalize so that the weights sum to 1.
feature_weights = feature_weights / feature_weights.sum()


In [189]:
# clustering
#TODO add select multiple widget
import warnings
# Silences every warning from here on (no category or module filter is given),
# not only those raised by the clustering code.
warnings.filterwarnings('ignore')

# Exclusive upper bound for the number of clusters: the elbow curve is computed
# for k = 1 .. k_max-1, which is also the maximum of the k slider.
k_max = 30
def scaler(features, method='min_max'):
    """Scale every feature column and keep the fitted scaler.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature table.
    method : str
        ``'min_max'`` rescales each column to the range [0, 1] with
        ``MinMaxScaler``; any other value uses ``StandardScaler``.

    Returns
    -------
    norm_df : pandas.DataFrame
        Scaled values with the same index and columns as ``features``.
    scaler_f
        The fitted scaler object.
    """
    scaler_f = MinMaxScaler(feature_range=(0, 1)) if method == 'min_max' else StandardScaler()
    scaled_values = scaler_f.fit_transform(features)
    norm_df = pd.DataFrame(scaled_values, index=features.index, columns=features.columns)
    return norm_df, scaler_f
def clustering(normalized_features, method='normal', k=8, feature_weights=None):
    """Fit a k-means model with ``k`` clusters and return it.

    Parameters
    ----------
    normalized_features
        Scaled feature matrix passed to ``fit``.
    method : str
        ``'normal'`` uses scikit-learn's ``KMeans`` (with ``n_init=10``); any
        other value uses the project's ``WeightedKMeans``.
    k : int
        Number of clusters.
    feature_weights
        Passed to ``WeightedKMeans.fit`` as second argument; ignored for
        ``'normal'``.

    Returns
    -------
    The fitted model (seed 42, at most 100 iterations).
    """
    use_plain_kmeans = method == 'normal'
    kmeans = KMeans(n_init=10) if use_plain_kmeans else WeightedKMeans()

    # Both model types are configured through the same attributes.
    kmeans.random_state = 42
    kmeans.n_clusters = k
    kmeans.max_iter = 100

    if use_plain_kmeans:
        kmeans.fit(normalized_features)
    else:
        kmeans.fit(normalized_features, feature_weights)
    return kmeans
def elbow_method(normalized_features, k_max=k_max, method='normal', feature_weights=None):
    """Compute the inertia curve used by the elbow method.

    Parameters
    ----------
    normalized_features
        Scaled feature matrix passed to ``fit``.
    k_max : int
        Exclusive upper bound: models are fitted for k = 1 .. k_max-1. The
        default is the module-level ``k_max`` at the time this function is
        defined.
    method : str
        ``'normal'`` uses ``KMeans`` (``n_init=10``); any other value uses
        ``WeightedKMeans``.
    feature_weights
        Passed to ``WeightedKMeans.fit``; ignored for ``'normal'``.

    Returns
    -------
    list
        ``inertia_`` of the fitted model for each k, in increasing order of k.
    """
    use_plain_kmeans = method == 'normal'
    # One model object is reused for every k; only n_clusters changes.
    # NOTE(review): unlike clustering(), max_iter is left at the model default
    # here — confirm that the difference is intended.
    kmeans = KMeans(n_init=10) if use_plain_kmeans else WeightedKMeans()
    kmeans.random_state = 42

    inertia = []
    for k in range(1, k_max):
        kmeans.n_clusters = k
        if use_plain_kmeans:
            kmeans.fit(normalized_features)
        else:
            kmeans.fit(normalized_features, feature_weights)
        inertia.append(kmeans.inertia_)
    return inertia

#set up controllers
k_controller = widgets.IntSlider(value=8, min=1, max=k_max-1, step=1, description="Optimal k", continuous_update=False, style=style)
norm_method_controller = widgets.Dropdown(
    options=['min_max', 'standard'],
    value='min_max',
    description='Choose normalization method',
    disabled=False,
    style=style
)
cluster_method_controller = widgets.Dropdown(
    options=['normal', 'weighted'],
    value='normal',
    description='Choose clustering method',
    disabled=False,
    style=style
)

controller_container2 = widgets.VBox([norm_method_controller, cluster_method_controller, k_controller])

# initialize the figure
# Every initial setting is read from the controllers, so that the first plot
# always agrees with what the widgets display (previously the marker position
# was hard-coded to k=8 independently of the slider).
norm_df, fitted_scaler = scaler(features, method=norm_method_controller.value)
normalized_features = norm_df.values
inertia = elbow_method(normalized_features, method=cluster_method_controller.value, feature_weights=feature_weights)

# Explicit copy: a new column is added below, and that must neither depend on
# nor affect `final_df`.
cluster_df = final_df[['urau_code', 'urau_name', '_wgs84x', '_wgs84y']].copy()
cluster_df['cluster'] = clustering(normalized_features,
                                   method=cluster_method_controller.value,
                                   k=k_controller.value, feature_weights=feature_weights).labels_
layout = go.Layout(
    autosize=False,
    width=500,
    height=500,
    margin=go.layout.Margin(l=50, r=50, b=100, t=100, pad=4),
)
# Trace 0: inertia for k = 1 .. k_max-1.
fig2 = go.FigureWidget(
    data=go.Scatter(name="Inertia", x=list(range(1, k_max)), y=inertia),
    layout=layout)
# Trace 1: marker at the currently selected k (inertia[0] belongs to k=1).
initial_k = k_controller.value
fig2.add_trace(go.Scatter(name="Number of clusters", x=[initial_k], y=[inertia[initial_k-1]], mode='markers'))
fig2.update_xaxes(title_text='number of clusters')
fig2.update_yaxes(title_text='inertia')
# bind controllers and figure
# The list receives the same `cluster_df` object after each update; the last
# entry is read by the result cells.
cluster_df_list = [cluster_df]


def response2(change):
    """Recompute the elbow curve and the cluster labels from the controllers.

    Updates both traces of ``fig2`` and the ``cluster`` column of
    ``cluster_df``. ``change`` is supplied by ``observe`` and is not used.
    """
    print("start update")
    cluster_method = cluster_method_controller.value
    n_clusters = k_controller.value

    # NOTE(review): norm_df and normalized_features are local to this function,
    # so the module-level variables of the same names keep the scaling computed
    # when the cell was first run — confirm that this is intended.
    norm_df, _ = scaler(features, method=norm_method_controller.value)
    normalized_features = norm_df.values

    inertia2 = elbow_method(normalized_features, method=cluster_method, feature_weights=feature_weights)
    fitted_model = clustering(normalized_features,
                              method=cluster_method,
                              k=n_clusters, feature_weights=feature_weights)
    cluster_df['cluster'] = fitted_model.labels_

    with fig2.batch_update():
        fig2.data[0].y = inertia2
        # Move the marker to the selected k; inertia2[0] belongs to k=1.
        fig2.data[1].x = [n_clusters]
        fig2.data[1].y = [inertia2[n_clusters-1]]
    cluster_df_list.append(cluster_df)
    print("finish update")


for controller in (norm_method_controller, cluster_method_controller, k_controller):
    controller.observe(response2, names="value")
figure2 = widgets.VBox([controller_container2, fig2])

start update
finish update


In [190]:
figure2

VBox(children=(VBox(children=(Dropdown(description='Choose normalization method', options=('min_max', 'standar…

## Results interpretation
### Geographical interpretation

In [221]:
from ipyleaflet import Map, GeoData, LegendControl
from ipywidgets import HTML
import geopandas

# Map of all cities, coloured by cluster.
m = Map(center=(46.91, 7.43), zoom=4, layout=widgets.Layout(height='700px'))
cmap_hex = ["#FF5733", "#C70039", "#900C3F", "#581845", "#FFC300", "#DAF7A6", "#884EA0", "#2471A3", "#2E4053", "#1B4F72", "#186A3B", "#A569BD", "#5D6D7E", "#AEB6BF", "#212F3D", "#283747", "#1B2631", "#515A5A", "#17202A", "#6E2C00"]

cluster_df_final = cluster_df_list[-1]
gdf = geopandas.GeoDataFrame(
    cluster_df_final, geometry=geopandas.points_from_xy(cluster_df_final._wgs84x, cluster_df_final._wgs84y))

# One map layer per cluster label, 0 .. highest label present.
n_map_clusters = int(np.max(cluster_df_final.cluster)) + 1
for cluster_id in range(n_map_clusters):
    # The palette has fewer colours than the k slider allows clusters, so wrap
    # around instead of failing with an IndexError for the higher labels.
    color = cmap_hex[cluster_id % len(cmap_hex)]
    geo_data = GeoData(geo_dataframe = gdf[gdf.cluster==cluster_id],
        point_style={'color': color, 'fillColor': color, 'radius': 5, 'fillOpacity': 0.8,'weight': 3},
        name = 'Release')
    message = HTML()
    message.value = f'Cluster #{cluster_id}'
    geo_data.popup = message
    m.add(geo_data)

# Legend entry for every selected cluster, using the same wrap-around rule as
# the layers (slicing the palette would silently omit clusters beyond its end).
legend_colors = {label: cmap_hex[label % len(cmap_hex)] for label in range(k_controller.value)}
legend = LegendControl(legend_colors, title="Clusters", position="bottomright")
m.add(legend)
m

Map(center=[46.91, 7.43], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_ou…

### Transformed feature space
How well are the cities separated?

In [192]:
from sklearn.manifold import TSNE
# Project the normalized features to three dimensions with t-SNE (fixed seed).
tsne = TSNE(n_components=3, random_state=42)
tsne_features = tsne.fit_transform(normalized_features)

# 3D scatter plot of the t-SNE embedding.
# The axis and scene objects come from go.layout / go.layout.scene; the
# top-level go.Scene, go.XAxis, go.YAxis and go.ZAxis are deprecated aliases.
layout = go.Layout(
    showlegend=True,
    scene=go.layout.Scene(
        xaxis=go.layout.scene.XAxis(title='t-SNE Feature 1'),
        yaxis=go.layout.scene.YAxis(title='t-SNE Feature 2'),
        zaxis=go.layout.scene.ZAxis(title='t-SNE Feature 3')
    ),
    margin=go.layout.Margin(l=10, r=10, b=10, t=10, pad=4),
    height=500
)

figure3 = go.Figure(layout=layout)

for cluster_label in range(k_controller.value):
    # Wrap around the palette: pairing the labels with a slice of the palette
    # would silently skip every cluster beyond the number of colours.
    color = cmap_hex[cluster_label % len(cmap_hex)]
    # Row positions of the cities that belong to the current cluster
    cluster_indices = np.where(cluster_df_final.cluster == cluster_label)[0]
    figure3.add_trace(go.Scatter3d(name = f"Cluster {cluster_label}",
                                   x=tsne_features[cluster_indices, 0],
                                   y=tsne_features[cluster_indices, 1],
                                   z=tsne_features[cluster_indices, 2],
                                   mode='markers',
                                   marker=dict(size=5,
                                               color=color,
                                               opacity=0.5,)))

figure3.show()

### Cluster "profile"

In [235]:

# Attach the cluster label and the city name to the normalized features.
# They become the LAST TWO columns of norm_df; the code below and the
# feature-importance cells rely on that when they slice columns[:-2].
norm_df["cluster"] = cluster_df_final["cluster"]
norm_df["urau_name"] = cluster_df_final["urau_name"]
# Replace feature codes by their readable names (codes without an entry in
# code_to_name are left unchanged).
norm_df.rename(columns=code_to_name, inplace=True)
def make_boxplot(fig, df, ncluster):
    """Redraw ``fig`` with one box per feature for the cities of one cluster.

    Parameters
    ----------
    fig
        Plotly figure to redraw; its existing traces are removed.
    df : pandas.DataFrame
        Feature table with a ``cluster`` column. Its last two columns are not
        plotted.
    ncluster
        Cluster label to display.
    """
    fig.data = []
    members = df.loc[df.cluster == ncluster]
    feature_columns = members.columns[:-2]
    for feature_name in feature_columns:
        box = go.Box(y=members[feature_name], name=feature_name, line={"color": "blue"})
        fig.add_trace(box)
    fig.update_xaxes(title_text=f'Cluster #{ncluster}')

# Dropdown listing the cluster labels 0 .. k-1 for the k selected when this
# cell is run.
ncluster_controller = widgets.Dropdown(
    description='Choose cluster',
    options=range(k_controller.value),
    value=0,
    disabled=False,
    style=style,
)

layout = go.Layout(
    height=800,
    margin=go.layout.Margin(l=50, r=50, b=100, t=100, pad=4),
)
fig4 = go.FigureWidget(layout=layout)

# Draw the profile of the initially selected cluster.
make_boxplot(fig4, norm_df, ncluster_controller.value)
fig4.update_layout(showlegend=False)


# bind controllers and figure
def response4(change):
    """Redraw the box plots for the cluster chosen in the dropdown."""
    print("start update")
    make_boxplot(fig4, norm_df, ncluster_controller.value)
    print("finish update")


ncluster_controller.observe(response4, names="value")
figure4 = widgets.VBox([ncluster_controller, fig4])

start update
finish update
start update
finish update
start update
finish update
start update
finish update


In [236]:
figure4

VBox(children=(Dropdown(description='Choose cluster', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), styl…

### Most influencing features
Which feature has the most influence on each cluster?

In [216]:
from src.kmeans_feature_imp import KMeansInterp
# Fit the project's KMeansInterp on the normalized feature matrix, with the
# number of clusters currently selected in the k slider.
# The feature names are all norm_df columns except the last two.
# NOTE(review): this presumably trains its own k-means model (300 iterations,
# no weights), so its clusters may not match the labels in cluster_df —
# confirm against src.kmeans_feature_imp.
kmeans_I = KMeansInterp(
        n_clusters=k_controller.value,
        ordered_feature_names=norm_df.columns[:-2].tolist(),
        max_iter=300,
        feature_importance_method='wcss_min',  # or 'unsup2sup'
    ).fit(normalized_features)

In [217]:
# Build a features x clusters table of importance values.
feature_names = norm_df.columns[:-2]
fi_df = pd.DataFrame(index=feature_names)
for cluster in kmeans_I.feature_importances_:
    # Each entry unpacks into the feature labels and their importance values.
    ranked_features, importances = kmeans_I.feature_importances_[cluster]
    cluster_column = pd.DataFrame(importances, index=ranked_features, columns=[cluster])
    # Index-on-index merge lines the values up with the rows of fi_df.
    fi_df = fi_df.merge(cluster_column, left_index=True, right_index=True)

In [226]:
# Heatmap of the feature importances: clusters on the x axis (labelled at the
# top), features on the y axis, colour range fixed to [0, 1].
importance_heatmap = go.Heatmap(
    z=fi_df.to_numpy(),
    x=fi_df.columns,
    y=fi_df.index,
    zmin=0,
    zmax=1,
    colorscale=px.colors.sequential.Blues,
)
# NOTE(review): the height scales with the number of columns (clusters) while
# the y axis lists the index (features) — confirm which one is intended.
layout = go.Layout(
    height=len(fi_df.columns)*30,
    margin=go.layout.Margin(l=10, r=10, b=10, t=10, pad=4),
    xaxis_nticks=k_controller.value*2,
)
fig = go.Figure(data=importance_heatmap, layout=layout)
fig.update_xaxes(side="top")
fig.show()