In [None]:
import umap
import umap.plot
import hdbscan
import pandas as pd
import numpy as np

from bokeh.plotting import show as show_bokeh
umap.plot.output_notebook()

In [None]:
# Load the synthetic dataset; the path is relative to the kernel's working directory.
# Every column is later fed to UMAP and used as a model feature, so the file is
# presumably made up of numeric columns only — TODO confirm against the CSV.
fake_data = pd.read_csv('data/synthetic_data.csv')

In [None]:
# Build one hover label per row: the names of the columns whose value is
# non-zero, joined with ' | ' (a row with no non-zero entries yields '').
column_names = np.array(fake_data.columns)

# Walk the underlying NumPy rows instead of DataFrame.iterrows(): iterrows()
# wraps every row in a fresh Series only for it to be converted straight back
# to an array, which is needlessly slow. The non-zero test itself is unchanged,
# and the comprehension keeps loop temporaries out of the notebook namespace.
non_zero_values = [
    ' | '.join(column_names[row.nonzero()])
    for row in fake_data.to_numpy()
]

In [None]:
%%time
embedding = umap.UMAP(metric='cosine').fit(fake_data)

In [None]:
# Cluster the points in the UMAP embedding space (not the original feature space).
# min_cluster_size=20 is a tuning choice for this dataset.
# NOTE(review): HDBSCAN normally labels unclustered points -1 — confirm how
# ClusterExplainer treats that label before interpreting per-cluster results.
clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
clusterer.fit(embedding.embedding_)

In [None]:
# Hover table for the interactive plot: one row per sample, holding the
# ' | '-joined non-zero column names and the sample's HDBSCAN cluster label.
hover_df = pd.DataFrame({
    'supported_teams': non_zero_values,
    'cluster': clusterer.labels_,
})

In [None]:
# Number of samples per cluster label. Use this to see which labels exist
# before picking a cluster id in the explanation cells further down.
hover_df['cluster'].value_counts()

In [None]:
# Interactive Bokeh view of the fitted embedding, labelled by cluster;
# hover_df supplies the per-point hover data.
f = umap.plot.interactive(
    embedding,
    labels=hover_df['cluster'],
    hover_data=hover_df,
)
show_bokeh(f)

### Try using interpret-clusters

In [None]:
from interpret_clusters import ClusterExplainer
from interpret import show

In [None]:
# Inputs shared by every ClusterExplainer below: the column names, the raw
# feature matrix in the same column order, and one cluster label per row.
feature_names = fake_data.columns.tolist()
features = fake_data.to_numpy()
cluster_labels = clusterer.labels_

### Use the default model, which is an Explainable Boosting Machine

In [None]:
# No classifier is passed, so ClusterExplainer falls back to its default model
# (an Explainable Boosting Machine, per the heading of this section).
ce_ebm = ClusterExplainer(
    features=features,
    cluster_labels=cluster_labels,
    feature_names=feature_names,
    verbose=True,
)

##### Look at the local explanations for a specific cluster

In [None]:
# Local explanations for the cluster labelled 0, rendered with interpret's `show`.
# NOTE(review): assumes HDBSCAN produced a cluster 0 — check the value_counts output.
ebm_local = ce_ebm.cluster_local_explanations(0)
show(ebm_local)

##### Look at the global explanations for a specific cluster

In [None]:
# Global explanations for the cluster labelled 0, rendered with interpret's `show`.
# NOTE(review): assumes HDBSCAN produced a cluster 0 — check the value_counts output.
ebm_global = ce_ebm.cluster_global_explanations(0)
show(ebm_global)

##### Train a model for each cluster and calculate all the local explanations

In [None]:
%%time
# Compute the local explanations for every cluster in one go; timed because,
# per the section heading, this trains a model for each cluster.
ce_ebm.calculate_all_local_explanations()

In [None]:
# Look at the local explanations for one particular cluster (label 0 here).
# NOTE(review): presumably served from the results computed by
# calculate_all_local_explanations() — confirm in interpret_clusters.
show(ce_ebm.cluster_local_explanations(0))

##### Train a model for each cluster and calculate all the global explanations

In [None]:
%%time
# Compute the global explanations for every cluster in one go; timed because,
# per the section heading, this trains a model for each cluster.
ce_ebm.calculate_all_global_explanations()

In [None]:
# Look at the global explanations for one particular cluster (label 1 here).
# NOTE(review): assumes HDBSCAN produced a cluster 1 — check the value_counts output.
show(ce_ebm.cluster_global_explanations(1))

### Try using a logistic_regression model

In [None]:
# Same inputs as the EBM explainer, but select the built-in logistic regression
# model by name instead of relying on the default classifier.
ce_lr = ClusterExplainer(
    features=features,
    cluster_labels=cluster_labels,
    feature_names=feature_names,
    classifier='logistic_regression',
)

In [None]:
# Local explanations for the cluster labelled 0 from the logistic-regression explainer.
# NOTE(review): assumes HDBSCAN produced a cluster 0 — check the value_counts output.
show(ce_lr.cluster_local_explanations(0))

### Pass in a custom classifier

In [None]:
from interpret.glassbox import LogisticRegression

In [None]:
# Build interpret's glassbox LogisticRegression with an L1 penalty (liblinear
# solver) and a fixed seed, then hand the instance to ClusterExplainer instead
# of naming a built-in classifier.
lr = LogisticRegression(
    random_state=42,
    feature_names=feature_names,
    penalty='l1',
    solver='liblinear',
)

ce_custom = ClusterExplainer(
    features=features,
    cluster_labels=cluster_labels,
    feature_names=feature_names,
    classifier=lr,
)

In [None]:
# Global explanations for the cluster labelled 1 from the custom-classifier explainer.
# NOTE(review): assumes HDBSCAN produced a cluster 1 — check the value_counts output.
show(ce_custom.cluster_global_explanations(1))