In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
numpy_file_type = ".npy"
image_file_type = ".html"

import sys
sys.path.append(code_path)
np.set_printoptions(precision=5, suppress=True)

In [7]:
vars = 5

In [8]:
ds_path = f'{dataset_path}4_DNA_{vars}values_normalized.csv'

df_DNA = pd.read_csv(ds_path)

In [9]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plotClustersRadar(df, column, columns, save_path=None, plot_cols=5, row_height=300):
  titles = [f"{i}" for i in df.index]
  cols = plot_cols
  rows = int(np.ceil(len(df.index)/cols))
  #print(f"rows: {rows}, cols: {cols}")
  fig = make_subplots(rows=rows, cols=cols,
                      specs=[[{'type': 'polar'}]*cols]*rows,
                      horizontal_spacing=0.5/cols,
                      vertical_spacing=0.3/rows,
                      subplot_titles=titles,
                      )
  polar_args = {}
  for i in range(rows):
    for j in range(cols):
      #print(f"i: {i+1}, j: {j+1}")
      if i*cols+j < len(df.index):
        fig.add_trace(go.Scatterpolar(
                            r=df.iloc[i*cols+j],
                            theta=columns,
                            fill='toself',
                            name=f'Class {df.index[i*cols+j]}'), 
                        row=i+1, col=j+1
                      )
        polar_args[f"polar{i*cols+j+1}"] = dict(
                                              radialaxis=dict(
                                                visible=True,
                                                range=[0.0, 1.0]
                                              )
                                            )
  fig.update_layout(
      height=row_height*rows,
      showlegend=False,
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  fig.show()

# Entire dataset

In [10]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

In [14]:
algo='HC'
metric = 'VDM'
std = True
if metric != "HVDM":
  std = False
if_std = '_std' if std else ''
vars = 6
regions = False
if vars != 7:
  regions = False
if_regions = "_regions" if regions else ""

base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"

radars_path = f'{pictures_path}RadarCharts/'

In [15]:
# may cause RAM issues
distance_matrix = np.load(f"/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/Data/distance_matrix_{metric}{vars}{if_regions}{if_std}.npy")

## Get cluster labels

In [16]:
from metrics import silhouette
import importlib
importlib.reload(silhouette)

<module 'metrics.silhouette' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/metrics/silhouette.py'>

In [17]:
from metrics.silhouette import getSilhouette

def postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing=False):
  file_type = ".npy"
  filename = f"{algo}_{metric}_{link}{file_type}"
  HC_path = f'{HC_base_path}{metric}{vars}{if_regions}{if_std}/{filename}'

  Z = {}
  Z[metric] = np.load(HC_path)

  cluster_labels = fcluster(Z[metric], k , criterion='maxclust')
  silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)

  return silhouette_avg, sample_silhouette_values, cluster_labels

In [18]:
df_DNA_clusters = df_DNA.copy()

# Radar Chart of Entire dataset wrt target

In [19]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

In [20]:
cleanup_nums = {"Concern_environmental_impacts":
                  {
                      '1': 1,
                      '2': 2,
                      '3': 3,
                      '4': 4,
                      '5': 5,
                      '6': 6,
                      '7': 7,
                      '8': 8,
                      '9': 9,
                      '10': 10,
                      "Don't know": 0,
                  },
                "grouped_Age":
                  {
                      '16:23': 19.5,
                      '23:27': 25,
                      '27:30': 28.5,
                      '30:35': 32.5,
                      '35:39': 37,
                      '39:43': 41,
                      '43:47': 45,
                      '47:51': 49,
                      '51:54': 52.5,
                      '54:85': 69.5
                  },
                "Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  },
                "Gender":
                  {
                      'Female': 0,
                      'Male': 1
                  },
                #"Considering_electric_or_hybrid_vehicle_next_purchase":
                #  {
                #      "Don't know/no answer":0,
                #      'Certainly not':1,
                #      'Probably not':2,                   
                #      'Maybe yes maybe not':3,
                #      'Probably yes':4,
                #      'Certainly yes':5
                #  }
                }

In [21]:
categorical_map = {"Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  }}


In [22]:
df_DNA_clusters.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country'],
      dtype='object')

In [23]:
numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
categorical_cols = ["Would_subsribe_car_sharing_if_available", "Preference_tolls_or_traffic_limitation"]
additioncal_categorical_cols = ["grouped_Region_3", "Country"]
target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
country_region_answers = df[additioncal_categorical_cols]
df = df[numerical_cols+categorical_cols+[target_col]]
df_numerical = df[numerical_cols]

In [24]:
reverse_categorical_map = {}
for key in categorical_cols:
  reverse_categorical_map[key] = {value : key for (key, value) in categorical_map[key].items()}
reverse_categorical_map

{'Preference_tolls_or_traffic_limitation': {0: 'No preferences',
  1: 'Probably more acceptable to limit road traffic',
  2: 'Probably more acceptable to pay for less congestion',
  3: 'Definitely more acceptable to pay for less congestion',
  4: 'Definitely more acceptable to limit road traffic'},
 'Would_subsribe_car_sharing_if_available': {0: "Don't know / No answer",
  1: 'No, I would not be interested in this service',
  2: 'Maybe yes, maybe not. I would need to test the service before taking a decision',
  3: 'Yes without any influence on my car ownership',
  4: 'Yes, instead of purchasing a new car',
  5: 'Yes and I would give up one car I currently own',
  6: "Yes I'm already client of a car sharing service"}}

In [25]:
categorical_map_abbreviated = {"Would_subsribe_car_sharing_if_available":
                {
                    "Don't know / No answer": "Don't know",
                    'No, I would not be interested in this service': "No",
                    'Maybe yes, maybe not. I would need to test the service before taking a decision': "Maybe, test",
                    'Yes without any influence on my car ownership': "Yes, no car influence",
                    'Yes, instead of purchasing a new car': "Yes, no new car",
                    'Yes and I would give up one car I currently own': "Yes, give up car",
                    "Yes I'm already client of a car sharing service": "Yes, already client"
                },
              "Preference_tolls_or_traffic_limitation":
                {
                    'No preferences': "No pref.",
                    'Probably more acceptable to limit road traffic': "Prob. limit traffic",
                    'Probably more acceptable to pay for less congestion': "Prob. pay",
                    'Definitely more acceptable to pay for less congestion': "Def. pay",
                    'Definitely more acceptable to limit road traffic': "Def. limit traffic"
                }
}

In [26]:
df = df.replace(cleanup_nums)

In [27]:
x = df[categorical_cols].values
categorical_min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = categorical_min_max_scaler.fit_transform(x)
df_categorical = pd.DataFrame(x_scaled)
df_categorical.columns = categorical_cols
df_categorical

Unnamed: 0,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation
0,0.333333,0.00
1,0.333333,0.25
2,0.166667,0.00
3,0.000000,0.50
4,0.333333,0.00
...,...,...
26600,0.666667,0.50
26601,0.000000,0.00
26602,0.333333,0.00
26603,0.666667,0.25


In [28]:
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country
0,0.758621,0.4,0.333333,0.50,0.037037
1,0.689655,0.8,0.333333,0.75,0.333333
2,0.551724,0.6,0.000000,0.50,0.185185
3,0.931034,0.8,0.166667,0.25,1.000000
4,0.327586,0.8,0.333333,0.50,0.777778
...,...,...,...,...,...
26600,0.551724,0.2,0.666667,0.25,0.148148
26601,0.551724,0.4,0.166667,0.50,0.148148
26602,0.551724,0.7,0.333333,0.50,0.148148
26603,0.551724,0.7,0.666667,0.75,0.148148


In [29]:
df_DNA_clusters[categorical_cols] = categorical_min_max_scaler.inverse_transform(df_DNA_clusters[categorical_cols])
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country
0,0.758621,0.4,2.0,2.0,0.037037
1,0.689655,0.8,2.0,3.0,0.333333
2,0.551724,0.6,0.0,2.0,0.185185
3,0.931034,0.8,1.0,1.0,1.000000
4,0.327586,0.8,2.0,2.0,0.777778
...,...,...,...,...,...
26600,0.551724,0.2,4.0,1.0,0.148148
26601,0.551724,0.4,1.0,2.0,0.148148
26602,0.551724,0.7,2.0,2.0,0.148148
26603,0.551724,0.7,4.0,3.0,0.148148


In [30]:
df_DNA_clusters = df_DNA_clusters.replace(reverse_categorical_map)

In [31]:
df_DNA_clusters = df_DNA_clusters.replace(categorical_map_abbreviated)

In [32]:
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country
0,0.758621,0.4,"Maybe, test",Prob. pay,0.037037
1,0.689655,0.8,"Maybe, test",Def. pay,0.333333
2,0.551724,0.6,Don't know,Prob. pay,0.185185
3,0.931034,0.8,No,Prob. limit traffic,1.000000
4,0.327586,0.8,"Maybe, test",Prob. pay,0.777778
...,...,...,...,...,...
26600,0.551724,0.2,"Yes, no new car",Prob. limit traffic,0.148148
26601,0.551724,0.4,No,Prob. pay,0.148148
26602,0.551724,0.7,"Maybe, test",Prob. pay,0.148148
26603,0.551724,0.7,"Yes, no new car",Def. pay,0.148148


In [33]:
df = df.replace(reverse_categorical_map).replace(categorical_map_abbreviated)

In [34]:
x = df[numerical_cols].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_numerical = pd.DataFrame(x_scaled)
df_numerical.columns = numerical_cols

In [35]:
df[numerical_cols] = df_numerical
df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Gender,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.758621,0.5,0.59,0.0,"Maybe, test",No pref.,Maybe yes maybe not
1,0.689655,0.8,0.11,1.0,"Maybe, test",Prob. limit traffic,Probably not
2,0.551724,0.6,0.59,1.0,No,No pref.,Certainly not
3,0.931034,0.8,1.00,1.0,Don't know,Prob. pay,Maybe yes maybe not
4,0.327586,0.8,0.35,1.0,"Maybe, test",No pref.,Probably yes
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,0.0,"Yes, no new car",Prob. pay,Don't know/no answer
26601,0.551724,0.5,0.26,0.0,Don't know,No pref.,Don't know/no answer
26602,0.551724,0.7,0.26,1.0,"Maybe, test",No pref.,Maybe yes maybe not
26603,0.551724,0.7,0.35,1.0,"Yes, no new car",Prob. limit traffic,Probably yes


In [36]:
column = target_col
df_numerical[column] = df[column]

columns = ["IUsers", "EnvImpact", "Age", "Gender"]
df_DNA_grouped = df_numerical.groupby(column)
df_radar = df_DNA_grouped.mean()
filename = 'entire_dataset_target.html'
save_path = f'{radars_path}{filename}'
plot_cols = 3
row_height = 400
#plotClustersRadar(df_radar, column, columns, save_path, plot_cols, row_height)

# Plot radar and hist

In [37]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
import matplotlib.cm as cm

def plotSilhouette(fig, n_clusters, silhouette_avg, sample_silhouette_values, cluster_labels, silhouette_row=1, silhouette_col=2):

  print("For n_clusters =", n_clusters,
        "The average silhouette_score is :", silhouette_avg)


  x_lower = 10

  for i in np.unique(cluster_labels):
      # Aggregate the silhouette scores for samples belonging to
      # cluster i, and sort them
      ith_cluster_silhouette_values = \
          sample_silhouette_values[cluster_labels == i]

      ith_cluster_silhouette_values.sort()

      size_cluster_i = ith_cluster_silhouette_values.shape[0]
      x_upper = x_lower + size_cluster_i

      #colors = plt.cm.Spectral(cluster_labels.astype(float) / n_clusters)
      
      filled_area = go.Scatter(x=np.arange(x_lower, x_upper),
                                y=ith_cluster_silhouette_values,
                                mode='lines',
                                name=str(i),
                                showlegend=True,
                                line=dict(width=0.5,
                                        #color=colors
                                        ),
                                fill='tozeroy')
      fig.add_trace(filled_area, silhouette_row, silhouette_col)
      
      # Compute the new y_lower for next plot
      x_lower = x_upper + 10  # 10 for the 0 samples

  # The 1st subplot is the silhouette plot
  # The silhouette coefficient can range from -1, 1 but in this example all
  # lie within [-0.1, 1]
  fig.update_yaxes(title_text='The silhouette coefficient values',
                   row=silhouette_row, col=silhouette_col,
                   range=[-1, 1])

  # The (n_clusters+1)*10 is for inserting blank space between silhouette
  # plots of individual clusters, to demarcate them clearly.
  fig.update_xaxes(title_text='Cluster label',
                   row=silhouette_row, col=silhouette_col,
                   #showticklabels=False,
                   range=[0, len(df_DNA) + (n_clusters + 1) * 10])

  # The vertical line for average silhouette score of all the values
  axis_line = go.Scatter(y=[silhouette_avg]*100,
                         x=np.linspace(0, len(df_DNA), 100,),
                         showlegend=True,
                         name='silhouette avg',
                         mode='lines',
                         line=dict(color="red", dash='dash',
                                   width =1) )

  fig.add_trace(axis_line, silhouette_row, silhouette_col)

  return fig
    

In [38]:
k = 11
link = 'weighted'
postprocessing = False
is_postprocessing = "_fix" if postprocessing else ""
silhouette_avg, sample_silhouette_values, cluster_labels = postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing)
df_DNA_clusters["cluster"] = cluster_labels

column = "cluster"
df_DNA_clusters[target_col] = df[target_col]
df_DNA_clusters[categorical_cols] = df[categorical_cols]
df_DNA_clusters[additioncal_categorical_cols] = country_region_answers
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'silhouette_{link}_{k}clusters{is_postprocessing}.html'
save_path = f'{radars_path}{metric}{vars}{if_regions}/{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
plot_categorical_cols += [target_col]
sorted_cols = additioncal_categorical_cols

sil_title = f"k={k} clusters with silhouette average: {silhouette_avg}"
sil_fig = make_subplots(rows=1, cols=1, subplot_titles=[sil_title])
sil_fig = plotSilhouette(sil_fig, k, silhouette_avg, sample_silhouette_values, cluster_labels, silhouette_row=1, silhouette_col=1)
sil_fig.write_html(save_path)
sil_fig.show()

For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818


In [39]:
import plotly.express as px # for colors

def plot_radar(fig, polar_args, r, theta, i,):
  fig.add_trace(go.Scatterpolar(
                        r=r,
                        theta=theta,
                        showlegend=False,
                        fill='toself',
                        line_color=px.colors.qualitative.Plotly[i%len(px.colors.qualitative.Plotly)]), 
                    row=i+1, col=1
                  )
  polar_args[f"polar{i}"] = dict(radialaxis=dict(visible=True,
                                                      range=[0.0, 1.0]
                                                      )
  )

  return fig, polar_args

def plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors):

  sorted_answers = df[categorical_col].unique()
  if (categorical_col in to_sort_cols):
    top_answers = x[categorical_col].value_counts().index[:topN]
    x = x.loc[x[categorical_col].isin(top_answers)]
    sorted_answers = x[categorical_col].value_counts().index
  
  if (categorical_col in fixed_order_col_answers.keys()):
    sorted_answers = fixed_order_col_answers[categorical_col]
    
  #print(df_mean.index[sorted_index[i]])
  fig.add_trace(go.Histogram(x=x[categorical_col],
                              name=categorical_col,
                              histnorm='probability',
                              showlegend=False,
                              marker_color=marker_colors[j]), 
                row=i+1, col=j+2,
                )
  
  fig.update_yaxes(range=[0, 1], row=i+1, col=j+2)
  fig.update_xaxes(categoryorder="array", 
                    categoryarray=sorted_answers,
                    autorange=False,
                    tickangle=90,
                    tickfont=dict(size=fontsize),
                    row=i+1, col=j+2)
  return fig

def plot_radar_hist(df, group_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14

  # check if plot by clusters
  is_clusters = True if group_col == "cluster" else False

  # Group by group_col attribute
  sorted_index = df.groupby(group_col,as_index=False).count().sort_values(by=df.columns[0],ascending=False).index
  print(sorted_index)
  df_grouped = df.groupby(group_col)
  df_mean = df_grouped.mean()
  
  cluster_counts = df["cluster"].value_counts().to_numpy()
  print(f"cluser sizes: {cluster_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+2

  k = len(df['cluster'].unique())
  # define the titles of each subplot
  titles = [f"k={k} clusters with silhouette average: {silhouette_avg}"]
  for ix, i in enumerate(sorted_index):
    #print(df_mean.index[i])
    cluster_name = f"{df_mean.index[i]} ({cluster_counts[ix]})"
    titles.append(cluster_name)
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])

  # define the type of each column for each row
  specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs += [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows-1)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )
  
  # add silhouette row
  fig = plotSilhouette(fig, k, silhouette_avg, sample_silhouette_values, cluster_labels)

  polar_args = {}
  for i in range(1,rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean.iloc[sorted_index[i-1]]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[group_col] == df_mean.index[sorted_index[i-1]]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, 
                    i, j, topN=-1, fontsize=fontsize, marker_colors=marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [40]:
def plot_all():
  column = "cluster"
  df_DNA_clusters[target_col] = df[target_col]
  df_DNA_clusters[categorical_cols] = df[categorical_cols]
  df_DNA_clusters[additioncal_categorical_cols] = country_region_answers
  fixed_order_col_answers = {
    'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
    'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
    'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
    }
  theta = ["IUsers", "EnvImpact", "Age", "Gender"]
  filename = f'radar_hist_{link}_{k}clusters{is_postprocessing}.html'
  save_path = f'{radars_path}{metric}{vars}{if_regions}{if_std}/{filename}'
  row_height = 500
  plot_categorical_cols = []
  plot_categorical_cols += categorical_cols
  plot_categorical_cols += additioncal_categorical_cols
  plot_categorical_cols += [target_col]
  sorted_cols = additioncal_categorical_cols
  plot_radar_hist(df_DNA_clusters, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                  row_height, topN=10)

### Run here

In [41]:
k_per_linkage = {'weighted': [11]}
postprocessing_values = [False, True]
for link in k_per_linkage.keys():
  print(f">> link = {link}")
  for k in k_per_linkage[link]:
    print(f">>>> k = {k} clusters")
    for postprocessing in postprocessing_values:
      print(f">>>>>> postprocessing = {postprocessing} ")
      is_postprocessing = "_fix" if postprocessing else ""
      silhouette_avg, sample_silhouette_values, cluster_labels = postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing)
      df_DNA_clusters["cluster"] = cluster_labels
      plot_all()

>> link = weighted
>>>> k = 11 clusters
>>>>>> postprocessing = False 
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
Int64Index([8, 6, 5, 4, 7, 9, 0, 1, 10, 3, 2], dtype='int64')
cluser sizes: [10933  5478  3068  2722  1438   966   788   608   514    48    42]
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
>>>>>> postprocessing = True 
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
To be fixed len:  (1108,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.2755365883559065
[ 1  2  3  4  5  6  7  8  9 10 11]
To be fixed len:  (692,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.27980685930204496
[ 1  2  3  4  5  6  7  8  9 10 11]
To be fixed len:  (539,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.28258271382786454
[ 1  2  3  4  5  6  7  8  9 10 11]
>>> After fix: For n_clusters = 10 (including outliers). The average silhouette_score is : 0.29118

# Plot radar and hist per country

In [42]:
column = "Country"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'entire_dataset_country_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
plot_radar_hist(df, column, numerical_cols, categorical_cols+[target_col], theta, save_path, row_height)

KeyError: ignored

# Plot radar and hist for pairs of clusters

In [None]:
column = "cluster"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
c1 = 48
c2 = 49
filename = f'entire_dataset_clusters({c1},{c2})_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
df_compare = df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c1])].append(df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c2])])
plot_radar_hist(df_compare, column, numerical_cols, categorical_cols, theta, save_path, row_height)

# Plot DNA wrt Target (Multi-class)

In [None]:
def plot_radar_hist_target(df, target_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14


  # Group by group_col attribute
  df_grouped = df.groupby(target_col)
  df_mean = df_grouped.mean()
  
  class_counts = {key: value for key, value in zip(df[target_col].value_counts().index, df[target_col].value_counts())}
  #print(f"class sizes: {class_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+1


  # define the titles of each subplot
  titles = []
  target_answer_order = fixed_order_col_answers[target_col]
  for answer in target_answer_order:
    titles.append(f"{answer} ({class_counts[answer]})")
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])

  # define the type of each column for each row
  #specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs = [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )

  polar_args = {}
  for i in range(rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean[df_mean.index.str.startswith(target_answer_order[i])].values[0]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[target_col] == target_answer_order[i]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, -1, fontsize, marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_multiclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)

# Plot DNA wrt Target (Binary-class)

In [None]:
df_2 = df[df[target_col] != 'Maybe yes maybe not']
df_2 = df_2[df_2[target_col] != "Don't know/no answer"]

target_map = {
    "Probably yes": "YES",
    "Certainly yes": "YES",
    "Probably not": "NO",
    "Certainly not": "NO",
}
df_2_fin = df_2.copy()
df_2_fin[target_col] = df_2[target_col].replace(target_map)

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['NO', 'YES']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_binaryclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df_2_fin, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)