In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
%matplotlib inline
import sys
sys.path.append("/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code")
np.set_printoptions(precision=5, suppress=True)

In [3]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/4_DNA_8values_v2.csv'

df_DNA = pd.read_csv(ds_path)

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plotClustersRadar(df, column, columns, save_path=None, plot_cols=5, row_height=300):
  titles = [f"{i}" for i in df.index]
  cols = plot_cols
  rows = int(np.ceil(len(df.index)/cols))
  #print(f"rows: {rows}, cols: {cols}")
  fig = make_subplots(rows=rows, cols=cols,
                      specs=[[{'type': 'polar'}]*cols]*rows,
                      horizontal_spacing=0.5/cols,
                      vertical_spacing=0.3/rows,
                      subplot_titles=titles,
                      )
  polar_args = {}
  for i in range(rows):
    for j in range(cols):
      #print(f"i: {i+1}, j: {j+1}")
      if i*cols+j < len(df.index):
        fig.add_trace(go.Scatterpolar(
                            r=df.iloc[i*cols+j],
                            theta=columns,
                            fill='toself',
                            name=f'Class {df.index[i*cols+j]}'), 
                        row=i+1, col=j+1
                      )
        polar_args[f"polar{i*cols+j+1}"] = dict(
                                              radialaxis=dict(
                                                visible=True,
                                                range=[0.0, 1.0]
                                              )
                                            )
  fig.update_layout(
      height=row_height*rows,
      showlegend=False,
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  fig.show()

# Entire dataset

In [5]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

In [6]:
algo='HC'
metric = 'VDM'
vars = 6
regions = False
if vars != 7:
  regions = False
if_regions = "_regions" if regions else ""

base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"

radars_path = f'{pictures_path}RadarCharts/'

In [None]:
# may cause RAM issues
distance_matrix = np.load(f"/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/Data/distance_matrix_{metric}{vars}{if_regions}.npy")

## Get cluster labels

In [None]:
from metrics import silhouette
import importlib
importlib.reload(silhouette)

<module 'metrics.silhouette' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/metrics/silhouette.py'>

In [None]:
from metrics.silhouette import getSilhouette

def postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing=False):
  file_type = ".npy"
  filename = f"{algo}_{metric}_{link}{file_type}"
  HC_path = f'{HC_base_path}{metric}{vars}{if_regions}/{filename}'

  Z = {}
  Z[metric] = np.load(HC_path)

  cluster_labels = fcluster(Z[metric], k , criterion='maxclust')
  silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)

  return silhouette_avg, sample_silhouette_values, cluster_labels

In [None]:
df_DNA_clusters = df_DNA.copy()

# Radar Chart of Entire dataset wrt target

In [None]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

In [None]:
cleanup_nums = {"Concern_environmental_impacts":
                  {
                      '1': 1,
                      '2': 2,
                      '3': 3,
                      '4': 4,
                      '5': 5,
                      '6': 6,
                      '7': 7,
                      '8': 8,
                      '9': 9,
                      '10': 10,
                      "Don't know": 0,
                  },
                "grouped_Age":
                  {
                      '16:23': 19.5,
                      '23:27': 25,
                      '27:30': 28.5,
                      '30:35': 32.5,
                      '35:39': 37,
                      '39:43': 41,
                      '43:47': 45,
                      '47:51': 49,
                      '51:54': 52.5,
                      '54:85': 69.5
                  },
                "Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  },
                "Gender":
                  {
                      'Female': 0,
                      'Male': 1
                  },
                #"Considering_electric_or_hybrid_vehicle_next_purchase":
                #  {
                #      "Don't know/no answer":0,
                #      'Certainly not':1,
                #      'Probably not':2,                   
                #      'Maybe yes maybe not':3,
                #      'Probably yes':4,
                #      'Certainly yes':5
                #  }
                }

In [None]:
categorical_map = {"Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  }}


In [None]:
df_DNA_clusters.columns

Index(['InternetUsers', 'Concern_environmental_impacts', 'grouped_Age',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Gender', 'grouped_Region_3',
       'Country'],
      dtype='object')

In [None]:
numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
categorical_cols = ["Would_subsribe_car_sharing_if_available", "Preference_tolls_or_traffic_limitation"]
additioncal_categorical_cols = ["grouped_Region_3", "Country"]
target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
country_region_answers = df[additioncal_categorical_cols]
df = df[numerical_cols+categorical_cols+[target_col]]
df_numerical = df[numerical_cols]

In [None]:
reverse_categorical_map = {}
for key in categorical_cols:
  reverse_categorical_map[key] = {value : key for (key, value) in categorical_map[key].items()}
reverse_categorical_map

{'Preference_tolls_or_traffic_limitation': {0: 'No preferences',
  1: 'Probably more acceptable to limit road traffic',
  2: 'Probably more acceptable to pay for less congestion',
  3: 'Definitely more acceptable to pay for less congestion',
  4: 'Definitely more acceptable to limit road traffic'},
 'Would_subsribe_car_sharing_if_available': {0: "Don't know / No answer",
  1: 'No, I would not be interested in this service',
  2: 'Maybe yes, maybe not. I would need to test the service before taking a decision',
  3: 'Yes without any influence on my car ownership',
  4: 'Yes, instead of purchasing a new car',
  5: 'Yes and I would give up one car I currently own',
  6: "Yes I'm already client of a car sharing service"}}

In [None]:
categorical_map_abbreviated = {"Would_subsribe_car_sharing_if_available":
                {
                    "Don't know / No answer": "Don't know",
                    'No, I would not be interested in this service': "No",
                    'Maybe yes, maybe not. I would need to test the service before taking a decision': "Maybe, test",
                    'Yes without any influence on my car ownership': "Yes, no car influence",
                    'Yes, instead of purchasing a new car': "Yes, no new car",
                    'Yes and I would give up one car I currently own': "Yes, give up car",
                    "Yes I'm already client of a car sharing service": "Yes, already client"
                },
              "Preference_tolls_or_traffic_limitation":
                {
                    'No preferences': "No pref.",
                    'Probably more acceptable to limit road traffic': "Prob. limit traffic",
                    'Probably more acceptable to pay for less congestion': "Prob. pay",
                    'Definitely more acceptable to pay for less congestion': "Def. pay",
                    'Definitely more acceptable to limit road traffic': "Def. limit traffic"
                }
}

In [None]:
df = df.replace(cleanup_nums)

In [None]:
x = df[categorical_cols].values
categorical_min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = categorical_min_max_scaler.fit_transform(x)
df_categorical = pd.DataFrame(x_scaled)
df_categorical.columns = categorical_cols
df_categorical

Unnamed: 0,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation
0,0.333333,0.00
1,0.333333,0.25
2,0.166667,0.00
3,0.000000,0.50
4,0.333333,0.00
...,...,...
26600,0.666667,0.50
26601,0.000000,0.00
26602,0.333333,0.00
26603,0.666667,0.25


In [None]:
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Region_3,Country
0,0.758621,0.4,0.59,0.333333,0.00,0.0,0.039604,0.037037
1,0.689655,0.8,0.11,0.333333,0.25,1.0,0.455446,0.333333
2,0.551724,0.6,0.59,0.166667,0.00,1.0,0.089109,0.185185
3,0.931034,0.8,1.00,0.000000,0.50,1.0,0.841584,1.000000
4,0.327586,0.8,0.35,0.333333,0.00,1.0,0.742574,0.777778
...,...,...,...,...,...,...,...,...
26600,0.551724,0.2,0.18,0.666667,0.50,0.0,0.079208,0.148148
26601,0.551724,0.4,0.26,0.000000,0.00,0.0,0.079208,0.148148
26602,0.551724,0.7,0.26,0.333333,0.00,1.0,0.079208,0.148148
26603,0.551724,0.7,0.35,0.666667,0.25,1.0,0.079208,0.148148


In [None]:
df_DNA_clusters[categorical_cols] = categorical_min_max_scaler.inverse_transform(df_DNA_clusters[categorical_cols])
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Region_3,Country
0,0.758621,0.4,0.59,2.0,0.0,0.0,0.039604,0.037037
1,0.689655,0.8,0.11,2.0,1.0,1.0,0.455446,0.333333
2,0.551724,0.6,0.59,1.0,0.0,1.0,0.089109,0.185185
3,0.931034,0.8,1.00,0.0,2.0,1.0,0.841584,1.000000
4,0.327586,0.8,0.35,2.0,0.0,1.0,0.742574,0.777778
...,...,...,...,...,...,...,...,...
26600,0.551724,0.2,0.18,4.0,2.0,0.0,0.079208,0.148148
26601,0.551724,0.4,0.26,0.0,0.0,0.0,0.079208,0.148148
26602,0.551724,0.7,0.26,2.0,0.0,1.0,0.079208,0.148148
26603,0.551724,0.7,0.35,4.0,1.0,1.0,0.079208,0.148148


In [None]:
df_DNA_clusters = df_DNA_clusters.replace(reverse_categorical_map)

In [None]:
df_DNA_clusters = df_DNA_clusters.replace(categorical_map_abbreviated)

In [None]:
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Region_3,Country
0,0.758621,0.4,0.59,"Maybe, test",No pref.,0.0,0.039604,0.037037
1,0.689655,0.8,0.11,"Maybe, test",Prob. limit traffic,1.0,0.455446,0.333333
2,0.551724,0.6,0.59,No,No pref.,1.0,0.089109,0.185185
3,0.931034,0.8,1.00,Don't know,Prob. pay,1.0,0.841584,1.000000
4,0.327586,0.8,0.35,"Maybe, test",No pref.,1.0,0.742574,0.777778
...,...,...,...,...,...,...,...,...
26600,0.551724,0.2,0.18,"Yes, no new car",Prob. pay,0.0,0.079208,0.148148
26601,0.551724,0.4,0.26,Don't know,No pref.,0.0,0.079208,0.148148
26602,0.551724,0.7,0.26,"Maybe, test",No pref.,1.0,0.079208,0.148148
26603,0.551724,0.7,0.35,"Yes, no new car",Prob. limit traffic,1.0,0.079208,0.148148


In [None]:
df = df.replace(reverse_categorical_map).replace(categorical_map_abbreviated)

In [None]:
x = df[numerical_cols].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_numerical = pd.DataFrame(x_scaled)
df_numerical.columns = numerical_cols

In [None]:
df[numerical_cols] = df_numerical
df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Gender,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.758621,0.5,0.59,0.0,"Maybe, test",No pref.,Maybe yes maybe not
1,0.689655,0.8,0.11,1.0,"Maybe, test",Prob. limit traffic,Probably not
2,0.551724,0.6,0.59,1.0,No,No pref.,Certainly not
3,0.931034,0.8,1.00,1.0,Don't know,Prob. pay,Maybe yes maybe not
4,0.327586,0.8,0.35,1.0,"Maybe, test",No pref.,Probably yes
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,0.0,"Yes, no new car",Prob. pay,Don't know/no answer
26601,0.551724,0.5,0.26,0.0,Don't know,No pref.,Don't know/no answer
26602,0.551724,0.7,0.26,1.0,"Maybe, test",No pref.,Maybe yes maybe not
26603,0.551724,0.7,0.35,1.0,"Yes, no new car",Prob. limit traffic,Probably yes


In [None]:
column = target_col
df_numerical[column] = df[column]

columns = ["IUsers", "EnvImpact", "Age", "Gender"]
df_DNA_grouped = df_numerical.groupby(column)
df_radar = df_DNA_grouped.mean()
filename = 'entire_dataset_target.html'
save_path = f'{radars_path}{filename}'
plot_cols = 3
row_height = 400
#plotClustersRadar(df_radar, column, columns, save_path, plot_cols, row_height)

# Plot radar and hist

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
import matplotlib.cm as cm

def plotSilhouette(fig, n_clusters, silhouette_avg, sample_silhouette_values, cluster_labels):

  silhouette_row = 1
  silhouette_col = 2
  print("For n_clusters =", n_clusters,
        "The average silhouette_score is :", silhouette_avg)


  x_lower = 10

  for i in np.unique(cluster_labels):
      # Aggregate the silhouette scores for samples belonging to
      # cluster i, and sort them
      ith_cluster_silhouette_values = \
          sample_silhouette_values[cluster_labels == i]

      ith_cluster_silhouette_values.sort()

      size_cluster_i = ith_cluster_silhouette_values.shape[0]
      x_upper = x_lower + size_cluster_i

      #colors = plt.cm.Spectral(cluster_labels.astype(float) / n_clusters)
      
      filled_area = go.Scatter(x=np.arange(x_lower, x_upper),
                                y=ith_cluster_silhouette_values,
                                mode='lines',
                                name=str(i),
                                showlegend=True,
                                line=dict(width=0.5,
                                        #color=colors
                                        ),
                                fill='tozeroy')
      fig.add_trace(filled_area, silhouette_row, silhouette_col)
      
      # Compute the new y_lower for next plot
      x_lower = x_upper + 10  # 10 for the 0 samples

  # The 1st subplot is the silhouette plot
  # The silhouette coefficient can range from -1, 1 but in this example all
  # lie within [-0.1, 1]
  fig.update_yaxes(title_text='The silhouette coefficient values',
                   row=silhouette_row, col=silhouette_col,
                   range=[-1, 1])

  # The (n_clusters+1)*10 is for inserting blank space between silhouette
  # plots of individual clusters, to demarcate them clearly.
  fig.update_xaxes(title_text='Cluster label',
                   row=silhouette_row, col=silhouette_col,
                   #showticklabels=False,
                   range=[0, len(df_DNA) + (n_clusters + 1) * 10])

  # The vertical line for average silhouette score of all the values
  axis_line = go.Scatter(y=[silhouette_avg]*100,
                         x=np.linspace(0, len(df_DNA), 100,),
                         showlegend=True,
                         name='silhouette avg',
                         mode='lines',
                         line=dict(color="red", dash='dash',
                                   width =1) )

  fig.add_trace(axis_line, silhouette_row, silhouette_col)

  return fig
    

In [None]:
import plotly.express as px # for colors

def plot_radar(fig, polar_args, r, theta, i,):
  fig.add_trace(go.Scatterpolar(
                        r=r,
                        theta=theta,
                        showlegend=False,
                        fill='toself',
                        line_color=px.colors.qualitative.Plotly[i%len(px.colors.qualitative.Plotly)]), 
                    row=i+1, col=1
                  )
  polar_args[f"polar{i}"] = dict(radialaxis=dict(visible=True,
                                                      range=[0.0, 1.0]
                                                      )
  )

  return fig, polar_args

def plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors):

  sorted_answers = df[categorical_col].unique()
  if (categorical_col in to_sort_cols):
    top_answers = x[categorical_col].value_counts().index[:topN]
    x = x.loc[x[categorical_col].isin(top_answers)]
    sorted_answers = x[categorical_col].value_counts().index
  
  if (categorical_col in fixed_order_col_answers.keys()):
    sorted_answers = fixed_order_col_answers[categorical_col]
    
  #print(df_mean.index[sorted_index[i]])
  fig.add_trace(go.Histogram(x=x[categorical_col],
                              name=categorical_col,
                              histnorm='probability',
                              showlegend=False,
                              marker_color=marker_colors[j]), 
                row=i+1, col=j+2,
                )
  
  fig.update_yaxes(range=[0, 1], row=i+1, col=j+2)
  fig.update_xaxes(categoryorder="array", 
                    categoryarray=sorted_answers,
                    autorange=False,
                    tickangle=90,
                    tickfont=dict(size=fontsize),
                    row=i+1, col=j+2)
  return fig

def plot_radar_hist(df, group_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14

  # check if plot by clusters
  is_clusters = True if group_col == "cluster" else False

  # Group by group_col attribute
  sorted_index = df.groupby(group_col,as_index=False).count().sort_values(by=df.columns[0],ascending=False).index
  print(sorted_index)
  df_grouped = df.groupby(group_col)
  df_mean = df_grouped.mean()
  
  cluster_counts = df["cluster"].value_counts().to_numpy()
  print(f"cluser sizes: {cluster_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+2

  k = len(df['cluster'].unique())
  # define the titles of each subplot
  titles = [f"k={k} clusters with silhouette average: {silhouette_avg}"]
  for ix, i in enumerate(sorted_index):
    #print(df_mean.index[i])
    cluster_name = f"{df_mean.index[i]} ({cluster_counts[ix]})"
    titles.append(cluster_name)
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])

  # define the type of each column for each row
  specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs += [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows-1)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )
  
  # add silhouette row
  fig = plotSilhouette(fig, k, silhouette_avg, sample_silhouette_values, cluster_labels)

  polar_args = {}
  for i in range(1,rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean.iloc[sorted_index[i-1]]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[group_col] == df_mean.index[sorted_index[i-1]]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, 
                    i, j, topN=-1, fontsize=fontsize, marker_colors=marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
def plot_all():
  column = "cluster"
  df_DNA_clusters[target_col] = df[target_col]
  df_DNA_clusters[additioncal_categorical_cols] = country_region_answers
  fixed_order_col_answers = {
    'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
    'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
    'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
    }
  theta = ["IUsers", "EnvImpact", "Age", "Gender"]
  filename = f'radar_hist_{link}_{k}clusters{is_postprocessing}.html'
  save_path = f'{radars_path}{metric}{vars}{if_regions}/{filename}'
  row_height = 500
  plot_categorical_cols = []
  plot_categorical_cols += categorical_cols
  plot_categorical_cols += additioncal_categorical_cols
  plot_categorical_cols += [target_col]
  sorted_cols = additioncal_categorical_cols
  plot_radar_hist(df_DNA_clusters, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                  row_height, topN=10)

### Run here

In [None]:
k_per_linkage = {'weighted': [5]}
postprocessing_values = [False, True]
for link in k_per_linkage.keys():
  print(f">>>> link = {link}")
  for k in k_per_linkage[link]:
    print(f">> k = {k} clusters")
    for postprocessing in postprocessing_values:
      print(f">>>>>> postprocessing = {postprocessing} ")
      is_postprocessing = "_fix" if postprocessing else ""
      silhouette_avg, sample_silhouette_values, cluster_labels = postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing)
      df_DNA_clusters["cluster"] = cluster_labels
      plot_all()
      

>>>> link = weighted
>> k = 5 clusters
>>>>>> postprocessing = False 
Ignoring clusters of size less than 200:
[]
For n_clusters = 5 The average silhouette_score is : 0.4202002404029098
Int64Index([1, 2, 0, 4, 3], dtype='int64')
cluser sizes: [16436  7252  1590   792   535]
For n_clusters = 5 The average silhouette_score is : 0.4202002404029098
>>>>>> postprocessing = True 
Ignoring clusters of size less than 200:
[]
For n_clusters = 5 The average silhouette_score is : 0.4202002404029098
Ignoring clusters of size less than 200:
[]
0.4202561413161783
To be fixed len:  (0,)
While fix: For n_clusters = 5 The average silhouette_score is : 0.4202561413161783
[1 2 3 4 5]
Ignoring clusters of size less than 200:
[]
>>> After fix: For n_clusters = 5 (including outliers) The average silhouette_score is : 0.4202561413161783
Int64Index([1, 2, 0, 4, 3], dtype='int64')
cluser sizes: [16438  7250  1590   792   535]
For n_clusters = 5 The average silhouette_score is : 0.4202561413161783


In [None]:
df_DNA_clusters["cluster"].value_counts()

2    16438
3     7250
1     1590
5      792
4      535
Name: cluster, dtype: int64

In [None]:
df_DNA_clusters.loc[df_DNA_clusters["cluster"] == 7, "grouped_Age"].value_counts()

Series([], Name: grouped_Age, dtype: int64)

#### 6 vars, Weighted 5 vs 11 k

```
2    16436
3     7252
1     1590
5      792
4      535
Name: cluster, dtype: int64
4     10018
7      7249
5      6418
1      1347
11      427
10      365
8       295
9       240
3       222
2        21
6         3
Name: cluster, dtype: int64
```

Let's keep clusters 3,1,5,4 from k=5, and try to use clusters 4 and 5 from k=11 instead of cluster 2 from k=5 (the biggest one). Sizes match.

In [None]:
metric = 'VDM'
link = 'weighted'
k_to_merge = [5, 11]
clusters_for_k = {
    5: [3, 1, 5, 4],
    11: [4, 5]
}
new_k = '5+11=6'

silhouette_avg = {}
sample_silhouette_values = {}
cluster_labels = {}

postprocessing = False
is_postprocessing = '_fix' if postprocessing else ''
for k in k_to_merge:
  silhouette_avg[k], sample_silhouette_values[k], cluster_labels[k] = postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing)


cluster_labels[new_k] = np.zeros(len(df_DNA_clusters), dtype=np.int)
k_sum = 0
for k, clusters in clusters_for_k.items():
  mask = np.isin(cluster_labels[k], clusters)
  print(k, np.unique(cluster_labels[k][mask]))
  # k_sum to avoid overlapping clusters
  cluster_labels[new_k][mask] = cluster_labels[k][mask] + k_sum
  k_sum += k
cluster_labels = cluster_labels[new_k]
k = new_k

Ignoring clusters of size less than 200:
[]
For n_clusters = 5 The average silhouette_score is : 0.4202002404029098
Ignoring clusters of size less than 200:
[2 6]
For n_clusters = 11 The average silhouette_score is : 0.3078870370016914
5 [1 3 4 5]
11 [4 5]


In [None]:
for postprocessing in [False, True]:
  is_postprocessing = '_fix' if postprocessing else ''

  silhouette_avg, sample_silhouette_values, cluster_labels = \
    getSilhouette(distance_matrix, cluster_labels, postprocessing)
  df_DNA_clusters['cluster'] = cluster_labels

  plot_all()

Ignoring clusters of size less than 200:
[]
For n_clusters = 6 The average silhouette_score is : 0.3291953796605434
Int64Index([4, 1, 5, 0, 3, 2], dtype='int64')
cluser sizes: [10018  7252  6418  1590   792   535]
For n_clusters = 6 The average silhouette_score is : 0.3291953796605434
Ignoring clusters of size less than 200:
[]
For n_clusters = 6 The average silhouette_score is : 0.3291953796605434
Ignoring clusters of size less than 200:
[]
0.33982710290003
To be fixed len:  (124,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.3414895082036176
[ 1  3  4  5  9 10]
Ignoring clusters of size less than 200:
[]
0.3386773535298855
To be fixed len:  (61,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.33948965232652456
[ 1  3  4  5  9 10]
Ignoring clusters of size less than 200:
[-1]
>>> After fix: For n_clusters = 7 (including outliers) The average silhouette_score is : 0.33948965232652456
Int64Index([5, 2, 6, 1, 4, 3, 0], dtype='int64')
cluser sizes: 