In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
numpy_file_type = ".npy"
image_file_type = ".html"

import sys
sys.path.append(code_path)
np.set_printoptions(precision=5, suppress=True)

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plotClustersRadar(df, column, columns, save_path=None, plot_cols=5, row_height=300):
  titles = [f"{i}" for i in df.index]
  cols = plot_cols
  rows = int(np.ceil(len(df.index)/cols))
  #print(f"rows: {rows}, cols: {cols}")
  fig = make_subplots(rows=rows, cols=cols,
                      specs=[[{'type': 'polar'}]*cols]*rows,
                      horizontal_spacing=0.5/cols,
                      vertical_spacing=0.3/rows,
                      subplot_titles=titles,
                      )
  polar_args = {}
  for i in range(rows):
    for j in range(cols):
      #print(f"i: {i+1}, j: {j+1}")
      if i*cols+j < len(df.index):
        fig.add_trace(go.Scatterpolar(
                            r=df.iloc[i*cols+j],
                            theta=columns,
                            fill='toself',
                            name=f'Class {df.index[i*cols+j]}'), 
                        row=i+1, col=j+1
                      )
        polar_args[f"polar{i*cols+j+1}"] = dict(
                                              radialaxis=dict(
                                                visible=True,
                                                range=[0.0, 1.0]
                                              )
                                            )
  fig.update_layout(
      height=row_height*rows,
      showlegend=False,
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  fig.show()

# Entire dataset

In [8]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

In [12]:
df.assign(cluster=cluster_labels)

Unnamed: 0,Country,Gender,Education,Profession,Work_status,Household_members,Income_level,Location_of_resudence,Centre_or_suburbs,Public_transport_service,Car_driving_license,Considering_electric_or_hybrid_vehicle_next_purchase,Know_what_car_sharing_is,Would_subsribe_car_sharing_if_available,Most_frequent_trip_Walk,Most_frequent_trip_Bicycle,Most_frequent_trip_Car_as_Driver,Most_frequent_trip_Car_as_Passenger,Most_frequent_trip_Train,Most_frequent_trip_Underground_or_light_train,Most_frequent_trip_Tram,Most_frequent_trip_Bus,Most_frequent_trip_Motorcycle_or_moped,Destination_most_frequent_trip,Frequency_most_frequent_trip,Frequent_trip_distance,Concern_environmental_impacts,Preference_tolls_or_traffic_limitation,grouped_Frequent_trip_duration_in_minutes,grouped_Region_3,InternetUsers,grouped_Number_vehicles_in_household,grouped_Age,cluster
0,Belgium,Female,Upper secondary (high school or similar);,housewife,Not Employed,two,lower middle,Metropolitan area of a big city with more than...,in the suburbs,Well served by public transport,Yes,Maybe yes maybe not,Yes,"Maybe yes, maybe not. I would need to test the...",No,No,Yes,No,No,No,No,No,No,…It is outside an urban area,Make this trip every day/ every working day of...,3-5 KM,5,No preferences,20:21,BE2,75,1,47:51,46
1,France,Male,"Tertiary and higher (University degree, PhD or...",unemployed,Not Employed,four,low,Small or medium town (less than 250.000 inhabi...,in the centre of the city,Difficult to reach with public transport,Yes,Probably not,Yes,"Maybe yes, maybe not. I would need to test the...",No,No,Yes,No,No,No,No,No,No,"…In an urban area, different from where I live",Make this trip 2-3 days per week,11-20 KM,8,Probably more acceptable to limit road traffic,20:21,FRG,71,3,23:27,91
2,Czech Republic,Male,"Tertiary and higher (University degree, PhD or...",other employed worker,Employed,four,middle,Large city (from 250.000 to 1.000.000 inhabita...,in the suburbs,Well served by public transport,Yes,Certainly not,Yes,"No, I would not be interested in this service",Yes,Yes,Yes,No,No,No,No,No,No,"…In an urban area, same as where I live",Make this trip every day/ every working day of...,less than 3 KM,6,No preferences,20:21,CZ0,63,1,47:51,91
3,Sweden,Male,"Tertiary and higher (University degree, PhD or...",teacher/lecturer,Employed,four,higher middle,Metropolitan area of a big city with more than...,in the suburbs,Relatively served by public transport,Yes,Maybe yes maybe not,Unsure/ no answer,Don't know / No answer,No,No,No,No,No,Yes,No,No,No,"…In an urban area, same as where I live",Make this trip every day/ every working day of...,11-20 KM,8,Probably more acceptable to pay for less conge...,60:75,SE1,85,1,54:85,91
4,Poland,Male,Upper secondary (high school or similar);,manual worker/agricultural worker/farmer,Employed,four,higher middle,Metropolitan area of a big city with more than...,in the suburbs,Well served by public transport,No,Probably yes,No,"Maybe yes, maybe not. I would need to test the...",No,No,No,No,No,No,No,No,Yes,"…In an urban area, same as where I live",Make this trip every day/ every working day of...,11-20 KM,8,No preferences,30:31,PL6,50,1,35:39,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26600,Cyprus,Female,"Tertiary and higher (University degree, PhD or...",middle manager,Employed,three,lower middle,Rural area,Rural area,Relatively served by public transport,Yes,Don't know/no answer,No,"Yes, instead of purchasing a new car",No,No,Yes,No,No,No,No,No,No,"…In an urban area, different from where I live",Make this trip every day/ every working day of...,31-50 KM,3,Probably more acceptable to pay for less conge...,40:60,CY,63,3,27:30,91
26601,Cyprus,Female,"Tertiary and higher (University degree, PhD or...",middle manager,Employed,four,middle,Small or medium town (less than 250.000 inhabi...,in the suburbs,Not served by public transport,Yes,Don't know/no answer,No,Don't know / No answer,No,No,Yes,No,No,No,No,No,No,"…In an urban area, same as where I live",Make this trip 2-3 days per week,21-30 KM,5,No preferences,20:21,CY,63,2,30:35,91
26602,Cyprus,Male,Upper secondary (high school or similar);,middle manager,Employed,two,lower middle,Small or medium town (less than 250.000 inhabi...,in the suburbs,Relatively served by public transport,Yes,Maybe yes maybe not,Yes,"Maybe yes, maybe not. I would need to test the...",No,No,Yes,No,No,No,No,No,No,"…In an urban area, same as where I live",Make this trip every day/ every working day of...,21-30 KM,7,No preferences,20:21,CY,63,2,30:35,91
26603,Cyprus,Male,"Tertiary and higher (University degree, PhD or...",middle manager,Employed,one (myself),middle,Small or medium town (less than 250.000 inhabi...,in the suburbs,Relatively served by public transport,Yes,Probably yes,No,"Yes, instead of purchasing a new car",No,No,Yes,No,No,No,No,No,No,"…In an urban area, same as where I live",Make this trip every day/ every working day of...,6-10 KM,7,Probably more acceptable to limit road traffic,20:21,CY,63,1,35:39,91


In [14]:
df.columns

Index(['Country', 'Gender', 'Education', 'Profession', 'Work_status',
       'Household_members', 'Income_level', 'Location_of_resudence',
       'Centre_or_suburbs', 'Public_transport_service', 'Car_driving_license',
       'Considering_electric_or_hybrid_vehicle_next_purchase',
       'Know_what_car_sharing_is', 'Would_subsribe_car_sharing_if_available',
       'Most_frequent_trip_Walk', 'Most_frequent_trip_Bicycle',
       'Most_frequent_trip_Car_as_Driver',
       'Most_frequent_trip_Car_as_Passenger', 'Most_frequent_trip_Train',
       'Most_frequent_trip_Underground_or_light_train',
       'Most_frequent_trip_Tram', 'Most_frequent_trip_Bus',
       'Most_frequent_trip_Motorcycle_or_moped',
       'Destination_most_frequent_trip', 'Frequency_most_frequent_trip',
       'Frequent_trip_distance', 'Concern_environmental_impacts',
       'Preference_tolls_or_traffic_limitation',
       'grouped_Frequent_trip_duration_in_minutes', 'grouped_Region_3',
       'InternetUsers', 'grouped_Nu

In [19]:
categorical_cols = ["InternetUsers", 
                  "Concern_environmental_impacts",
                  "Would_subsribe_car_sharing_if_available", 
                  "Preference_tolls_or_traffic_limitation",
                  "Country"]
if vars == 6:
  categorical_cols.append("Location_of_resudence")

target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
df_DNA = df[categorical_cols].copy()
df_DNAt = df_DNA.copy()
df_DNAt[target_col] = df[target_col]

In [20]:
df_DNA.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country',
       'Location_of_resudence'],
      dtype='object')

In [21]:
df_DNAt.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country',
       'Location_of_resudence',
       'Considering_electric_or_hybrid_vehicle_next_purchase'],
      dtype='object')

In [None]:
# numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
# categorical_cols = ["Would_subsribe_car_sharing_if_available", "Preference_tolls_or_traffic_limitation"]
# additioncal_categorical_cols = ["grouped_Region_3", "Country"]
# target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
# country_region_answers = df[additioncal_categorical_cols]
# df = df[numerical_cols+categorical_cols+[target_col]]
# df_numerical = df[numerical_cols]

In [29]:
DNAt_map_abbreviated = {
  "Would_subsribe_car_sharing_if_available":
  {
      "Don't know / No answer": "Don't know",
      'No, I would not be interested in this service': "No",
      'Maybe yes, maybe not. I would need to test the service before taking a decision': "Maybe, test",
      'Yes without any influence on my car ownership': "Yes, no car influence",
      'Yes, instead of purchasing a new car': "Yes, no new car",
      'Yes and I would give up one car I currently own': "Yes, give up car",
      "Yes I'm already client of a car sharing service": "Yes, already client"
  },
  "Preference_tolls_or_traffic_limitation":
  {
      'No preferences': "No pref.",
      'Probably more acceptable to limit road traffic': "Prob. limit traffic",
      'Probably more acceptable to pay for less congestion': "Prob. pay",
      'Definitely more acceptable to pay for less congestion': "Def. pay",
      'Definitely more acceptable to limit road traffic': "Def. limit traffic"
  },
  "Location_of_resudence":
  {
      'Rural area': "Rural area",
      'Small or medium town (less than 250.000 inhabitants)': "area < 250k",
      'Large city (from 250.000 to 1.000.000 inhabitants)': "250k < area < 1M", 
      'Metropolitan area of a big city with more than 1.000.000  inhabitants': "1M < area",
  },
  "Considering_electric_or_hybrid_vehicle_next_purchase":
  {                 
      "Don't know/no answer": "Don't know",
      'Maybe yes maybe not': "Maybe"
  },
}

In [30]:
DNAt_map_abbreviated_columns = {
    "InternetUsers": "IUsers",
    "Concern_environmental_impacts": "EnvImpact",
    "Would_subsribe_car_sharing_if_available": "CarShare",
    "Preference_tolls_or_traffic_limitation": "TollsTraffic",
    "Location_of_resudence": "Residence",
    "Considering_electric_or_hybrid_vehicle_next_purchase": "EVs"
}

In [33]:
df_DNAt = df_DNAt.replace(DNAt_map_abbreviated)
df_DNAt = df_DNAt.rename(columns=DNAt_map_abbreviated_columns)

In [34]:
df_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs
0,75,5,"Maybe, test",No pref.,Belgium,1M < area,Maybe
1,71,8,"Maybe, test",Prob. limit traffic,France,area < 250k,Probably not
2,63,6,No,No pref.,Czech Republic,250k < area < 1M,Certainly not
3,85,8,Don't know,Prob. pay,Sweden,1M < area,Maybe
4,50,8,"Maybe, test",No pref.,Poland,1M < area,Probably yes
...,...,...,...,...,...,...,...
26600,63,3,"Yes, no new car",Prob. pay,Cyprus,Rural area,Don't know
26601,63,5,Don't know,No pref.,Cyprus,area < 250k,Don't know
26602,63,7,"Maybe, test",No pref.,Cyprus,area < 250k,Maybe
26603,63,7,"Yes, no new car",Prob. limit traffic,Cyprus,area < 250k,Probably yes


## Get cluster labels

Load cluster labels here from file

In [35]:
vars_linkage_cuts = {5:
                  {
                    "average":[[2,3,5,7,9],[2,3,5,7,9,12]],
                    "complete":[[2,4],[2,4,6],[2,4,6,8]],
                    "weighted":[[2,4,6,8],[2,4,6,8,11]],
                    "single":[[2,4,6,14,19,24,28,33,49,86]]
                  },
                6:
                  {
                    "average": [[2,3,5,7,9]],
                    "complete":[[2,3,5]],
                    "weighted":[[2,4,6,8,11,13]],
                    "single":[[2,4,6,15,19,25,29,31,41,59,75,85,124]]
                  }
}

In [36]:
metric = 'VDM'
vars = 6
link = 'single'
postprocessing = True

In [37]:
cut_values = vars_linkage_cuts[vars][link][0]
is_postprocessing = '_fix' if postprocessing else ''
cluster_labels_path = f"{HC_base_path}{metric}{vars}/ClusterLabels/ClusterLabels_{metric}_{link}_{str(cut_values)}{is_postprocessing}.npy"
cluster_labels = np.load(cluster_labels_path)

## Add clusters

In [38]:
dfc_DNAt = df_DNAt.assign(cluster=cluster_labels)

In [39]:
dfc_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs,cluster
0,75,5,"Maybe, test",No pref.,Belgium,1M < area,Maybe,46
1,71,8,"Maybe, test",Prob. limit traffic,France,area < 250k,Probably not,91
2,63,6,No,No pref.,Czech Republic,250k < area < 1M,Certainly not,91
3,85,8,Don't know,Prob. pay,Sweden,1M < area,Maybe,91
4,50,8,"Maybe, test",No pref.,Poland,1M < area,Probably yes,-1
...,...,...,...,...,...,...,...,...
26600,63,3,"Yes, no new car",Prob. pay,Cyprus,Rural area,Don't know,91
26601,63,5,Don't know,No pref.,Cyprus,area < 250k,Don't know,91
26602,63,7,"Maybe, test",No pref.,Cyprus,area < 250k,Maybe,91
26603,63,7,"Yes, no new car",Prob. limit traffic,Cyprus,area < 250k,Probably yes,91


In [None]:
column = target_col
df_numerical[column] = df[column]

columns = ["IUsers", "EnvImpact", "Age", "Gender"]
df_DNA_grouped = df_numerical.groupby(column)
df_radar = df_DNA_grouped.mean()
filename = 'entire_dataset_target.html'
save_path = f'{radars_path}{filename}'
plot_cols = 3
row_height = 400
#plotClustersRadar(df_radar, column, columns, save_path, plot_cols, row_height)

# Plot hist

In [None]:
import plotly.express as px # for colors

def plot_radar(fig, polar_args, r, theta, i,):
  fig.add_trace(go.Scatterpolar(
                        r=r,
                        theta=theta,
                        showlegend=False,
                        fill='toself',
                        line_color=px.colors.qualitative.Plotly[i%len(px.colors.qualitative.Plotly)]), 
                    row=i+1, col=1
                  )
  polar_args[f"polar{i}"] = dict(radialaxis=dict(visible=True,
                                                      range=[0.0, 1.0]
                                                      )
  )

  return fig, polar_args

def plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors):

  sorted_answers = df[categorical_col].unique()
  if (categorical_col in to_sort_cols):
    top_answers = x[categorical_col].value_counts().index[:topN]
    x = x.loc[x[categorical_col].isin(top_answers)]
    sorted_answers = x[categorical_col].value_counts().index
  
  if (categorical_col in fixed_order_col_answers.keys()):
    sorted_answers = fixed_order_col_answers[categorical_col]
    
  #print(df_mean.index[sorted_index[i]])
  fig.add_trace(go.Histogram(x=x[categorical_col],
                              name=categorical_col,
                              histnorm='probability',
                              showlegend=False,
                              marker_color=marker_colors[j]), 
                row=i+1, col=j+2,
                )
  
  fig.update_yaxes(range=[0, 1], row=i+1, col=j+2)
  fig.update_xaxes(categoryorder="array", 
                    categoryarray=sorted_answers,
                    autorange=False,
                    tickangle=90,
                    tickfont=dict(size=fontsize),
                    row=i+1, col=j+2)
  return fig

def plot_radar_hist(df, group_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14

  # check if plot by clusters
  is_clusters = True if group_col == "cluster" else False

  # Group by group_col attribute
  sorted_index = df.groupby(group_col,as_index=False).count().sort_values(by=df.columns[0],ascending=False).index
  print(sorted_index)
  df_grouped = df.groupby(group_col)
  df_mean = df_grouped.mean()
  
  cluster_counts = df["cluster"].value_counts().to_numpy()
  print(f"cluser sizes: {cluster_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+2

  k = len(df['cluster'].unique())
  # define the titles of each subplot
  titles = [f"k={k} clusters with silhouette average: {silhouette_avg}"]
  for ix, i in enumerate(sorted_index):
    #print(df_mean.index[i])
    cluster_name = f"{df_mean.index[i]} ({cluster_counts[ix]})"
    titles.append(cluster_name)
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])

  # define the type of each column for each row
  specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs += [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows-1)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )
  
  # add silhouette row
  fig = plotSilhouette(fig, k, silhouette_avg, sample_silhouette_values, cluster_labels)

  polar_args = {}
  for i in range(1,rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean.iloc[sorted_index[i-1]]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[group_col] == df_mean.index[sorted_index[i-1]]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, 
                    i, j, topN=-1, fontsize=fontsize, marker_colors=marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
def plot_all():
  column = "cluster"
  df_DNA_clusters[target_col] = df[target_col]
  df_DNA_clusters[categorical_cols] = df[categorical_cols]
  df_DNA_clusters[additioncal_categorical_cols] = country_region_answers
  fixed_order_col_answers = {
    'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
    'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
    'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
    }
  theta = ["IUsers", "EnvImpact", "Age", "Gender"]
  filename = f'radar_hist_{link}_{k}clusters{is_postprocessing}.html'
  save_path = f'{radars_path}{metric}{vars}{if_regions}{if_std}/{filename}'
  row_height = 500
  plot_categorical_cols = []
  plot_categorical_cols += categorical_cols
  plot_categorical_cols += additioncal_categorical_cols
  plot_categorical_cols += [target_col]
  sorted_cols = additioncal_categorical_cols
  plot_radar_hist(df_DNA_clusters, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                  row_height, topN=10)

### Run here

In [None]:
k_per_linkage = {'weighted': [11]}
postprocessing_values = [False, True]
for link in k_per_linkage.keys():
  print(f">> link = {link}")
  for k in k_per_linkage[link]:
    print(f">>>> k = {k} clusters")
    for postprocessing in postprocessing_values:
      print(f">>>>>> postprocessing = {postprocessing} ")
      is_postprocessing = "_fix" if postprocessing else ""
      silhouette_avg, sample_silhouette_values, cluster_labels = postprocessClusterLabels(distance_matrix, metric, link, k, postprocessing)
      df_DNA_clusters["cluster"] = cluster_labels
      plot_all()

>> link = weighted
>>>> k = 11 clusters
>>>>>> postprocessing = False 
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
Int64Index([8, 6, 5, 4, 7, 9, 0, 1, 10, 3, 2], dtype='int64')
cluser sizes: [10933  5478  3068  2722  1438   966   788   608   514    48    42]
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
>>>>>> postprocessing = True 
For n_clusters = 11 The average silhouette_score is : 0.22804085279536818
To be fixed len:  (1108,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.2755365883559065
[ 1  2  3  4  5  6  7  8  9 10 11]
To be fixed len:  (692,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.27980685930204496
[ 1  2  3  4  5  6  7  8  9 10 11]
To be fixed len:  (539,)
While fix: For n_clusters = 11 The average silhouette_score is : 0.28258271382786454
[ 1  2  3  4  5  6  7  8  9 10 11]
>>> After fix: For n_clusters = 10 (including outliers). The average silhouette_score is : 0.29118

# Plot radar and hist per country

In [None]:
column = "Country"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'entire_dataset_country_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
plot_radar_hist(df, column, numerical_cols, categorical_cols+[target_col], theta, save_path, row_height)

KeyError: ignored

# Plot radar and hist for pairs of clusters

In [None]:
column = "cluster"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
c1 = 48
c2 = 49
filename = f'entire_dataset_clusters({c1},{c2})_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
df_compare = df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c1])].append(df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c2])])
plot_radar_hist(df_compare, column, numerical_cols, categorical_cols, theta, save_path, row_height)

# Plot DNA wrt Target (Multi-class)

In [None]:
def plot_radar_hist_target(df, target_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14


  # Group by group_col attribute
  df_grouped = df.groupby(target_col)
  df_mean = df_grouped.mean()
  
  class_counts = {key: value for key, value in zip(df[target_col].value_counts().index, df[target_col].value_counts())}
  #print(f"class sizes: {class_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+1


  # define the titles of each subplot
  titles = []
  target_answer_order = fixed_order_col_answers[target_col]
  for answer in target_answer_order:
    titles.append(f"{answer} ({class_counts[answer]})")
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:columnNameCharLimit])

  # define the type of each column for each row
  #specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs = [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )

  polar_args = {}
  for i in range(rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean[df_mean.index.str.startswith(target_answer_order[i])].values[0]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[target_col] == target_answer_order[i]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, -1, fontsize, marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_multiclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)

# Plot DNA wrt Target (Binary-class)

In [None]:
df_2 = df[df[target_col] != 'Maybe yes maybe not']
df_2 = df_2[df_2[target_col] != "Don't know/no answer"]

target_map = {
    "Probably yes": "YES",
    "Certainly yes": "YES",
    "Probably not": "NO",
    "Certainly not": "NO",
}
df_2_fin = df_2.copy()
df_2_fin[target_col] = df_2[target_col].replace(target_map)

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['NO', 'YES']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_binaryclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df_2_fin, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)