In [1]:
# Mount Google Drive under /content/drive; every project path below lives there.
# force_remount=True asks Colab to mount again even when a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Constants

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

# Tag of the clustering algorithm; also the name of its results sub-directory.
algo = "HC"

# Project directory layout on the mounted Drive (every path ends with "/").
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = base_path + "Pictures/"
dataset_path = base_path + "Survey+dataset/"
code_path = base_path + "Code/"
results_path = code_path + "Data/"
HC_base_path = results_path + algo + "/"
hists_path = pictures_path + "HistCharts/"

# File extensions for stored arrays and exported figures.
numpy_file_type = ".npy"
image_file_type = ".html"

# Make the project's own modules under Code/ importable.
import sys
sys.path.append(code_path)

# Print arrays with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

# Entire dataset (set `vars`, the number of clustering variables, here)

In [3]:
# Number of clustering variables (5 or 6). It selects the extra residence column
# and the entry of vars_linkage_cuts that is used.
# NOTE: this name shadows Python's builtin vars() for the rest of the notebook.
vars = 5

In [4]:
# Filtered survey answers, stored in the project's dataset folder.
ds_path = dataset_path + "3_filtered_values.csv"

df = pd.read_csv(ds_path)

In [5]:
df.columns

Index(['Country', 'Gender', 'Education', 'Profession', 'Work_status',
       'Household_members', 'Income_level', 'Location_of_resudence',
       'Centre_or_suburbs', 'Public_transport_service', 'Car_driving_license',
       'Considering_electric_or_hybrid_vehicle_next_purchase',
       'Know_what_car_sharing_is', 'Would_subsribe_car_sharing_if_available',
       'Most_frequent_trip_Walk', 'Most_frequent_trip_Bicycle',
       'Most_frequent_trip_Car_as_Driver',
       'Most_frequent_trip_Car_as_Passenger', 'Most_frequent_trip_Train',
       'Most_frequent_trip_Underground_or_light_train',
       'Most_frequent_trip_Tram', 'Most_frequent_trip_Bus',
       'Most_frequent_trip_Motorcycle_or_moped',
       'Destination_most_frequent_trip', 'Frequency_most_frequent_trip',
       'Frequent_trip_distance', 'Concern_environmental_impacts',
       'Preference_tolls_or_traffic_limitation',
       'grouped_Frequent_trip_duration_in_minutes', 'grouped_Region_3',
       'InternetUsers', 'grouped_Nu

In [6]:
# Feature columns, split by type.
numerical_cols = ["InternetUsers"]
categorical_cols = [
    "Concern_environmental_impacts",
    "Would_subsribe_car_sharing_if_available",
    "Preference_tolls_or_traffic_limitation",
    "Country",
]
# The 6-variable setup additionally uses the place of residence.
if vars == 6:
  categorical_cols += ["Location_of_resudence"]

target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"

# df_DNA holds the features only; df_DNAt is an independent copy that also
# carries the target column.
df_DNA = df.loc[:, numerical_cols + categorical_cols].copy()
df_DNAt = df_DNA.assign(**{target_col: df[target_col]})

In [7]:
# Divide every numerical column of df_DNAt by its maximum value.
for numerical_col in numerical_cols:
  # Series.max() skips missing values; the builtin max() compares element by
  # element, so its result depends on where a NaN happens to sit in the column.
  df_DNAt[numerical_col] /= df_DNAt[numerical_col].max()
df_DNAt

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.842697,5,"Maybe yes, maybe not. I would need to test the...",No preferences,Belgium,Maybe yes maybe not
1,0.797753,8,"Maybe yes, maybe not. I would need to test the...",Probably more acceptable to limit road traffic,France,Probably not
2,0.707865,6,"No, I would not be interested in this service",No preferences,Czech Republic,Certainly not
3,0.955056,8,Don't know / No answer,Probably more acceptable to pay for less conge...,Sweden,Maybe yes maybe not
4,0.561798,8,"Maybe yes, maybe not. I would need to test the...",No preferences,Poland,Probably yes
...,...,...,...,...,...,...
26600,0.707865,3,"Yes, instead of purchasing a new car",Probably more acceptable to pay for less conge...,Cyprus,Don't know/no answer
26601,0.707865,5,Don't know / No answer,No preferences,Cyprus,Don't know/no answer
26602,0.707865,7,"Maybe yes, maybe not. I would need to test the...",No preferences,Cyprus,Maybe yes maybe not
26603,0.707865,7,"Yes, instead of purchasing a new car",Probably more acceptable to limit road traffic,Cyprus,Probably yes


In [8]:
df_DNA.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country'],
      dtype='object')

In [9]:
df_DNAt.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country',
       'Considering_electric_or_hybrid_vehicle_next_purchase'],
      dtype='object')

In [10]:
# Per-column mapping from the full survey answer text to a short label.
# Answers that are not listed keep their original text.
DNAt_map_abbreviated = {
    "Would_subsribe_car_sharing_if_available": {
        "Don't know / No answer": "Don't know",
        "No, I would not be interested in this service": "No",
        "Maybe yes, maybe not. I would need to test the service before taking a decision": "Maybe, test",
        "Yes without any influence on my car ownership": "Yes, no car influence",
        "Yes, instead of purchasing a new car": "Yes, no new car",
        "Yes and I would give up one car I currently own": "Yes, give up car",
        "Yes I'm already client of a car sharing service": "Yes, already client",
    },
    "Preference_tolls_or_traffic_limitation": {
        "No preferences": "No pref.",
        "Probably more acceptable to limit road traffic": "Prob. limit traffic",
        "Probably more acceptable to pay for less congestion": "Prob. pay",
        "Definitely more acceptable to pay for less congestion": "Def. pay",
        "Definitely more acceptable to limit road traffic": "Def. limit traffic",
    },
    "Location_of_resudence": {
        "Rural area": "Rural area",
        "Small or medium town (less than 250.000 inhabitants)": "area < 250k",
        "Large city (from 250.000 to 1.000.000 inhabitants)": "250k < area < 1M",
        "Metropolitan area of a big city with more than 1.000.000  inhabitants": "1M < area",
    },
    "Considering_electric_or_hybrid_vehicle_next_purchase": {
        "Don't know/no answer": "Don't know",
        "Maybe yes maybe not": "Maybe",
    },
}

In [11]:
# Short display names for the columns; "Country" has no entry and keeps its name.
DNAt_map_abbreviated_columns = dict(
    InternetUsers="IUsers",
    Concern_environmental_impacts="EnvImpact",
    Would_subsribe_car_sharing_if_available="CarShare",
    Preference_tolls_or_traffic_limitation="TollsTraffic",
    Location_of_resudence="Residence",
    Considering_electric_or_hybrid_vehicle_next_purchase="EVs",
)

In [12]:
# Display order of the answers on each histogram's x axis, keyed by the
# abbreviated column name.
fixed_order_col_answers = {
    # Scores 1-10 as strings, with "Don't know" placed between 5 and 6.
    "EnvImpact": (
        [str(score) for score in range(1, 6)]
        + ["Don't know"]
        + [str(score) for score in range(6, 11)]
    ),
    "CarShare": [
        "No",
        "Don't know",
        "Maybe, test",
        "Yes, no car influence",
        "Yes, no new car",
        "Yes, give up car",
        "Yes, already client",
    ],
    "TollsTraffic": [
        "Def. pay",
        "Prob. pay",
        "No pref.",
        "Prob. limit traffic",
        "Def. limit traffic",
    ],
    "Residence": [
        "Rural area",
        "area < 250k",
        "250k < area < 1M",
        "1M < area",
    ],
    "EVs": [
        "Certainly not",
        "Probably not",
        "Maybe",
        "Don't know",
        "Probably yes",
        "Certainly yes",
    ],
}

In [13]:
# Apply the abbreviated answer labels first, then the abbreviated column names.
df_DNAt = (
    df_DNAt
    .replace(DNAt_map_abbreviated)
    .rename(columns=DNAt_map_abbreviated_columns)
)

In [14]:
df_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,EVs
0,0.842697,5,"Maybe, test",No pref.,Belgium,Maybe
1,0.797753,8,"Maybe, test",Prob. limit traffic,France,Probably not
2,0.707865,6,No,No pref.,Czech Republic,Certainly not
3,0.955056,8,Don't know,Prob. pay,Sweden,Maybe
4,0.561798,8,"Maybe, test",No pref.,Poland,Probably yes
...,...,...,...,...,...,...
26600,0.707865,3,"Yes, no new car",Prob. pay,Cyprus,Don't know
26601,0.707865,5,Don't know,No pref.,Cyprus,Don't know
26602,0.707865,7,"Maybe, test",No pref.,Cyprus,Maybe
26603,0.707865,7,"Yes, no new car",Prob. limit traffic,Cyprus,Probably yes


In [15]:
# Swap each categorical column name for its abbreviation where one is defined;
# the slice assignment updates the existing list in place.
categorical_cols[:] = [
    DNAt_map_abbreviated_columns.get(name, name) for name in categorical_cols
]
categorical_cols

['EnvImpact', 'CarShare', 'TollsTraffic', 'Country']

In [16]:
# Same renaming for the numerical column names, again updating the list in place.
numerical_cols[:] = [
    DNAt_map_abbreviated_columns.get(name, name) for name in numerical_cols
]
numerical_cols

['IUsers']

In [17]:
# Categorical columns with more than 11 distinct answers; the histograms show
# only the most frequent answers of these columns.
cols_top_only = [
    column
    for column in categorical_cols
    if df_DNAt[column].unique().size > 11
]
cols_top_only

['Country']

## Get cluster labels (set `metric`, `link` and `postprocessing` here)

Load the cluster labels for the selected run from its saved `.npy` file.

In [18]:
# For each number of variables (5 or 6) and each linkage method: the lists of
# cut values that identify the stored cluster-label files.
vars_linkage_cuts = {
    5: {
        "average": [[2, 3, 5, 7, 9], [2, 3, 5, 7, 9, 12]],
        "complete": [[2, 4], [2, 4, 6], [2, 4, 6, 8]],
        "weighted": [[2, 4, 6, 8], [2, 4, 6, 8, 11]],
        "single": [[2, 4, 6, 14, 19, 24, 28, 33, 49, 86]],
    },
    6: {
        "average": [[2, 3, 5, 7, 9]],
        "complete": [[2, 3, 5]],
        "weighted": [[2, 4, 6, 8, 11, 13]],
        "single": [[2, 4, 6, 15, 19, 25, 29, 31, 41, 59, 75, 85, 124]],
    },
}

In [19]:
# Selection of the clustering run whose labels are loaded in the next cell.
# metric: tag used in the directory and file names of the stored labels.
# link: linkage method; must be a key of vars_linkage_cuts[vars].
# postprocessing: True selects the label file with the "_fix" suffix.
metric = 'VDM'
link = 'single'
postprocessing = True

In [20]:
# First cut list configured for the selected (vars, link) pair.
cut_values = vars_linkage_cuts[vars][link][0]

# File-name suffix that marks post-processed labels.
if postprocessing:
  is_postprocessing = '_fix'
else:
  is_postprocessing = ''

# <HC results>/<metric><vars>/ClusterLabels/ClusterLabels_<metric>_<link>_<cuts><suffix>.npy
cluster_labels_path = (
    f"{HC_base_path}{metric}{vars}/ClusterLabels/"
    f"ClusterLabels_{metric}_{link}_{cut_values}{is_postprocessing}.npy"
)
cluster_labels = np.load(cluster_labels_path)

## Add clusters

In [21]:
# New frame: df_DNAt plus a "cluster" column holding the loaded labels
# (df_DNAt itself is left unchanged).
dfc_DNAt = df_DNAt.assign(cluster=cluster_labels)

In [22]:
dfc_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,EVs,cluster
0,0.842697,5,"Maybe, test",No pref.,Belgium,Maybe,42
1,0.797753,8,"Maybe, test",Prob. limit traffic,France,Probably not,59
2,0.707865,6,No,No pref.,Czech Republic,Certainly not,59
3,0.955056,8,Don't know,Prob. pay,Sweden,Maybe,59
4,0.561798,8,"Maybe, test",No pref.,Poland,Probably yes,-1
...,...,...,...,...,...,...,...
26600,0.707865,3,"Yes, no new car",Prob. pay,Cyprus,Don't know,59
26601,0.707865,5,Don't know,No pref.,Cyprus,Don't know,59
26602,0.707865,7,"Maybe, test",No pref.,Cyprus,Maybe,59
26603,0.707865,7,"Yes, no new car",Prob. limit traffic,Cyprus,Probably yes,59


# Plot hist

In [71]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px # for colors

def plotScatter(fig, df, x, numerical_col, i, j, fontsize):
  """Add a line trace with the relative frequency of each value of a numerical column.

  Parameters:
    fig: plotly figure with a subplot grid; the trace goes to cell (row=i, col=j).
    df: not used by this function.
    x: DataFrame holding the rows to summarise.
    numerical_col: name of the column of `x` to plot.
    i, j: subplot row and column (1-based, as passed to plotly).
    fontsize: not used by this function.

  Returns:
    The same figure, with both axes of that subplot fixed to the range [0, 1].
  """
  values, counts = np.unique(x[numerical_col], return_counts=True)
  # Turn the absolute counts into fractions of the rows in `x`.
  shares = counts.astype(float, copy=False)
  shares /= np.sum(shares).astype(float)

  # One colour per subplot column, cycling through the qualitative palette.
  palette = px.colors.qualitative.Plotly
  trace = go.Scatter(
      x=values,
      y=shares,
      mode='lines',
      name=numerical_col,
      showlegend=False,
      marker_color=palette[(j - 1) % len(palette)],
  )
  fig.add_trace(trace, row=i, col=j)

  fig.update_yaxes(range=[0, 1], row=i, col=j)
  fig.update_xaxes(range=[0, 1], row=i, col=j)
  return fig

def plotHist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize):
  """Add a probability-normalised histogram of a categorical column to one subplot.

  Parameters:
    fig: plotly figure with a subplot grid; the trace goes to cell (row=i, col=j).
    df: full DataFrame; its distinct values of `categorical_col` are the default
      x-axis categories.
    x: DataFrame holding the rows to plot (e.g. one cluster, or the whole data).
    categorical_col: name of the column to plot.
    to_sort_cols: columns for which only the most frequent answers are shown,
      ordered by frequency within `x`.
    fixed_order_col_answers: dict column name -> list of answers giving a fixed
      x-axis order; takes precedence over the other orderings.
    i, j: subplot row and column (1-based, as passed to plotly).
    topN: number of most frequent answers kept for columns in `to_sort_cols`.
      A negative value or None keeps all answers.
    fontsize: base tick font size; scaled up for fewer than 5 categories and
      down for more.

  Returns:
    The same figure.
  """
  sorted_answers = df[categorical_col].unique()
  if categorical_col in to_sort_cols:
    answer_counts = x[categorical_col].value_counts()
    # Callers pass topN=-1 for the whole-dataset row. Slicing with index[:-1]
    # would silently drop the least frequent answer, so a negative (or None)
    # topN is treated as "no limit" instead.
    if topN is None or topN < 0:
      top_answers = answer_counts.index
    else:
      top_answers = answer_counts.index[:topN]
    x = x.loc[x[categorical_col].isin(top_answers)]
    sorted_answers = x[categorical_col].value_counts().index

  if categorical_col in fixed_order_col_answers:
    sorted_answers = fixed_order_col_answers[categorical_col]

  # One colour per subplot column, cycling through the qualitative palette.
  palette = px.colors.qualitative.Plotly
  fig.add_trace(go.Histogram(x=x[categorical_col],
                  name=categorical_col,
                  histnorm='probability',
                  showlegend=False,
                  marker_color=palette[(j - 1) % len(palette)]),
                row=i, col=j,
                )
  if categorical_col in to_sort_cols:
    fig.update_traces(nbinsx=11, row=i, col=j)
  fig.update_yaxes(range=[0, 1], row=i, col=j)

  n_answers = len(sorted_answers)
  fig.update_xaxes(type='category',
                    categoryorder="array",
                    categoryarray=sorted_answers,
                    range=(-0.5, -0.5 + n_answers),
                    tickangle=60,
                    # 5 is the reference number of categories for `fontsize`
                    tickfont=dict(size= \
                      fontsize * (1 + (5 - n_answers) / (5 + n_answers) / 1.0)),
                    row=i, col=j)
  return fig

def plotHists(df, group_col, numerical_cols, categorical_cols, 
              to_sort_cols, fixed_order_col_answers, cut_values, save_path=None, 
              row_height=300, topN=6, showFig=False, 
              columnNameCharLimit=27, fontsize=12):
  """Draw one row of subplots per group plus a final "Whole Dataset" row.

  Each row has one line plot per numerical column (plotScatter) followed by one
  histogram per categorical column (plotHist). Groups are ordered from the
  largest to the smallest; the y-axis title of the first subplot of a row shows
  the group key and its size.

  Parameters:
    df: DataFrame containing `group_col` and all plotted columns.
    group_col: column whose distinct values define the groups (e.g. "cluster").
    numerical_cols, categorical_cols: names of the columns to plot, in order.
    to_sort_cols, fixed_order_col_answers: forwarded to plotHist.
    cut_values: only shown in the figure title.
    save_path: if truthy, the figure is written there as HTML.
    row_height: height in pixels of one subplot row.
    topN: forwarded to plotHist for the per-group rows; the whole-dataset row
      forwards -1.
    showFig: if True, the figure is displayed.
    columnNameCharLimit: subplot titles are truncated to this many characters.
    fontsize: base font size of the figure.

  Returns:
    None.

  Raises:
    ValueError: if there is no column to plot.
  """
  plot_cols = list(numerical_cols) + list(categorical_cols)
  cols = len(plot_cols)
  if cols == 0:
    raise ValueError("plotHists needs at least one numerical or categorical column")

  # Group keys with their sizes, largest group first. Taking both from the same
  # Series keeps every key paired with its own size, and uses `group_col`
  # rather than a hard-coded "cluster" column.
  group_sizes = df[group_col].value_counts()
  k = len(group_sizes)
  rows = k + 1

  # Every row repeats the (truncated) column names as subplot titles.
  titles = [name[:columnNameCharLimit] for name in plot_cols] * rows
  # A separate spec dict per cell, so no cell shares state with another.
  specs = [[{'type': 'xy'} for _ in range(cols)] for _ in range(rows)]

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.2/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      shared_yaxes=True
                      )

  def _plot_row(row_df, row, row_topN, y_title):
    # Fill one subplot row: numerical columns first, then categorical ones.
    # The column counter is explicit so that an empty `numerical_cols` works.
    # plotScatter/plotHist modify `fig` in place and return that same object.
    col = 1
    for numerical_col in numerical_cols:
      plotScatter(fig, df, row_df, numerical_col, row, col, fontsize)
      col += 1
    for categorical_col in categorical_cols:
      plotHist(fig, df, row_df, categorical_col, to_sort_cols,
               fixed_order_col_answers, row, col, row_topN, fontsize)
      col += 1
    fig.update_yaxes(title_text=y_title, row=row, col=1)

  # One row per group, containing only the rows of `df` in that group.
  for row, (group_key, group_size) in enumerate(group_sizes.items(), start=1):
    _plot_row(df.loc[df[group_col] == group_key], row, topN,
              f"{group_key} ({group_size})")

  # Last row: the whole dataset.
  _plot_row(df, rows, -1, "Whole Dataset")

  # Update layout
  fig.update_layout(
      title=f"k={k} clusters, merged cuts: {cut_values}",
      height=row_height*rows,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [72]:
def plotAll(save_or_show='show'):
  """Plot the per-cluster histograms of dfc_DNAt for the currently selected run.

  Reads the notebook globals metric, vars, link, cut_values, is_postprocessing,
  image_file_type, hists_path, dfc_DNAt, numerical_cols, categorical_cols,
  cols_top_only and fixed_order_col_answers.

  Parameters:
    save_or_show: 'show' displays the figure; 'save' writes it as
      '<hists_path><metric><vars>/hist_<metric><vars>_<link>_<cut_values><suffix><ext>'
      without displaying it.

  Raises:
    ValueError: if save_or_show is neither 'save' nor 'show'. (Previously a
      message was printed and the call then failed on an unassigned variable.)
  """
  # Validate before touching any global state or building paths.
  if save_or_show not in ('show', 'save'):
    raise ValueError('save_or_show must be either "save" or "show"')

  showFig = save_or_show == 'show'
  save_path = None
  if not showFig:
    filename = f'hist_{metric}{vars}_{link}_{cut_values}{is_postprocessing}{image_file_type}'
    save_path = f'{hists_path}{metric}{vars}/{filename}'

  plotHists(dfc_DNAt, "cluster", numerical_cols, categorical_cols,
            cols_top_only, fixed_order_col_answers, cut_values, save_path=save_path,
            row_height=500, topN=10, showFig=showFig)

In [73]:
# plotAll()

In [74]:
# Preview of the output file name for the currently selected run.
filename = f'hist_{metric}{vars}_{link}_{cut_values}{is_postprocessing}{image_file_type}'
# NOTE(review): plotAll saves under f'{hists_path}{metric}{vars}/', while this
# preview omits the '{metric}{vars}/' sub-directory — confirm which is intended.
save_path = f'{hists_path}{filename}'
row_height = 400
save_path
#plotClustersRadar(df_radar, column, columns, save_path, plot_cols, row_height)

'/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Pictures/HistCharts/hist_VDM5_single_[2, 4, 6, 14, 19, 24, 28, 33, 49, 86]_fix.html'

### Run here

In [77]:
# Batch run: for every (vars, linkage, cut list, postprocessing) combination,
# load the stored cluster labels and save the histogram figure.
# The loop variables vars, link, cut_values and is_postprocessing are notebook
# globals on purpose: plotAll reads them to build the output file name.
metric="VDM"
# file_type and wss_file_type are not used anywhere in this cell.
file_type = ".npy"
wss_file_type = ".npy"
vars_values = [5,6]
linkages = ["single","complete", "average", "weighted"]
postprocessing_values = [False, True]
# NOTE(review): categorical_cols and dfc_DNAt's columns are not rebuilt when
# vars changes here, so the `vars == 6` branch that adds the residence column
# in the column-selection cell does not re-run — confirm the 6-variable plots
# show the intended columns.
for vars in vars_values:
  print(f'\n\n\nvars: {vars}\n')
  for link in linkages:
    print(f">> link = {link}")
    for cut_values in vars_linkage_cuts[vars][link]:
      print(f">>>> cut_values: {cut_values} ...")
      for postprocessing in postprocessing_values:
        print(f">>>>>> postprocessing = {postprocessing} ")
        is_postprocessing = "_fix" if postprocessing else ""
        cluster_labels_path = f"{HC_base_path}{metric}{vars}/ClusterLabels/" \
          f"ClusterLabels_{metric}_{link}_{str(cut_values)}{is_postprocessing}.npy"
        cluster_labels = np.load(cluster_labels_path)
        # Overwrites the "cluster" column of the shared frame in place.
        dfc_DNAt["cluster"] = cluster_labels
        plotAll(save_or_show='save')




vars: 5

>> link = single
>>>> cut_values: [2, 4, 6, 14, 19, 24, 28, 33, 49, 86] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>> link = complete
>>>> cut_values: [2, 4] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>>>> cut_values: [2, 4, 6] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>>>> cut_values: [2, 4, 6, 8] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>> link = average
>>>> cut_values: [2, 3, 5, 7, 9] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>>>> cut_values: [2, 3, 5, 7, 9, 12] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>> link = weighted
>>>> cut_values: [2, 4, 6, 8] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 
>>>> cut_values: [2, 4, 6, 8, 11] ...
>>>>>> postprocessing = False 
>>>>>> postprocessing = True 



vars: 6

>> link = single
>>>> cut_values: [2, 4, 6, 15, 19, 25, 29, 31, 41, 59, 75, 85, 124] ...
>>>>>> postprocessing = 

# Plot radar and hist per country

In [None]:
# Radar + histogram figure with one group per country.
# NOTE(review): radars_path and plot_radar_hist are not defined in the visible
# part of this notebook, and the recorded output of this cell is a KeyError —
# confirm they exist and that `df` has the columns this call expects.
column = "Country"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'entire_dataset_country_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
plot_radar_hist(df, column, numerical_cols, categorical_cols+[target_col], theta, save_path, row_height)

KeyError: ignored

# Plot radar and hist for pairs of clusters

In [None]:
# Radar + histogram comparison of two clusters.
# NOTE(review): df_DNA_clusters, radars_path and plot_radar_hist are not defined
# in the visible part of this notebook — confirm they exist before running.
column = "cluster"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
c1 = 48
c2 = 49
filename = f'entire_dataset_clusters({c1},{c2})_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
# Rows of cluster c1 followed by the rows of cluster c2, original index kept.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_compare = pd.concat([
    df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c1])],
    df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c2])],
])
plot_radar_hist(df_compare, column, numerical_cols, categorical_cols, theta, save_path, row_height)

# Plot DNA wrt Target (Multi-class)

In [None]:
def plot_radar_hist_target(df, target_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, columnNameCharLimit=27):
  """Draw one row per target class plus a final "Whole Dataset" row.

  Each row has a polar subplot (filled by plot_radar with the mean of the
  numeric columns) followed by one subplot per categorical column (filled by
  plot_hist). plot_radar and plot_hist are defined outside this function;
  they receive 0-based row/column indices.

  Parameters:
    df: DataFrame containing `target_col` and the plotted columns.
    target_col: column holding the class of each row.
    numerical_cols: not used by this function.
    categorical_cols: names of the columns plotted as histograms.
    to_sort_cols: forwarded to plot_hist.
    fixed_order_col_answers: dict column name -> ordered answers; the entry for
      `target_col` gives the row order and must list the classes present in `df`.
    theta: forwarded to plot_radar.
    save_path: if truthy, the figure is written there as HTML.
    row_height: height in pixels of one subplot row.
    topN: forwarded to plot_hist for the per-class rows; the whole-dataset row
      forwards -1.
    showFig: if True, the figure is displayed.
    columnNameCharLimit: subplot titles are truncated to this many characters.

  Returns:
    None.
  """
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14

  # Per-class mean of the numeric columns. numeric_only=True is required on
  # pandas >= 2, where averaging string columns raises instead of skipping them.
  df_mean = df.groupby(target_col).mean(numeric_only=True)

  # class label -> number of rows
  class_counts = df[target_col].value_counts().to_dict()

  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+1

  # Subplot titles, row by row: "<class> (<size>)" over the polar plot, then the
  # truncated column names.
  column_titles = [categorical_col[:columnNameCharLimit] for categorical_col in categorical_cols]
  titles = []
  target_answer_order = fixed_order_col_answers[target_col]
  for answer in target_answer_order:
    titles.append(f"{answer} ({class_counts[answer]})")
    titles.extend(column_titles)
  # last row titles
  titles.append("Whole Dataset")
  titles.extend(column_titles)

  # First column is polar, the remaining ones are cartesian.
  specs = [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )

  polar_args = {}
  for i in range(rows-1):
    # Radar plot of the numeric means of this class; the class row is located
    # by matching the start of the index label.
    r = df_mean[df_mean.index.str.startswith(target_answer_order[i])].values[0]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # Histograms of the categorical columns, restricted to this class.
    x = df.loc[df[target_col] == target_answer_order[i]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)

  # Whole dataset as last row. Computed from `rows` so that it is defined even
  # when the loop above did not run.
  last_row = rows - 1
  r = df.mean(numeric_only=True)
  fig, polar_args = plot_radar(fig, polar_args, r, theta, last_row,)

  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, last_row, j, -1, fontsize, marker_colors)

  # Update layout
  fig.update_layout(
      height=row_height*rows,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
# Multi-class target plot: one row per answer of the target column.
column = target_col

# NOTE: this rebinds the notebook-level fixed_order_col_answers with the full
# column names as keys; the earlier definition used the abbreviated names.
fixed_order_col_answers = {
    'Would_subsribe_car_sharing_if_available': [
        'No',
        "Don't know",
        'Maybe, test',
        'Yes, no car influence',
        'Yes, no new car',
        'Yes, give up car',
        'Yes, already client',
    ],
    'Preference_tolls_or_traffic_limitation': [
        'Def. pay',
        'Prob. pay',
        'No pref.',
        'Prob. limit traffic',
        'Def. limit traffic',
    ],
    'Considering_electric_or_hybrid_vehicle_next_purchase': [
        'Certainly not',
        'Probably not',
        'Maybe yes maybe not',
        "Don't know/no answer",
        'Probably yes',
        'Certainly yes',
    ],
}

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'radar_hist_multiclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500

# Fresh list: the base categorical columns followed by the additional ones.
plot_categorical_cols = [*categorical_cols, *additioncal_categorical_cols]
sorted_cols = additioncal_categorical_cols

plot_radar_hist_target(
    df, column, numerical_cols, plot_categorical_cols, sorted_cols,
    fixed_order_col_answers, theta, save_path, row_height, topN=10,
)

# Plot DNA wrt Target (Binary-class)

In [None]:
# Drop the undecided answers, keeping only rows with a clear yes/no tendency.
undecided_answers = ['Maybe yes maybe not', "Don't know/no answer"]
df_2 = df[~df[target_col].isin(undecided_answers)]
del undecided_answers

# Collapse the four remaining answers into two classes.
target_map = {
    "Probably yes": "YES",
    "Certainly yes": "YES",
    "Probably not": "NO",
    "Certainly not": "NO",
}
df_2_fin = df_2.assign(**{target_col: df_2[target_col].replace(target_map)})

In [None]:
# Binary target plot: one row for 'NO' and one for 'YES'.
column = target_col

# NOTE: this rebinds the notebook-level fixed_order_col_answers with the full
# column names as keys; the earlier definition used the abbreviated names.
fixed_order_col_answers = {
    'Would_subsribe_car_sharing_if_available': [
        'No',
        "Don't know",
        'Maybe, test',
        'Yes, no car influence',
        'Yes, no new car',
        'Yes, give up car',
        'Yes, already client',
    ],
    'Preference_tolls_or_traffic_limitation': [
        'Def. pay',
        'Prob. pay',
        'No pref.',
        'Prob. limit traffic',
        'Def. limit traffic',
    ],
    'Considering_electric_or_hybrid_vehicle_next_purchase': ['NO', 'YES'],
}

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'radar_hist_binaryclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500

# Fresh list: the base categorical columns followed by the additional ones.
plot_categorical_cols = [*categorical_cols, *additioncal_categorical_cols]
sorted_cols = additioncal_categorical_cols

plot_radar_hist_target(
    df_2_fin, column, numerical_cols, plot_categorical_cols, sorted_cols,
    fixed_order_col_answers, theta, save_path, row_height, topN=10,
)