In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Constants

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
hists_path = f'{pictures_path}HistCharts/'

numpy_file_type = ".npy"
image_file_type = ".html"

import sys
sys.path.append(code_path)
np.set_printoptions(precision=5, suppress=True)

# Entire dataset (vars here)

In [4]:
vars = 6

In [5]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

In [6]:
df.columns

Index(['Country', 'Gender', 'Education', 'Profession', 'Work_status',
       'Household_members', 'Income_level', 'Location_of_resudence',
       'Centre_or_suburbs', 'Public_transport_service', 'Car_driving_license',
       'Considering_electric_or_hybrid_vehicle_next_purchase',
       'Know_what_car_sharing_is', 'Would_subsribe_car_sharing_if_available',
       'Most_frequent_trip_Walk', 'Most_frequent_trip_Bicycle',
       'Most_frequent_trip_Car_as_Driver',
       'Most_frequent_trip_Car_as_Passenger', 'Most_frequent_trip_Train',
       'Most_frequent_trip_Underground_or_light_train',
       'Most_frequent_trip_Tram', 'Most_frequent_trip_Bus',
       'Most_frequent_trip_Motorcycle_or_moped',
       'Destination_most_frequent_trip', 'Frequency_most_frequent_trip',
       'Frequent_trip_distance', 'Concern_environmental_impacts',
       'Preference_tolls_or_traffic_limitation',
       'grouped_Frequent_trip_duration_in_minutes', 'grouped_Region_3',
       'InternetUsers', 'grouped_Nu

In [7]:
numerical_cols = ["InternetUsers"]
categorical_cols = [
    "Concern_environmental_impacts",
    "Would_subsribe_car_sharing_if_available", 
    "Preference_tolls_or_traffic_limitation",
    "Country",
    "Location_of_resudence"
]

target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
df_DNA = df[numerical_cols+categorical_cols].copy()
df_DNAt = df_DNA.copy()
df_DNAt[target_col] = df[target_col]

In [8]:
for numerical_col in numerical_cols:
  df_DNAt[numerical_col] /= max(df_DNAt[numerical_col])
df_DNAt

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.842697,5,"Maybe yes, maybe not. I would need to test the...",No preferences,Belgium,Metropolitan area of a big city with more than...,Maybe yes maybe not
1,0.797753,8,"Maybe yes, maybe not. I would need to test the...",Probably more acceptable to limit road traffic,France,Small or medium town (less than 250.000 inhabi...,Probably not
2,0.707865,6,"No, I would not be interested in this service",No preferences,Czech Republic,Large city (from 250.000 to 1.000.000 inhabita...,Certainly not
3,0.955056,8,Don't know / No answer,Probably more acceptable to pay for less conge...,Sweden,Metropolitan area of a big city with more than...,Maybe yes maybe not
4,0.561798,8,"Maybe yes, maybe not. I would need to test the...",No preferences,Poland,Metropolitan area of a big city with more than...,Probably yes
...,...,...,...,...,...,...,...
26600,0.707865,3,"Yes, instead of purchasing a new car",Probably more acceptable to pay for less conge...,Cyprus,Rural area,Don't know/no answer
26601,0.707865,5,Don't know / No answer,No preferences,Cyprus,Small or medium town (less than 250.000 inhabi...,Don't know/no answer
26602,0.707865,7,"Maybe yes, maybe not. I would need to test the...",No preferences,Cyprus,Small or medium town (less than 250.000 inhabi...,Maybe yes maybe not
26603,0.707865,7,"Yes, instead of purchasing a new car",Probably more acceptable to limit road traffic,Cyprus,Small or medium town (less than 250.000 inhabi...,Probably yes


In [9]:
df_DNA.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country',
       'Location_of_resudence'],
      dtype='object')

In [10]:
df_DNAt.columns

Index(['InternetUsers', 'Concern_environmental_impacts',
       'Would_subsribe_car_sharing_if_available',
       'Preference_tolls_or_traffic_limitation', 'Country',
       'Location_of_resudence',
       'Considering_electric_or_hybrid_vehicle_next_purchase'],
      dtype='object')

In [11]:
DNAt_map_abbreviated = {
  "Would_subsribe_car_sharing_if_available":
  {
      "Don't know / No answer": "Don't know",
      'No, I would not be interested in this service': "No",
      'Maybe yes, maybe not. I would need to test the service before taking a decision': "Maybe, test",
      'Yes without any influence on my car ownership': "Yes, no car influence",
      'Yes, instead of purchasing a new car': "Yes, no new car",
      'Yes and I would give up one car I currently own': "Yes, give up car",
      "Yes I'm already client of a car sharing service": "Yes, already client"
  },
  "Preference_tolls_or_traffic_limitation":
  {
      'No preferences': "No pref.",
      'Probably more acceptable to limit road traffic': "Prob. limit traffic",
      'Probably more acceptable to pay for less congestion': "Prob. pay",
      'Definitely more acceptable to pay for less congestion': "Def. pay",
      'Definitely more acceptable to limit road traffic': "Def. limit traffic"
  },
  "Location_of_resudence":
  {
      'Rural area': "Rural area",
      'Small or medium town (less than 250.000 inhabitants)': "area < 250k",
      'Large city (from 250.000 to 1.000.000 inhabitants)': "250k < area < 1M", 
      'Metropolitan area of a big city with more than 1.000.000  inhabitants': "1M < area",
  },
  "Considering_electric_or_hybrid_vehicle_next_purchase":
  {                 
      "Don't know/no answer": "Don't know",
      'Maybe yes maybe not': "Maybe"
  },
}

In [12]:
DNAt_map_abbreviated_columns = {
    "InternetUsers": "IUsers",
    "Concern_environmental_impacts": "EnvImpact",
    "Would_subsribe_car_sharing_if_available": "CarShare",
    "Preference_tolls_or_traffic_limitation": "TollsTraffic",
    "Location_of_resudence": "Residence",
    "Considering_electric_or_hybrid_vehicle_next_purchase": "EVs"
}

In [13]:
fixed_order_col_answers = {
    'EnvImpact': [
        '1',
        '2',
        '3',
        '4',
        '5',
        "Don't know",
        '6',
        '7',
        '8',
        '9',
        '10' 
    ],
    'CarShare': [
        'No', 
        "Don't know", 
        'Maybe, test', 
        'Yes, no car influence', 
        'Yes, no new car', 
        'Yes, give up car', 
        'Yes, already client'
    ],
    'TollsTraffic': [
        'Def. pay', 
        'Prob. pay', 
        'No pref.', 
        'Prob. limit traffic', 
        'Def. limit traffic'
    ],
    'Residence': [
        'Rural area',
        'area < 250k',
        '250k < area < 1M',
        '1M < area'
    ],
    'EVs': [
        'Certainly not', 
        'Probably not', 
        'Maybe', 
        "Don't know", 
        'Probably yes', 
        'Certainly yes']
    }

In [14]:
df_DNAt = df_DNAt.replace(DNAt_map_abbreviated)
df_DNAt = df_DNAt.rename(columns=DNAt_map_abbreviated_columns)

In [15]:
df_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs
0,0.842697,5,"Maybe, test",No pref.,Belgium,1M < area,Maybe
1,0.797753,8,"Maybe, test",Prob. limit traffic,France,area < 250k,Probably not
2,0.707865,6,No,No pref.,Czech Republic,250k < area < 1M,Certainly not
3,0.955056,8,Don't know,Prob. pay,Sweden,1M < area,Maybe
4,0.561798,8,"Maybe, test",No pref.,Poland,1M < area,Probably yes
...,...,...,...,...,...,...,...
26600,0.707865,3,"Yes, no new car",Prob. pay,Cyprus,Rural area,Don't know
26601,0.707865,5,Don't know,No pref.,Cyprus,area < 250k,Don't know
26602,0.707865,7,"Maybe, test",No pref.,Cyprus,area < 250k,Maybe
26603,0.707865,7,"Yes, no new car",Prob. limit traffic,Cyprus,area < 250k,Probably yes


In [19]:
df_save_path = f'{dataset_path}5_DNA_{vars}values_abbr+target.csv'
df_DNAt.to_csv(df_save_path, index=False)

In [20]:
for i, col in enumerate(categorical_cols):
  if col in DNAt_map_abbreviated_columns.keys():
    categorical_cols[i] = DNAt_map_abbreviated_columns[col]
categorical_cols

['EnvImpact', 'CarShare', 'TollsTraffic', 'Country', 'Residence']

In [21]:
for i, col in enumerate(numerical_cols):
  if col in DNAt_map_abbreviated_columns.keys():
    numerical_cols[i] = DNAt_map_abbreviated_columns[col]
numerical_cols

['IUsers']

In [22]:
target_col = DNAt_map_abbreviated_columns[target_col]
target_col

'EVs'

In [23]:
# select the top categories for columns with a lot of categories
cols_top_only = [column for column in categorical_cols if len(df_DNAt[column].unique()) > 11]
cols_top_only

['Country']

### Create numerical version for ordering

In [25]:
def getBarsValues(n):
  # 1/2n is bar width
  return [(i+1/2)/n for i in range(n)]
getBarsValues(3)

[0.16666666666666666, 0.5, 0.8333333333333334]

In [26]:
df_DNAt_order = df_DNAt.copy()

In [27]:
replace_arr = {}
for fixed_order_col in fixed_order_col_answers.keys():
  print(fixed_order_col)
  replace_arr[fixed_order_col] = {}
  barsValues = getBarsValues(len(fixed_order_col_answers[fixed_order_col]))
  for i, value in enumerate(fixed_order_col_answers[fixed_order_col]):
    replace_arr[fixed_order_col][value] = barsValues[i]
numerical_replace_arr = replace_arr
numerical_replace_arr

EnvImpact
CarShare
TollsTraffic
Residence
EVs


{'CarShare': {"Don't know": 0.21428571428571427,
  'Maybe, test': 0.35714285714285715,
  'No': 0.07142857142857142,
  'Yes, already client': 0.9285714285714286,
  'Yes, give up car': 0.7857142857142857,
  'Yes, no car influence': 0.5,
  'Yes, no new car': 0.6428571428571429},
 'EVs': {'Certainly not': 0.08333333333333333,
  'Certainly yes': 0.9166666666666666,
  "Don't know": 0.5833333333333334,
  'Maybe': 0.4166666666666667,
  'Probably not': 0.25,
  'Probably yes': 0.75},
 'EnvImpact': {'1': 0.045454545454545456,
  '10': 0.9545454545454546,
  '2': 0.13636363636363635,
  '3': 0.22727272727272727,
  '4': 0.3181818181818182,
  '5': 0.4090909090909091,
  '6': 0.5909090909090909,
  '7': 0.6818181818181818,
  '8': 0.7727272727272727,
  '9': 0.8636363636363636,
  "Don't know": 0.5},
 'Residence': {'1M < area': 0.875,
  '250k < area < 1M': 0.625,
  'Rural area': 0.125,
  'area < 250k': 0.375},
 'TollsTraffic': {'Def. limit traffic': 0.9,
  'Def. pay': 0.1,
  'No pref.': 0.5,
  'Prob. limit t

In [28]:
df_DNAt_order = df_DNAt_order.replace(numerical_replace_arr)
df_DNAt_order

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs
0,0.842697,0.409091,0.357143,0.5,Belgium,0.875,0.416667
1,0.797753,0.772727,0.357143,0.7,France,0.375,0.250000
2,0.707865,0.590909,0.071429,0.5,Czech Republic,0.625,0.083333
3,0.955056,0.772727,0.214286,0.3,Sweden,0.875,0.416667
4,0.561798,0.772727,0.357143,0.5,Poland,0.875,0.750000
...,...,...,...,...,...,...,...
26600,0.707865,0.227273,0.642857,0.3,Cyprus,0.125,0.583333
26601,0.707865,0.409091,0.214286,0.5,Cyprus,0.375,0.583333
26602,0.707865,0.681818,0.357143,0.5,Cyprus,0.375,0.416667
26603,0.707865,0.681818,0.642857,0.7,Cyprus,0.375,0.750000


In [29]:
replace_arr = {}
for fixed_order_col in fixed_order_col_answers.keys():
  print(fixed_order_col)
  replace_arr[fixed_order_col] = {}
  barsValues = getBarsValues(len(fixed_order_col_answers[fixed_order_col]))
  for i, value in enumerate(fixed_order_col_answers[fixed_order_col]):
    replace_arr[fixed_order_col][barsValues[i]] = value
inv_numerical_replace_arr = replace_arr
inv_numerical_replace_arr

EnvImpact
CarShare
TollsTraffic
Residence
EVs


{'CarShare': {0.07142857142857142: 'No',
  0.21428571428571427: "Don't know",
  0.35714285714285715: 'Maybe, test',
  0.5: 'Yes, no car influence',
  0.6428571428571429: 'Yes, no new car',
  0.7857142857142857: 'Yes, give up car',
  0.9285714285714286: 'Yes, already client'},
 'EVs': {0.08333333333333333: 'Certainly not',
  0.25: 'Probably not',
  0.4166666666666667: 'Maybe',
  0.5833333333333334: "Don't know",
  0.75: 'Probably yes',
  0.9166666666666666: 'Certainly yes'},
 'EnvImpact': {0.045454545454545456: '1',
  0.13636363636363635: '2',
  0.22727272727272727: '3',
  0.3181818181818182: '4',
  0.4090909090909091: '5',
  0.5: "Don't know",
  0.5909090909090909: '6',
  0.6818181818181818: '7',
  0.7727272727272727: '8',
  0.8636363636363636: '9',
  0.9545454545454546: '10'},
 'Residence': {0.125: 'Rural area',
  0.375: 'area < 250k',
  0.625: '250k < area < 1M',
  0.875: '1M < area'},
 'TollsTraffic': {0.1: 'Def. pay',
  0.3: 'Prob. pay',
  0.5: 'No pref.',
  0.7: 'Prob. limit traff

#### Add weights

In [30]:
orderWeights = {
    'EnvImpact': np.ones(len(df_DNAt['EnvImpact'].unique())),
    'CarShare': np.array([10, 5, 5, 1, 2, 3, 4]),
    'TollsTraffic': np.array([2, 1, 1, 1, 2]),
    'Residence': np.array([3, 1, 1, 2]),
    'EVs': np.array([4, 2, 1, 1, 2, 4])
}
orderWeights

{'CarShare': array([10,  5,  5,  1,  2,  3,  4]),
 'EVs': array([4, 2, 1, 1, 2, 4]),
 'EnvImpact': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'Residence': array([3, 1, 1, 2]),
 'TollsTraffic': array([2, 1, 1, 1, 2])}

In [31]:
for fixed_order_col in fixed_order_col_answers.keys():
  barsValues = df_DNAt_order[fixed_order_col].unique()
  for i, value in enumerate(barsValues):
    df_DNAt_order.loc[df_DNAt_order[fixed_order_col] == value, f'{fixed_order_col}_w'] \
      = orderWeights[fixed_order_col][i]
df_DNAt_order

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs,EnvImpact_w,CarShare_w,TollsTraffic_w,Residence_w,EVs_w
0,0.842697,0.409091,0.357143,0.5,Belgium,0.875,0.416667,1.0,10.0,2.0,3.0,4.0
1,0.797753,0.772727,0.357143,0.7,France,0.375,0.250000,1.0,10.0,1.0,1.0,2.0
2,0.707865,0.590909,0.071429,0.5,Czech Republic,0.625,0.083333,1.0,5.0,2.0,1.0,1.0
3,0.955056,0.772727,0.214286,0.3,Sweden,0.875,0.416667,1.0,5.0,1.0,3.0,4.0
4,0.561798,0.772727,0.357143,0.5,Poland,0.875,0.750000,1.0,10.0,2.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
26600,0.707865,0.227273,0.642857,0.3,Cyprus,0.125,0.583333,1.0,2.0,1.0,2.0,2.0
26601,0.707865,0.409091,0.214286,0.5,Cyprus,0.375,0.583333,1.0,5.0,2.0,1.0,2.0
26602,0.707865,0.681818,0.357143,0.5,Cyprus,0.375,0.416667,1.0,10.0,2.0,1.0,4.0
26603,0.707865,0.681818,0.642857,0.7,Cyprus,0.375,0.750000,1.0,2.0,1.0,1.0,1.0


## Get cluster labels (link here)

Load cluster labels here from file

In [32]:
vars_linkage_cuts = {5:
                  {
                    "average":[[2,3,5,7,9],[2,3,5,7,9,12]],
                    "complete":[[2,4],[2,4,6],[2,4,6,8]],
                    "weighted":[[2,4,6,8],[2,4,6,8,11]],
                    "single":[[2,4,6,14,19,24,28,33,49,86]]
                  },
                6:
                  {
                    "average": [[2,3,5,7,9]],
                    "complete":[[2,3,5]],
                    "weighted":[[2,4,6,8,11,13]],
                    "single":[[2,4,6,15,19,25,29,31,41,59,75,85,124]]
                  }
}

In [33]:
metric = 'VDM'
target_class = 'multi'
link = 'average'
postprocessing = True

In [34]:
cut_values = vars_linkage_cuts[vars][link][0]
is_postprocessing = '_fix' if postprocessing else ''
cluster_labels_path = f"{HC_base_path}{metric}{vars}/{target_class}/ClusterLabels/ClusterLabels_{metric}_{link}_{str(cut_values)}{is_postprocessing}.npy"
cluster_labels = np.load(cluster_labels_path)

## Add clusters

In [35]:
dfc_DNAt = df_DNAt.assign(cluster=cluster_labels)
dfc_DNAt_order = df_DNAt_order.assign(cluster=cluster_labels)

In [36]:
dfc_DNAt

Unnamed: 0,IUsers,EnvImpact,CarShare,TollsTraffic,Country,Residence,EVs,cluster
0,0.842697,5,"Maybe, test",No pref.,Belgium,1M < area,Maybe,28
1,0.797753,8,"Maybe, test",Prob. limit traffic,France,area < 250k,Probably not,28
2,0.707865,6,No,No pref.,Czech Republic,250k < area < 1M,Certainly not,28
3,0.955056,8,Don't know,Prob. pay,Sweden,1M < area,Maybe,28
4,0.561798,8,"Maybe, test",No pref.,Poland,1M < area,Probably yes,28
...,...,...,...,...,...,...,...,...
26600,0.707865,3,"Yes, no new car",Prob. pay,Cyprus,Rural area,Don't know,28
26601,0.707865,5,Don't know,No pref.,Cyprus,area < 250k,Don't know,28
26602,0.707865,7,"Maybe, test",No pref.,Cyprus,area < 250k,Maybe,28
26603,0.707865,7,"Yes, no new car",Prob. limit traffic,Cyprus,area < 250k,Probably yes,28


# Plot hist

In [177]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px # for colors

def plotScatter(fig, df, x, numerical_col, i, j, fontsize, color):
  x, y = np.unique(x[numerical_col], return_counts=True)
  y = y.astype(float, copy=False)
  y /= np.sum(y).astype(float) # this normalizes as a percentage
  mode = 'lines'
  if len(y) == 1:
    mode = 'markers'
  fig.add_trace(go.Scatter(x=x, y=y,
                  mode=mode,
                  name=numerical_col,
                  showlegend=False,
                  marker_color= color), 
                row=i, col=j,
                )
  fig.update_yaxes(range=[0, 1], row=i, col=j)
  fig.update_xaxes(range=[0, 1], row=i, col=j)

  # The vertical line for average score of all the values
  avg = np.sum(x*y)
  # print(f'avg: {avg}')
  dashes = 100
  axis_line = go.Scatter(x=[avg]*dashes,
                         y=np.linspace(0, 1, dashes,),
                         showlegend=False,
                         name='avg',
                         mode='lines',
                         line=dict(color=color, dash='dash',
                                   width =1) )
  fig.add_trace(axis_line, row=i, col=j)
  return fig

def plotHist(fig, df, x, categorical_col, to_sort_cols, 
             fixed_order_col_answers, i, j, topN, fontsize, color, trend=None):
  # print(i,j)
  sorted_answers = df[categorical_col].unique()
  if (categorical_col in to_sort_cols):
    top_answers = x[categorical_col].value_counts().index[:topN]
    x = x.loc[x[categorical_col].isin(top_answers)]
    sorted_answers = x[categorical_col].value_counts().index
  
  if (categorical_col in fixed_order_col_answers.keys()):
    sorted_answers = fixed_order_col_answers[categorical_col]
    
  #print(df_mean.index[sorted_index[i]])
  fig.add_trace(go.Histogram(x=x[categorical_col],
                  name=categorical_col,
                  histnorm='probability',
                  showlegend=False,
                  marker_color=color),
                row=i, col=j,
                )
  
  if categorical_col in to_sort_cols:
    fig.update_traces(nbinsx=11, row=i, col=j)
  fig.update_yaxes(range=[0, 1], row=i, col=j)
  fig.update_xaxes(type='category',
                    categoryorder="array", 
                    categoryarray=sorted_answers,
                    range=(-0.5,-0.5+len(sorted_answers)),
                    tickangle=60,
                    # 5 is reference number of columns to set fontsize for
                    tickfont=dict(size= \
                      fontsize * (1 + (5 - len(sorted_answers)) / (5 + len(sorted_answers)) / 1.0)),
                    row=i, col=j)
  # if trend:
  #   dashes = 100
  #   trend = 0.5
  #   # trend = -0.5*(1-trend)+(-0.5+len(sorted_answers))*trend
  #   trend_line = go.Scatter(x=[trend]*dashes,
  #                         y=np.linspace(0, 1, dashes,),
  #                         showlegend=False,
  #                         name='trend',
  #                         mode='lines',
  #                         line=dict(color=color, dash='dash',
  #                                   width =1),
  #                         xaxis='x2')
  #   fig.add_trace(trend_line, row=i, col=j)
  #   fig.update_layout(xaxis2={'anchor': 'y', 'overlaying': 'x', 'side': 'top'},
  #                     row=i, col=j)
  #   fig.data[-1].update(xaxis='x2')

  return fig

def plotHists(df, group_col, target_col, numerical_cols, categorical_cols, 
              to_sort_cols, fixed_order_col_answers, cut_values=None, save_path=None, 
              row_height=300, topN=6, showFig=False, 
              nameCharLimit=27, fontsize=12):
  

  # check if plot by clusters
  is_clusters = True if group_col == "cluster" else False

  # Group by group_col attribute
  sorted_index = df.groupby(group_col,as_index=False).count() \
      .sort_values(by=df.columns[0],ascending=False).index
  df_grouped = df.groupby(group_col)
  df_mean = df_grouped.mean()
  
  cluster_counts = df[group_col].value_counts()
  # print(f"cluster sizes: {cluster_counts}")


  k = len(df[group_col].unique())
  # has_outliers = np.isin(-1, df['cluster'].unique())
  # if has_outliers:
  #   k -= 1
  # the +1 for cols is because of target_col
  cols = len(numerical_cols+categorical_cols) + 1
  # the +1 for rows is because of title row and whole dataset row
  rows = k + 1
  # define the titles of each subplot
  # titles = [f"k={k} clusters, merged cuts: {cut_values}"]
  titles = []
  for i in range(rows):
    for numerical_col in numerical_cols:
      titles.append(numerical_col[:nameCharLimit])
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:nameCharLimit])
    titles.append(target_col[:nameCharLimit])
  # define the type of each column for each row
  specs = [[{'type': 'xy'}]*cols]*(rows)
  colors = [px.colors.qualitative.Plotly[i%len(px.colors.qualitative.Plotly)] 
            for i in range(cols)]

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.2/cols,
                      vertical_spacing=0.3/rows,
                      subplot_titles=titles,
                      shared_yaxes=True
                      )
  
  for i in range(1, rows):
    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
  
    # x is the dataset only containing this cluster/answer
    if group_col not in fixed_order_col_answers.keys():
      col_value = df_mean.index[sorted_index[i-1]]
    else:
      col_value = fixed_order_col_answers[group_col][i-1]
    x = df.loc[df[group_col] == col_value]
    for j, numerical_col in enumerate(numerical_cols):
      fig = plotScatter(fig, df, x, numerical_col, i, j+1, fontsize, colors[j])
    j += 1
    for categorical_col in categorical_cols:
      fig = plotHist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, 
                     i, j+1, topN, fontsize, colors[j])
      j += 1
    # now plot target_col
    fig = plotHist(fig, df, x, target_col, to_sort_cols, fixed_order_col_answers, 
                     i, j+1, topN, fontsize, colors[j])
    fig.update_yaxes(
        title_text=f"{col_value} ({cluster_counts[col_value]})", 
        row=i, col=1)
  # Plot mean as last row
  i += 1

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, numerical_col in enumerate(numerical_cols):
    fig = plotScatter(fig, df, x, numerical_col, i, j+1, fontsize, colors[j])
  j += 1
  for categorical_col in categorical_cols:
    fig = plotHist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, 
                   i, j+1, -1, fontsize, colors[j])
    j += 1
  # now plot target_col
  fig = plotHist(fig, df, x, target_col, to_sort_cols, fixed_order_col_answers, 
                    i, j+1, topN, fontsize, colors[j])
  fig.update_yaxes(title_text="Whole Dataset", row=i, col=1)

  # Update layout
  t = f"k={k} clusters"
  if cut_values:
    t += f", merged cuts: {cut_values}"
  if not is_clusters:
    t += f", grouped by: {group_col}"
  fig.update_layout(
      title=t,
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()


In [178]:
# df_mean = dfc_DNAt_order.groupby('cluster').mean()
# df_mean

In [179]:
def getOrderedCats(df_order, group_col='cluster', weighted=True, return_trend=False):
  order = {}
  if return_trend:
    trend = {}
  if group_col == 'cluster':
    df = df_order[df_order[group_col] != -1]
  else:
    df = df_order.drop('cluster', axis=1)
  df_mean = df.groupby(group_col).mean()
  cols = [col for col in df_mean.columns if '_w' not in col]
  if weighted:
    f = {}
    for col in cols:
      if col+'_w' not in df_mean.columns:
        f[col] = 'mean'
      else:
        f[col] = lambda x: np.average(x, weights=df.loc[x.index, col+'_w'].values)
    df_mean = df.groupby(group_col).agg(f)
  # return df_mean
  for col in cols:
    order[col] = np.array(df_mean.sort_values(by=col).index)
    if return_trend:
      trend[col] = np.array(df_mean.sort_values(by=col)[col].values)
    if group_col == 'cluster':
      order[col] = np.append(order[col],-1)
      if return_trend:
        # don't consider the trend of the cluster -1
        trend[col] = np.append(trend[col], 0)
    elif group_col in inv_numerical_replace_arr.keys():
      order_col = list(df_mean.sort_values(by=col).index)
      for i, c in enumerate(order_col):
        order_col[i] = inv_numerical_replace_arr[group_col][c]
      order[col] = np.array(order_col)
  if return_trend:
    return order, trend
  return order
orderCats, trendCats = getOrderedCats(dfc_DNAt_order, group_col='Country', weighted=True,
                           return_trend=True)
trendCats

{'CarShare': array([0.2242 , 0.23757, 0.2562 , 0.25932, 0.26083, 0.26431, 0.29022,
        0.29128, 0.29288, 0.30222, 0.31393, 0.31763, 0.31865, 0.31964,
        0.32317, 0.32746, 0.33114, 0.33535, 0.34351, 0.3567 , 0.36839,
        0.37208, 0.37401, 0.37671, 0.38197, 0.38503, 0.39411, 0.40012]),
 'EVs': array([0.4191 , 0.42033, 0.42082, 0.42383, 0.43077, 0.43739, 0.4412 ,
        0.45846, 0.45907, 0.47247, 0.47831, 0.49557, 0.4978 , 0.49844,
        0.50703, 0.51104, 0.51218, 0.51521, 0.52456, 0.52787, 0.53437,
        0.53456, 0.53516, 0.5541 , 0.56294, 0.5761 , 0.60227, 0.61244]),
 'EnvImpact': array([0.50494, 0.51061, 0.54177, 0.5677 , 0.57583, 0.57963, 0.58478,
        0.58758, 0.59838, 0.60441, 0.60697, 0.61104, 0.62127, 0.62139,
        0.62715, 0.63004, 0.64159, 0.64351, 0.657  , 0.6598 , 0.67151,
        0.6733 , 0.69063, 0.70174, 0.71129, 0.72703, 0.72984, 0.74707]),
 'IUsers': array([0.48016, 0.52187, 0.59213, 0.61994, 0.62788, 0.62921, 0.67416,
        0.67416, 0.68361, 0.6

Real Mean

{'CarShare': Int64Index([5, 11, 28, 27, 1, 20], dtype='int64', name='cluster'),
 'EVs': Int64Index([5, 28, 11, 27, 1, 20], dtype='int64', name='cluster'),
 'EnvImpact': Int64Index([5, 11, 28, 27, 1, 20], dtype='int64', name='cluster'),
 'IUsers': Int64Index([27, 20, 1, 11, 5, 28], dtype='int64', name='cluster'),
 'Residence': Int64Index([28, 11, 5, 27, 1, 20], dtype='int64', name='cluster'),
 'TollsTraffic': Int64Index([1, 11, 28, 5, 27, 20], dtype='int64', name='cluster')}

 Weighted Mean

 {'CarShare': Int64Index([5, 11, 28, 27, 20, 1], dtype='int64', name='cluster'),
 'EVs': Int64Index([5, 28, 11, 27, 1, 20], dtype='int64', name='cluster'),
 'EnvImpact': Int64Index([5, 11, 28, 27, 1, 20], dtype='int64', name='cluster'),
 'IUsers': Int64Index([27, 20, 1, 11, 5, 28], dtype='int64', name='cluster'),
 'Residence': Int64Index([5, 28, 11, 27, 1, 20], dtype='int64', name='cluster'),
 'TollsTraffic': Int64Index([1, 11, 28, 5, 27, 20], dtype='int64', name='cluster')}

In [180]:
def plotHistsT(df, group_row, target_row, numerical_rows, categorical_rows, 
              to_sort_rows, fixed_order_row_answers, cut_values=None, 
              save_path=None, row_height=300, topN=6, showFig=False, 
              nameCharLimit=27, fontsize=12, df_order=None):
  

  # check if plot by clusters
  is_clusters = True if group_row == "cluster" else False

  # Group by group_row attribute
  sorted_index = df.groupby(group_row,as_index=False).count() \
      .sort_values(by=df.columns[0],ascending=False).index
  df_grouped = df.groupby(group_row)
  df_mean = df_grouped.mean()
  
  cluster_counts = df[group_row].value_counts()
  # print(f"cluster sizes: {cluster_counts}")


  k = len(df[group_row].unique())
  # has_outliers = np.isin(-1, df['cluster'].unique())
  # if has_outliers:
  #   k -= 1
  # the +1 for rows is because of target_row
  rows = len(numerical_rows+categorical_rows) + 1
  # the +1 for cols is because of title col and whole dataset col
  cols = k + 1
  # [orderWeights] is a category order, and for numerical variables can be
  # ascending or descending
  if df_order is not None:
    # figure out order of plotting
    orderCats = \
      getOrderedCats(df_order, group_col=group_row)
    trend = None
    print(orderCats)
  else:
    orderCats = {}
  # define the titles of each subplot
  # titles = [f"k={k} clusters, merged cuts: {cut_values}"]
  titles = []
  for row in numerical_rows+categorical_rows+[target_row]:
    for i in range(cols-1):
      if row in orderCats.keys():
        col_value = orderCats[row][i]
      elif group_row not in fixed_order_row_answers.keys():
        col_value = df_mean.index[sorted_index[i]]
      else:
        col_value = fixed_order_row_answers[group_row][i]
      br_odd = ''
      if k>11 and i%2 == 1:
        # visibility issues fix here:
        br_odd = '<br>'
      titles.append(f"{col_value}{br_odd}<br>({cluster_counts[col_value]})")
    titles.append('Whole dataset')
  # define the type of each column for each row
  specs = [[{'type': 'xy'}]*cols]*(rows)
  colors = [px.colors.qualitative.Plotly[i%len(px.colors.qualitative.Plotly)] 
            for i in range(rows)]

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.2/cols,
                      vertical_spacing=0.3/rows,
                      subplot_titles=titles,
                      shared_yaxes=True
                      )
  
  for i in range(1, cols):
    # plot the histplot of categorical variables,
    # if the row must be a sorted row, then plot only the topN values
  
    # x is the dataset only containing this cluster/answer
    if group_row not in fixed_order_row_answers.keys():
      col_value = df_mean.index[sorted_index[i-1]]
    else:
      col_value = fixed_order_row_answers[group_row][i-1]
    x = df.loc[df[group_row] == col_value]
    for j, numerical_row in enumerate(numerical_rows):
      col = i if numerical_row not in orderCats.keys() \
        else int(np.argwhere(orderCats[numerical_row]==col_value)[0][0]) + 1
      # trend = None if numerical_row not in trendCats.keys() \
      #   else trendCats[numerical_row][col-1]
      fig = plotScatter(fig, df, x, numerical_row, j+1, col, fontsize, 
                        colors[j]) #, trend=trend)
    j += 1
    for categorical_row in categorical_rows:
      col = i if categorical_row not in orderCats.keys() \
        else int(np.argwhere(orderCats[categorical_row]==col_value)[0][0]) + 1
      # trend = None if categorical_row not in trendCats.keys() \
      #   else trendCats[categorical_row][col-1]
      fig = plotHist(fig, df, x, categorical_row, to_sort_rows, fixed_order_row_answers, 
                     j+1, col, topN, fontsize, colors[j], trend=trend)
      j += 1
    # now plot target_row
    col = i if target_row not in orderCats.keys() \
      else int(np.argwhere(orderCats[target_row]==col_value)[0][0]) + 1
    # trend = None if target_row not in trendCats.keys() \
    #   else trendCats[target_row][col-1]
    fig = plotHist(fig, df, x, target_row, to_sort_rows, fixed_order_row_answers, 
                     j+1, col, topN, fontsize, colors[j])

  # Plot mean as last col
  i += 1

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, numerical_col in enumerate(numerical_rows):
    fig = plotScatter(fig, df, x, numerical_col, j+1, i, fontsize, colors[j])
  j += 1
  for categorical_col in categorical_rows:
    fig = plotHist(fig, df, x, categorical_col, to_sort_rows, fixed_order_row_answers, 
                   j+1, i, -1, fontsize, colors[j])
    j += 1
  # now plot target_row
  fig = plotHist(fig, df, x, target_row, to_sort_rows, fixed_order_row_answers, 
                    j+1, i, topN, fontsize, colors[j])
  
  i = 1
  for numerical_row in numerical_rows:
    fig.update_yaxes(
        title_text=numerical_row, 
        row=i, col=1)
    i += 1
  for categorical_row in categorical_rows:
    fig.update_yaxes(
        title_text=categorical_row, 
        row=i, col=1)
    i += 1
  fig.update_yaxes(
        title_text=target_row, 
        row=i, col=1)

    

  # Update layout
  t = f"k={k} clusters"
  if cut_values:
    t += f", merged cuts: {cut_values}"
  if not is_clusters:
    t += f", grouped by: {group_row}"
  fig.update_layout(
      title=t,
      height=row_height*rows,
      bargap=0.05,
      font=dict(size=fontsize)
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [181]:
def plotAll(vars, save_or_show='show', group_col='cluster', transposed=False):
  is_transposed = '^T' if transposed else ''
  k_cut_values = cut_values
  if group_col == 'cluster':
    filename = f'hist_{metric}{vars}_{link}_{k_cut_values}{is_postprocessing}{is_transposed}{image_file_type}'
    save_path = f'{hists_path}{metric}{vars}/{filename}'
  else:
    filename = f'hist{vars}_by_{group_col}{is_transposed}{image_file_type}'
    save_path = f'{hists_path}{filename}'
    k_cut_values = None
  row_height = 500
  plot_numerical_cols = numerical_cols
  plot_categorical_cols = categorical_cols[:vars-len(numerical_cols)]
  if save_or_show == 'show':
    save_path=None
    showFig=True
  elif save_or_show == 'save':
    showFig=False
  else:
    print('save_or_show must be either "save" or "show"')
  if not transposed:
    plotHists(dfc_DNAt, group_col, target_col, plot_numerical_cols, plot_categorical_cols, 
              cols_top_only, fixed_order_col_answers, cut_values=k_cut_values, save_path=save_path,
              row_height=row_height, topN=10, showFig=showFig)
  else:
    plotHistsT(dfc_DNAt, group_col, target_col, plot_numerical_cols, plot_categorical_cols, 
              cols_top_only, fixed_order_col_answers, cut_values=k_cut_values, save_path=save_path,
              row_height=row_height, topN=10, showFig=showFig, df_order=dfc_DNAt_order)

In [182]:
print(metric, link, vars, is_postprocessing)

VDM average 6 _fix


In [183]:
plotAll(vars, group_col='cluster', transposed=True, save_or_show='show')

{'IUsers': array([27, 20,  1, 11,  5, 28, -1]), 'EnvImpact': array([ 5, 11, 28, 27,  1, 20, -1]), 'CarShare': array([ 5, 11, 28, 27, 20,  1, -1]), 'TollsTraffic': array([ 1, 11, 28,  5, 27, 20, -1]), 'Residence': array([ 5, 28, 11, 27,  1, 20, -1]), 'EVs': array([ 5, 28, 11, 27,  1, 20, -1])}


NameError: ignored

In [None]:
filename = f'hist_{metric}{vars}_{link}_{cut_values}{is_postprocessing}{image_file_type}'
save_path = f'{hists_path}{filename}'
row_height = 400
save_path
#plotClustersRadar(df_radar, column, columns, save_path, plot_cols, row_height)

'/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Pictures/HistCharts/hist_VDM6_weighted_[2, 4, 6, 8, 11, 13]_fix.html'

### Run here

In [214]:
metric="VDM"
file_type = ".npy"
wss_file_type = ".npy"
vars_values = [5,6]
linkages = ["single","complete", "average", "weighted"]
postprocessing_values = [False, True]
transposed_values = [True]
for vars in vars_values:
  print(f'\n\n\nvars: {vars}\n')
  for link in linkages:
    print(f">> link = {link}")
    for cut_values in vars_linkage_cuts[vars][link]:
      print(f">>>> cut_values: {cut_values} ...")
      for postprocessing in postprocessing_values:
        print(f">>>>>> postprocessing = {postprocessing} ")
        is_postprocessing = "_fix" if postprocessing else ""
        cluster_labels_path = f"{HC_base_path}{metric}{vars}/{target_class}/ClusterLabels/" \
          f"ClusterLabels_{metric}_{link}_{str(cut_values)}{is_postprocessing}.npy"
        cluster_labels = np.load(cluster_labels_path)
        dfc_DNAt["cluster"] = cluster_labels
        dfc_DNAt_order["cluster"] = cluster_labels
        for transposed in transposed_values:
          print(f">>>>>>>> transposed = {transposed}")
          plotAll(vars, save_or_show='save', transposed=transposed)




vars: 5

>> link = single
>>>> cut_values: [2, 4, 6, 14, 19, 24, 28, 33, 49, 86] ...
>>>>>> postprocessing = False 
>>>>>>>> transposed = True
{'IUsers': array([60,  2, 13, 83,  4,  9, 14, 59, 42, 21, -1]), 'EnvImpact': array([ 9,  4, 42, 21, 13, 83, 59, 14,  2, 60, -1]), 'CarShare': array([59,  9,  4, 21, 14, 42, 83, 13, 60,  2, -1]), 'TollsTraffic': array([ 2, 59, 83,  4, 42, 14, 21,  9, 13, 60, -1]), 'Residence': array([59, 42, 13,  9,  4, 21, 83,  2, 60, 14, -1]), 'EVs': array([59, 42, 14,  9, 13,  4, 83, 21, 60,  2, -1])}
>>>>>> postprocessing = True 
>>>>>>>> transposed = True
{'IUsers': array([60, 83,  2, 13,  4,  9, 59, 42, -1]), 'EnvImpact': array([ 9,  4, 42, 83, 13, 59,  2, 60, -1]), 'CarShare': array([ 9, 42,  4, 59, 13, 83, 60,  2, -1]), 'TollsTraffic': array([ 2, 59,  4, 42, 83,  9, 13, 60, -1]), 'Residence': array([13,  9,  4, 42, 59, 83, 60,  2, -1]), 'EVs': array([42,  9, 13,  4, 59, 83,  2, 60, -1])}
>> link = complete
>>>> cut_values: [2, 4] ...
>>>>>> postprocess

In [245]:
for vars in vars_values:
  for transposed in transposed_values:
    for group_col in categorical_cols[:vars-len(numerical_cols)]:
      print(group_col)
      plotAll(vars, group_col=group_col, save_or_show='save', transposed=transposed)
    plotAll(vars, group_col=target_col, save_or_show='save', transposed=transposed)

EnvImpact
{'IUsers': array(['10', '9', '8', '5', '3', '7', '4', '6', "Don't know", '2', '1'],
      dtype='<U10'), 'CarShare': array(['1', "Don't know", '3', '2', '6', '4', '5', '7', '8', '9', '10'],
      dtype='<U10'), 'TollsTraffic': array(["Don't know", '2', '1', '5', '3', '6', '7', '4', '8', '9', '10'],
      dtype='<U10'), 'Residence': array(['5', '6', '1', "Don't know", '3', '7', '4', '8', '2', '10', '9'],
      dtype='<U10'), 'EVs': array(['1', '6', '4', '5', '3', "Don't know", '7', '2', '8', '9', '10'],
      dtype='<U10')}
CarShare
{'IUsers': array(['Yes, no new car', 'Yes, give up car', 'Yes, no car influence',
       'Maybe, test', 'Yes, already client', "Don't know", 'No'],
      dtype='<U21'), 'EnvImpact': array(['No', "Don't know", 'Maybe, test', 'Yes, no car influence',
       'Yes, give up car', 'Yes, no new car', 'Yes, already client'],
      dtype='<U21'), 'TollsTraffic': array(['Yes, already client', "Don't know", 'No', 'Yes, no car influence',
       'Yes, no new c

# Plot radar and hist per country

In [None]:
plotAll(group_col='EVs')

Output hidden; open in https://colab.research.google.com to view.

# Plot radar and hist for pairs of clusters

In [None]:
column = "cluster"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
c1 = 48
c2 = 49
filename = f'entire_dataset_clusters({c1},{c2})_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
df_compare = df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c1])].append(df_DNA_clusters.loc[df_DNA_clusters["cluster"].isin([c2])])
plot_radar_hist(df_compare, column, numerical_cols, categorical_cols, theta, save_path, row_height)

# Plot DNA wrt Target (Multi-class)

In [None]:
def plot_radar_hist_target(df, target_col, numerical_cols, categorical_cols, 
                    to_sort_cols, fixed_order_col_answers, theta, save_path=None, 
                    row_height=300, topN=6, showFig=False, nameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 14


  # Group by group_col attribute
  df_grouped = df.groupby(target_col)
  df_mean = df_grouped.mean()
  
  class_counts = {key: value for key, value in zip(df[target_col].value_counts().index, df[target_col].value_counts())}
  #print(f"class sizes: {class_counts}")

  #titles = [f"{i}" for i in df_mean.index]
  cols = len(categorical_cols)+1
  rows = len(df_mean.index)+1


  # define the titles of each subplot
  titles = []
  target_answer_order = fixed_order_col_answers[target_col]
  for answer in target_answer_order:
    titles.append(f"{answer} ({class_counts[answer]})")
    for categorical_col in categorical_cols:
      titles.append(categorical_col[:nameCharLimit])
  # last row titles
  titles.append("Whole Dataset")
  for categorical_col in categorical_cols:
      titles.append(categorical_col[:nameCharLimit])

  # define the type of each column for each row
  #specs = [[None]+ [{'type': 'scatter', 'colspan': (cols-2)}] + [None]*(cols-2)]
  specs = [[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1)]*(rows)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=specs,
                      horizontal_spacing=0.3/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )

  polar_args = {}
  for i in range(rows-1):
    # plot radar plot of the mean of each numerical variable for a 
    # given cluster/class
    r = df_mean[df_mean.index.str.startswith(target_answer_order[i])].values[0]
    fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

    # plot the histplot of categorical variables,
    # if the column must be a sorted column, then plot only the topN values
    x = df.loc[df[target_col] == target_answer_order[i]]
    for j, categorical_col in enumerate(categorical_cols):
      fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, topN, fontsize, marker_colors)
        
  # Plot mean as last row
  i += 1
  # plot radar plot of the mean of each numerical variable of entire dataset
  r = df.mean()
  fig, polar_args = plot_radar(fig, polar_args, r, theta, i,)

  # plot the histplot of categorical variables,
  # if the column must be a sorted column, then plot only the topN values
  x = df
  for j, categorical_col in enumerate(categorical_cols):
    fig = plot_hist(fig, df, x, categorical_col, to_sort_cols, fixed_order_col_answers, i, j, -1, fontsize, marker_colors)

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      #showlegend=True,
      legend=dict(
          x=(cols-1)/cols,
          y=1,
          traceorder="normal",
          font=dict(
              family="sans-serif",
              size=fontsize+2,
              color="black"
              ),
      ),
      bargap=0.05,
      font=dict(size=fontsize),
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['Certainly not', 'Probably not', 'Maybe yes maybe not', "Don't know/no answer", 'Probably yes', 'Certainly yes']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_multiclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)

# Plot DNA wrt Target (Binary-class)

In [None]:
df_2 = df[df[target_col] != 'Maybe yes maybe not']
df_2 = df_2[df_2[target_col] != "Don't know/no answer"]

target_map = {
    "Probably yes": "YES",
    "Certainly yes": "YES",
    "Probably not": "NO",
    "Certainly not": "NO",
}
df_2_fin = df_2.copy()
df_2_fin[target_col] = df_2[target_col].replace(target_map)

In [None]:
column = target_col
fixed_order_col_answers = {
  'Would_subsribe_car_sharing_if_available': ['No', "Don't know", 'Maybe, test', 'Yes, no car influence', 'Yes, no new car', 'Yes, give up car', 'Yes, already client'],
  'Preference_tolls_or_traffic_limitation': ['Def. pay', 'Prob. pay', 'No pref.', 'Prob. limit traffic', 'Def. limit traffic'],
  'Considering_electric_or_hybrid_vehicle_next_purchase': ['NO', 'YES']
  }
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = f'radar_hist_binaryclass_target.html'
save_path = f'{radars_path}{filename}'
row_height = 500
plot_categorical_cols = []
plot_categorical_cols += categorical_cols
plot_categorical_cols += additioncal_categorical_cols
sorted_cols = additioncal_categorical_cols
plot_radar_hist_target(df_2_fin, column, numerical_cols, plot_categorical_cols, sorted_cols, fixed_order_col_answers, theta, save_path,
                row_height, topN=10)