In [None]:
# Mount Google Drive at /content/drive; every data path below lives under it.
from google.colab import drive
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
%matplotlib inline
import sys
# Add the project's Code directory to the module search path.
sys.path.append("/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code")
# Print NumPy arrays with 5 decimals and no scientific notation.
np.set_printoptions(precision=5, suppress=True)

In [None]:
# Load the "DNA values" CSV into df_DNA.
# NOTE: ds_path is reassigned further down when the filtered survey file is loaded.
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/4_DNA_values_v1.csv'

df_DNA = pd.read_csv(ds_path)

In [None]:
# Show the first three rows as a plain NumPy array.
df_DNA[:3].to_numpy()

array([[0.75862, 0.5    , 0.59   , 0.33333, 0.     , 0.     ],
       [0.68966, 0.8    , 0.11   , 0.33333, 0.25   , 1.     ],
       [0.55172, 0.6    , 0.59   , 0.16667, 0.     , 1.     ]])

In [None]:
# Load the post-processed dataset; it carries a "cluster" column next to the features.
ds_postprocessing_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/5_postprocessing_clusters.csv'

df_postprocessing_clusters = pd.read_csv(ds_postprocessing_path)

In [None]:
# Preview the post-processed dataset (pandas truncates the display of long frames).
df_postprocessing_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,cluster
0,0.758621,0.5,0.59,0.333333,0.00,0.0,4
1,0.689655,0.8,0.11,0.333333,0.25,1.0,1
2,0.551724,0.6,0.59,0.166667,0.00,1.0,9
3,0.327586,0.8,0.35,0.333333,0.00,1.0,2
4,0.758621,0.8,0.00,0.166667,0.25,0.0,11
...,...,...,...,...,...,...,...
15223,0.551724,0.6,0.43,0.166667,0.00,1.0,9
15224,0.551724,0.4,0.51,0.333333,0.25,0.0,3
15225,0.551724,0.3,0.43,0.166667,0.00,1.0,9
15226,0.551724,0.6,0.26,0.333333,1.00,0.0,8


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plotClustersRadar(df, column, columns, save_path=None, plot_cols=5, row_height=300):
  """Draw one radar (polar) chart per row of ``df`` on a grid of subplots.

  Args:
    df: DataFrame whose rows are plotted. The values of a row are the radial
      values of its chart and the row's index label is the subplot title.
    column: Not used by this function; kept so existing calls keep working.
    columns: Angular axis labels, passed as ``theta`` to every trace.
    save_path: If truthy, the figure is also written to this path as HTML.
    plot_cols: Number of subplot columns in the grid.
    row_height: The figure height is ``row_height`` times the number of grid rows.
  """
  n_charts = len(df.index)
  n_cols = plot_cols
  n_rows = int(np.ceil(n_charts / n_cols))

  fig = make_subplots(
      rows=n_rows,
      cols=n_cols,
      specs=[[{'type': 'polar'}] * n_cols] * n_rows,
      horizontal_spacing=0.5 / n_cols,
      vertical_spacing=0.3 / n_rows,
      subplot_titles=[f"{label}" for label in df.index],
  )

  # Charts fill the grid row by row; trailing cells of the last row stay empty.
  polar_layout = {}
  for position, label in enumerate(df.index):
    grid_row, grid_col = divmod(position, n_cols)
    radar = go.Scatterpolar(
        r=df.iloc[position],
        theta=columns,
        fill='toself',
        name=f'Class {label}',
    )
    fig.add_trace(radar, row=grid_row + 1, col=grid_col + 1)
    # Same radial scale on every chart so the shapes are comparable.
    polar_layout[f"polar{position + 1}"] = dict(
        radialaxis=dict(visible=True, range=[0.0, 1.0])
    )

  fig.update_layout(height=row_height * n_rows, showlegend=False, **polar_layout)

  if save_path:
    fig.write_html(save_path)
  fig.show()

# Postprocessing dataset

In [None]:
# One radar per post-processing cluster, showing the cluster mean of each numeric feature.
column = "cluster"
numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
# Short axis labels, listed in the same order as numerical_cols.
columns = ["IUsers", "EnvImpact", "Age", "Gender"]
df_radar = df_postprocessing_clusters[numerical_cols + [column]].groupby(column).mean()

# radars_path is reused by the later plotting cells.
radars_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Pictures/DNA_values_v1/RadarCharts/'
filename = 'postprocessing_clusters.html'
save_path = radars_path + filename
plotClustersRadar(df_radar, column, columns, save_path=save_path)

# Entire dataset

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

In [None]:
# Location of the precomputed hierarchical-clustering linkage matrix.
link = 'ward'
# Named n_vars rather than vars so the builtin vars() is not shadowed.
n_vars = 6
HC_path = f'/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/Data/DNA_values_v1/HC/HEOM{n_vars}/HC_HEOM_{link}.npy'

In [None]:
# Linkage matrices keyed by the distance metric they were computed with.
metric = 'HEOM'
Z = {metric: np.load(HC_path)}

In [None]:
# Turn the stored linkage into flat cluster labels (criterion 'maxclust', threshold k)
# and attach them to a copy of the feature table.
metric = "HEOM"
k = 53
cluster = fcluster(Z[metric], t=k, criterion='maxclust')
df_DNA_clusters = df_DNA.assign(cluster=cluster)

In [None]:
# One radar per hierarchical cluster of the entire dataset (cluster means of the numeric features).
column = "cluster"
numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
# Short axis labels, listed in the same order as numerical_cols.
columns = ["IUsers", "EnvImpact", "Age", "Gender"]

df_DNA_numerical = df_DNA_clusters[numerical_cols + [column]]
df_DNA_grouped = df_DNA_numerical.groupby(column)
df_radar = df_DNA_grouped.mean()

# radars_path comes from the post-processing radar cell above.
filename = 'entire_dataset_clusters.html'
save_path = radars_path + filename
plotClustersRadar(df_radar, column, columns, save_path=save_path)

# Radar chart of the entire dataset with respect to the target

In [None]:
# Load the filtered survey answers (raw labels, not yet encoded) into df.
# NOTE: this reuses the name ds_path from the df_DNA loading cell.
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

In [None]:
# Per-column mapping from raw survey answers to numeric codes, in the nested
# {column: {answer: code}} form accepted by DataFrame.replace.
cleanup_nums = {"Concern_environmental_impacts":
                  {
                      '1': 1,
                      '2': 2,
                      '3': 3,
                      '4': 4,
                      '5': 5,
                      '6': 6,
                      '7': 7,
                      '8': 8,
                      '9': 9,
                      '10': 10,
                      "Don't know": 0,
                  },
                # Each age bin 'lo:hi' is encoded as its midpoint (lo + hi) / 2.
                "grouped_Age":
                  {
                      '16:23': 19.5,
                      '23:27': 25,
                      '27:30': 28.5,
                      '30:35': 32.5,
                      '35:39': 37,
                      '39:43': 41,
                      '43:47': 45,
                      '47:51': 49,
                      '51:54': 52.5,
                      '54:85': 69.5
                  },
                "Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  },
                "Gender":
                  {
                      'Female': 0,
                      'Male': 1
                  },
                # Disabled: the target column keeps its original text labels.
                #"Considering_electric_or_hybrid_vehicle_next_purchase":
                #  {
                #      "Don't know/no answer":0,
                #      'Certainly not':1,
                #      'Probably not':2,
                #      'Maybe yes maybe not':3,
                #      'Probably yes':4,
                #      'Certainly yes':5
                #  }
                }

In [None]:
# Answer -> code mapping for the two categorical columns only.
# These entries repeat the corresponding entries of cleanup_nums; keep both in sync.
categorical_map = {"Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  }}


In [None]:
# Column roles used by the rest of the notebook.
numerical_cols = ["InternetUsers", "Concern_environmental_impacts", "grouped_Age", "Gender"]
categorical_cols = ["Would_subsribe_car_sharing_if_available", "Preference_tolls_or_traffic_limitation"]
target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
# Keep the Country answers aside: the next line drops that column from df.
country_answers = df["Country"]
# NOTE: df is overwritten here, so re-running this cell without reloading df
# fails on the "Country" lookup above.
df = df[numerical_cols+categorical_cols+[target_col]]
df_numerical = df[numerical_cols]

In [None]:
# Invert categorical_map: for each categorical column, map code -> original answer text.
# Distinct names (col / label / code) avoid reusing the loop variable inside the
# inner comprehension with a swapped meaning.
reverse_categorical_map = {
    col: {code: label for label, code in categorical_map[col].items()}
    for col in categorical_cols
}
reverse_categorical_map

{'Preference_tolls_or_traffic_limitation': {0: 'No preferences',
  1: 'Probably more acceptable to limit road traffic',
  2: 'Probably more acceptable to pay for less congestion',
  3: 'Definitely more acceptable to pay for less congestion',
  4: 'Definitely more acceptable to limit road traffic'},
 'Would_subsribe_car_sharing_if_available': {0: "Don't know / No answer",
  1: 'No, I would not be interested in this service',
  2: 'Maybe yes, maybe not. I would need to test the service before taking a decision',
  3: 'Yes without any influence on my car ownership',
  4: 'Yes, instead of purchasing a new car',
  5: 'Yes and I would give up one car I currently own',
  6: "Yes I'm already client of a car sharing service"}}

In [None]:
# Full answer text -> short label, per categorical column (nested form for DataFrame.replace).
# The short labels are what the histogram axes end up showing.
categorical_map_abbreviated = {"Would_subsribe_car_sharing_if_available":
                {
                    "Don't know / No answer": "Don't know",
                    'No, I would not be interested in this service': "No",
                    'Maybe yes, maybe not. I would need to test the service before taking a decision': "Maybe, test",
                    'Yes without any influence on my car ownership': "Yes, no car influence",
                    'Yes, instead of purchasing a new car': "Yes, no new car",
                    'Yes and I would give up one car I currently own': "Yes, give up car",
                    "Yes I'm already client of a car sharing service": "Yes, already client"
                },
              "Preference_tolls_or_traffic_limitation":
                {
                    'No preferences': "No pref.",
                    'Probably more acceptable to limit road traffic': "Prob. limit traffic",
                    'Probably more acceptable to pay for less congestion': "Prob. pay",
                    'Definitely more acceptable to pay for less congestion': "Def. pay",
                    'Definitely more acceptable to limit road traffic': "Def. limit traffic"
                }
}

In [None]:
# Encode the raw answers as numbers, column by column, using cleanup_nums.
df = df.replace(cleanup_nums)

In [None]:
# Fit a min-max scaler on the encoded categorical columns. The fitted scaler is
# reused further down to map scaled values back to their codes.
categorical_min_max_scaler = preprocessing.MinMaxScaler()
x = df[categorical_cols].values
x_scaled = categorical_min_max_scaler.fit_transform(x)
df_categorical = pd.DataFrame(x_scaled, columns=categorical_cols)
df_categorical

Unnamed: 0,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation
0,0.333333,0.00
1,0.333333,0.25
2,0.166667,0.00
3,0.000000,0.50
4,0.333333,0.00
...,...,...
26600,0.666667,0.50
26601,0.000000,0.00
26602,0.333333,0.00
26603,0.666667,0.25


In [None]:
# Clustered table before decoding: the categorical columns still hold scaled values.
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,cluster
0,0.758621,0.5,0.59,0.333333,0.00,0.0,4
1,0.689655,0.8,0.11,0.333333,0.25,1.0,1
2,0.551724,0.6,0.59,0.166667,0.00,1.0,9
3,0.931034,0.8,1.00,0.000000,0.50,1.0,37
4,0.327586,0.8,0.35,0.333333,0.00,1.0,2
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,0.666667,0.50,0.0,45
26601,0.551724,0.5,0.26,0.000000,0.00,0.0,36
26602,0.551724,0.7,0.26,0.333333,0.00,1.0,2
26603,0.551724,0.7,0.35,0.666667,0.25,1.0,40


In [None]:
# Map the min-max-scaled categorical columns back to their integer codes.
# The scaled input is read from df_DNA (df_DNA_clusters was created as a copy of
# it), so re-running this cell never applies the inverse transform twice.
# np.rint removes floating-point round-off (e.g. 1.9999999) that would otherwise
# fail to match the integer keys of reverse_categorical_map in the next step.
df_DNA_clusters[categorical_cols] = np.rint(
    categorical_min_max_scaler.inverse_transform(df_DNA[categorical_cols])
)
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,cluster
0,0.758621,0.5,0.59,2.0,0.0,0.0,4
1,0.689655,0.8,0.11,2.0,1.0,1.0,1
2,0.551724,0.6,0.59,1.0,0.0,1.0,9
3,0.931034,0.8,1.00,0.0,2.0,1.0,37
4,0.327586,0.8,0.35,2.0,0.0,1.0,2
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,4.0,2.0,0.0,45
26601,0.551724,0.5,0.26,0.0,0.0,0.0,36
26602,0.551724,0.7,0.26,2.0,0.0,1.0,2
26603,0.551724,0.7,0.35,4.0,1.0,1.0,40


In [None]:
# Replace the numeric codes of the categorical columns with the full answer text.
df_DNA_clusters = df_DNA_clusters.replace(reverse_categorical_map)

In [None]:
# Shorten the full answer text to the abbreviated labels.
df_DNA_clusters = df_DNA_clusters.replace(categorical_map_abbreviated)

In [None]:
# Clustered table after decoding: the categorical columns now hold short text labels.
df_DNA_clusters

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,cluster
0,0.758621,0.5,0.59,"Maybe, test",No pref.,0.0,4
1,0.689655,0.8,0.11,"Maybe, test",Prob. limit traffic,1.0,1
2,0.551724,0.6,0.59,No,No pref.,1.0,9
3,0.931034,0.8,1.00,Don't know,Prob. pay,1.0,37
4,0.327586,0.8,0.35,"Maybe, test",No pref.,1.0,2
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,"Yes, no new car",Prob. pay,0.0,45
26601,0.551724,0.5,0.26,Don't know,No pref.,0.0,36
26602,0.551724,0.7,0.26,"Maybe, test",No pref.,1.0,2
26603,0.551724,0.7,0.35,"Yes, no new car",Prob. limit traffic,1.0,40


In [None]:
# Same decoding for df: codes -> full answer text -> abbreviated labels
# (only the columns listed in the two mappings are affected).
df = df.replace(reverse_categorical_map).replace(categorical_map_abbreviated)

In [None]:
# Min-max scale the numeric columns of df into a separate frame.
min_max_scaler = preprocessing.MinMaxScaler()
x = df[numerical_cols].values
x_scaled = min_max_scaler.fit_transform(x)
df_numerical = pd.DataFrame(x_scaled, columns=numerical_cols)

In [None]:
# Overwrite the numeric columns of df with their scaled versions.
# The assignment aligns on the row index; df_numerical was built from df's values
# with a fresh default index — presumably identical to df's index (TODO confirm).
df[numerical_cols] = df_numerical
df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Gender,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.758621,0.5,0.59,0.0,"Maybe, test",No pref.,Maybe yes maybe not
1,0.689655,0.8,0.11,1.0,"Maybe, test",Prob. limit traffic,Probably not
2,0.551724,0.6,0.59,1.0,No,No pref.,Certainly not
3,0.931034,0.8,1.00,1.0,Don't know,Prob. pay,Maybe yes maybe not
4,0.327586,0.8,0.35,1.0,"Maybe, test",No pref.,Probably yes
...,...,...,...,...,...,...,...
26600,0.551724,0.3,0.18,0.0,"Yes, no new car",Prob. pay,Don't know/no answer
26601,0.551724,0.5,0.26,0.0,Don't know,No pref.,Don't know/no answer
26602,0.551724,0.7,0.26,1.0,"Maybe, test",No pref.,Maybe yes maybe not
26603,0.551724,0.7,0.35,1.0,"Yes, no new car",Prob. limit traffic,Probably yes


In [None]:
# One radar per target answer, showing the mean of the scaled numeric features.
column = target_col
# Attach the target answers to the scaled numeric frame so it can be grouped by them.
df_numerical[column] = df[column]
df_DNA_grouped = df_numerical.groupby(column)
df_radar = df_DNA_grouped.mean()

columns = ["IUsers", "EnvImpact", "Age", "Gender"]
filename = 'entire_dataset_target.html'
save_path = radars_path + filename
plot_cols = 3
row_height = 400
plotClustersRadar(df_radar, column, columns, save_path,
                  plot_cols=plot_cols, row_height=row_height)

In [None]:
# Cluster labels ordered from the largest to the smallest cluster
# (sizes are the non-null counts of the table's first column).
(
    df_DNA_clusters
    .groupby("cluster")
    .count()
    .sort_values(by=df_DNA_clusters.columns[0], ascending=False)
    .index
)

Int64Index([ 3,  2,  4,  9, 10,  1, 11,  8,  7, 12,  6,  5, 16, 15, 36, 14, 13,
            17, 20, 18, 33, 28, 50, 19, 41, 24, 25, 40, 31, 23, 21, 49, 42, 43,
            39, 47, 32, 53, 22, 26, 48, 44, 46, 51, 38, 27, 45, 35, 37, 34, 52,
            30, 29],
           dtype='int64', name='cluster')

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def _add_histogram_row(fig, values_df, categories_df, categorical_cols, row, disable_autorange):
  """Add one probability histogram per categorical column to grid row ``row``.

  ``values_df`` supplies the plotted values; ``categories_df`` supplies the
  category order of the x axis, so every row of the figure lists the
  categories in the same order.
  """
  first_colors = ('#eb4034', "#346beb")
  for j, categorical_col in enumerate(categorical_cols):
    # First column red, second blue, every further column green.
    marker_color = first_colors[j] if j < len(first_colors) else "#32a838"
    fig.add_trace(go.Histogram(x=values_df[categorical_col],
                               name=categorical_col,
                               histnorm='probability',
                               marker_color=marker_color),
                  row=row, col=j+2)
    fig.update_yaxes(range=[0, 1], row=row, col=j+2)
    xaxis_args = dict(categoryorder="array",
                      categoryarray=categories_df[categorical_col].unique(),
                      tickangle=90)
    if disable_autorange:
      xaxis_args["autorange"] = False
    fig.update_xaxes(row=row, col=j+2, **xaxis_args)


def plot_radar_hist(df, group_col, numerical_cols, categorical_cols, theta, save_path=None, row_height=300):
  """Plot, for every group of ``df``, a radar of numeric means plus categorical histograms.

  The figure has one row per distinct value of ``group_col`` (largest group
  first) followed by a final "Overall" row computed on the whole frame.
  Column 1 of each row is a radar chart of the mean of ``numerical_cols``;
  the remaining columns are probability histograms of ``categorical_cols``.

  Args:
    df: Input DataFrame containing ``group_col``, ``numerical_cols`` and
      ``categorical_cols``.
    group_col: Name of the column to group by.
    numerical_cols: Columns averaged for the radar charts; their order must
      match ``theta``.
    categorical_cols: Columns shown as histograms, one subplot column each.
    theta: Angular axis labels of the radar charts.
    save_path: If truthy, the figure is also written to this path as HTML.
    row_height: The figure height is ``row_height`` times the number of rows.
  """
  # Largest groups first. size() counts rows regardless of missing values, and
  # the stable sort keeps equally sized groups in label order.
  group_sizes = df.groupby(group_col).size().sort_values(ascending=False, kind="stable")
  group_labels = list(group_sizes.index)
  # Average only the requested numeric columns: string columns (and a numeric
  # group column such as a cluster id) must not end up in the radar values.
  group_means = df.groupby(group_col)[numerical_cols].mean()

  cols = len(categorical_cols)+1
  rows = len(group_labels)+1

  # One title per subplot, in row-major order; the last row is the overall one.
  hist_titles = [categorical_col[:20] for categorical_col in categorical_cols]
  titles = []
  for label in group_labels + ["Overall"]:
    titles.append(str(label))
    titles.extend(hist_titles)

  fig = make_subplots(rows=rows, cols=cols,
                      specs=[[{'type': 'polar'}] + [{'type': 'xy'}]*(cols-1) for _ in range(rows)],
                      horizontal_spacing=0.5/cols,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )

  # Each row holds exactly one polar subplot, so the polar axis number equals the row number.
  polar_args = {}
  for row, label in enumerate(group_labels, start=1):
    fig.add_trace(go.Scatterpolar(
                        r=group_means.loc[label],
                        theta=theta,
                        fill='toself'),
                  row=row, col=1
                  )
    polar_args[f"polar{row}"] = dict(radialaxis=dict(visible=True,
                                                     range=[0.0, 1.0]))
    # NOTE(review): the per-group rows disable x-axis autorange while the overall
    # row leaves it on, as in the original code; confirm this is intentional.
    _add_histogram_row(fig, df.loc[df[group_col] == label], df, categorical_cols,
                       row, disable_autorange=True)

  # Last row: the whole dataset.
  fig.add_trace(go.Scatterpolar(
                      r=df[numerical_cols].mean(),
                      theta=theta,
                      fill='toself',
                      ),
                row=rows, col=1
                )
  polar_args[f"polar{rows}"] = dict(radialaxis=dict(visible=True,
                                                    range=[0.0, 1.0]))
  _add_histogram_row(fig, df, df, categorical_cols, rows, disable_autorange=False)

  fig.update_layout(
      height=row_height*rows,
      showlegend=False,
      bargap=0.05,
      **polar_args
  )

  if save_path:
    fig.write_html(save_path)
  fig.show()

In [None]:
# Radar + histograms for every target answer of the entire dataset.
column = target_col
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
row_height = 400

filename = 'entire_dataset_target_hist.html'
save_path = radars_path + filename
plot_radar_hist(df, column, numerical_cols, categorical_cols, theta,
                save_path=save_path, row_height=row_height)

Int64Index([3, 5, 4, 1, 0, 2], dtype='int64')


In [None]:
# Keep only the rows whose target answer is neither undecided nor missing-by-choice.
df_2 = df[~df[target_col].isin(['Maybe yes maybe not', "Don't know/no answer"])]

In [None]:
# Collapse the four remaining target answers into a binary YES / NO label.
target_map = {
    **dict.fromkeys(("Probably yes", "Certainly yes"), "YES"),
    **dict.fromkeys(("Probably not", "Certainly not"), "NO"),
}
df_2_fin = df_2.assign(**{target_col: df_2[target_col].replace(target_map)})

In [None]:
# Radar + histograms for the binary (YES / NO) target.
column = target_col
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
row_height = 450

filename = 'entire_dataset_2target_hist.html'
save_path = radars_path + filename
plot_radar_hist(df_2_fin, column, numerical_cols, categorical_cols, theta,
                save_path=save_path, row_height=row_height)

Int64Index([1, 0], dtype='int64')


In [None]:
# Radar + histograms for every hierarchical cluster of the entire dataset.
column = "cluster"
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
row_height = 450

filename = 'entire_dataset_clusters_hist.html'
save_path = radars_path + filename
plot_radar_hist(df_DNA_clusters, column, numerical_cols, categorical_cols, theta,
                save_path=save_path, row_height=row_height)

Int64Index([ 2,  1,  3,  8,  9,  0, 10,  7,  6, 11,  5,  4, 15, 14, 35, 13, 12,
            16, 19, 17, 32, 27, 49, 18, 40, 23, 24, 39, 30, 22, 20, 48, 41, 42,
            38, 46, 31, 52, 21, 25, 47, 43, 45, 50, 37, 26, 44, 34, 36, 33, 51,
            29, 28],
           dtype='int64')


# Plot DNA for each country

In [None]:
# Put the Country answers (saved before df was narrowed) back onto df.
# The assignment aligns on the row index.
var = "Country"
df[var] = country_answers

In [None]:
# Radar + histograms per country; the target answers are shown as a third histogram.
column = var
theta = ["IUsers", "EnvImpact", "Age", "Gender"]
row_height = 450

filename = 'entire_dataset_country_hist.html'
save_path = radars_path + filename
plot_radar_hist(df, column, numerical_cols, categorical_cols + [target_col], theta,
                save_path=save_path, row_height=row_height)

Int64Index([13, 23, 22, 12,  6,  0,  8,  7, 27,  3,  9, 10, 26, 25, 24,  2, 21,
            20, 14, 17, 16, 15,  1,  5, 11, 18, 19,  4],
           dtype='int64')


# Postprocessing - Merge clusters

In [None]:
# Inspect a slice of the clustered table, selected by row position.
df_DNA_clusters.iloc[26519:26536]

Unnamed: 0,InternetUsers,Concern_environmental_impacts,grouped_Age,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,cluster
26519,0.551724,0.0,0.35,No,Prob. limit traffic,0.0,11
26520,0.551724,1.0,0.26,"Yes, give up car",Def. pay,1.0,29
26521,0.551724,0.7,0.35,Don't know,Prob. pay,0.0,37
26522,0.551724,0.5,0.26,No,Def. limit traffic,1.0,16
26523,0.551724,0.9,0.18,"Yes, no car influence",No pref.,1.0,20
26524,0.551724,0.8,1.0,"Maybe, test",Prob. limit traffic,1.0,1
26525,0.551724,0.8,1.0,"Yes, no car influence",No pref.,0.0,19
26526,0.551724,0.7,0.26,"Maybe, test",Prob. pay,1.0,5
26527,0.551724,0.3,1.0,"Yes, give up car",No pref.,0.0,50
26528,0.551724,0.7,0.26,"Maybe, test",No pref.,1.0,2


In [None]:
# Side-by-side radar + histograms of two clusters that are candidates for merging.
column = "cluster"

theta = ["IUsers", "EnvImpact", "Age", "Gender"]
c1 = 48
c2 = 49
filename = f'entire_dataset_clusters({c1},{c2})_hist.html'
save_path = f'{radars_path}{filename}'
row_height = 450
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame
# (all rows of c1 first, then all rows of c2).
df_compare = pd.concat(
    [df_DNA_clusters.loc[df_DNA_clusters["cluster"] == c] for c in (c1, c2)]
)
plot_radar_hist(df_compare, column, numerical_cols, categorical_cols, theta, save_path, row_height)

Int64Index([1, 0], dtype='int64')


In [None]:
# Show the full-answer -> short-label mapping for reference.
categorical_map_abbreviated

{'Preference_tolls_or_traffic_limitation': {'Definitely more acceptable to limit road traffic': 'Def. limit traffic',
  'Definitely more acceptable to pay for less congestion': 'Def. pay',
  'No preferences': 'No pref.',
  'Probably more acceptable to limit road traffic': 'Prob. limit traffic',
  'Probably more acceptable to pay for less congestion': 'Prob. pay'},
 'Would_subsribe_car_sharing_if_available': {"Don't know / No answer": "Don't know",
  'Maybe yes, maybe not. I would need to test the service before taking a decision': 'Maybe, test',
  'No, I would not be interested in this service': 'No',
  "Yes I'm already client of a car sharing service": 'Yes, already client',
  'Yes and I would give up one car I currently own': 'Yes, give up car',
  'Yes without any influence on my car ownership': 'Yes, no car influence',
  'Yes, instead of purchasing a new car': 'Yes, no new car'}}

In [None]:
# Mean of every numeric column for each answer of `var`: one column per answer,
# in order of first appearance.
var = 'Preference_tolls_or_traffic_limitation'
# numeric_only=True: the table also holds text columns, which mean() rejects
# in pandas >= 2.0.
df_distr = pd.DataFrame({
    cat: df_DNA_clusters[df_DNA_clusters[var] == cat].mean(numeric_only=True)
    for cat in df_DNA_clusters[var].unique()
})
df_distr

Unnamed: 0,No pref.,Prob. limit traffic,Prob. pay,Def. pay,Def. limit traffic
InternetUsers,0.650028,0.59953,0.605236,0.575544,0.560551
Concern_environmental_impacts,0.576403,0.670556,0.674337,0.696501,0.710708
grouped_Age,0.423781,0.418588,0.412537,0.412026,0.427146
Gender,0.505782,0.455301,0.515616,0.576716,0.496525
cluster,12.094005,11.904626,16.390984,29.094213,18.561685


In [None]:
# Mean of the numeric columns over the rows with Gender == 1 ('Male' in cleanup_nums).
# numeric_only=True skips the text columns, which mean() rejects in pandas >= 2.0.
df_DNA_clusters[df_DNA_clusters['Gender']==1].mean(numeric_only=True)

InternetUsers                     0.610424
Concern_environmental_impacts     0.625676
grouped_Age                       0.424363
Gender                            1.000000
cluster                          14.906867
dtype: float64