In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# Prepare the Conda environment

In [13]:
# Unset PYTHONPATH before installing Miniconda; otherwise it may cause issues
%env PYTHONPATH=

env: PYTHONPATH=


In [14]:
%%bash
# Download the Miniconda installer and run it in batch mode into /usr/local.
# Stop at the first failing command so a failed download is never executed.
set -euo pipefail
MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
# -O overwrites any previous (possibly partial) download; without it wget
# would save a re-download as "<name>.1" and the stale file would be executed.
wget -O "$MINICONDA_INSTALLER_SCRIPT" "https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT"
chmod +x "$MINICONDA_INSTALLER_SCRIPT"
# -b: batch mode (no prompts), -f: do not fail if the prefix exists, -p: install prefix
"./$MINICONDA_INSTALLER_SCRIPT" -b -f -p "$MINICONDA_PREFIX"

Process is interrupted.


In [None]:
%%bash
# Pin the base installation to Python 3.7, then update every package in it.
conda install --yes --channel defaults conda python=3.7
conda update --yes --channel defaults --all

In [None]:
import sys

# Make the packages installed under /usr/local by conda importable from this kernel.
# The membership check keeps the cell idempotent (re-running it does not add
# duplicate entries), and nothing is assigned to `_`, which IPython reserves
# for the last cell output.
conda_site_packages = "/usr/local/lib/python3.7/site-packages"
if conda_site_packages not in sys.path:
    sys.path.append(conda_site_packages)

In [None]:
!conda config --append channels conda-forge

!conda create --name scipydev python=3.7 numpy pybind11 cython pythran pytest gfortran_linux-64 gxx_linux-64 sphinx pydata-sphinx-theme sphinx-panels matplotlib mypy git --yes

In [None]:
!conda install conda-build --yes

In [None]:
%%bash
# Install pandas and scikit-learn into the scipydev environment.
source activate scipydev
conda install --yes pandas scikit-learn

In [None]:
%%bash
# Install plotly from the plotly channel into the scipydev environment.
source activate scipydev
conda install --yes --channel plotly plotly

# Global configuration

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist
from sklearn import preprocessing
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline

# Algorithm identifier; it is also the name of the results sub-folder.
algo = "HC"

# Project folders on the mounted Drive (every path keeps its trailing slash).
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = base_path + "Pictures/"
dataset_path = base_path + "Survey+dataset/"
code_path = base_path + "Code/"
results_path = code_path + "Data/"
HC_base_path = results_path + algo + "/"
radars_path = pictures_path + 'RadarCharts/'

# File extensions used when building file names.
numpy_file_type = ".npy"
image_file_type = ".html"

# Put the project's Code/ folder on the import path.
import sys
sys.path.append(code_path)

# Print numpy floats with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

# DNA selection (params here)

## Check region distribution

In [None]:
# Load the grouped survey dataset.
ds_path = dataset_path + '2_grouped_values.csv'
df = pd.read_csv(ds_path)

In [None]:
# Display the column labels of the dataset loaded above.
df.columns

Index(['Country', 'Gender', 'Age', 'Education', 'Region', 'Profession',
       'Work_status', 'Household_members', 'Income_level',
       'Location_of_resudence', 'Centre_or_suburbs',
       'Public_transport_service', 'Car_driving_license',
       'Number_vehicles_in_household',
       'Considering_electric_or_hybrid_vehicle_next_purchase',
       'Know_what_car_sharing_is', 'Would_subsribe_car_sharing_if_available',
       'Most_frequent_trip_Walk', 'Most_frequent_trip_Bicycle',
       'Most_frequent_trip_Car_as_Driver',
       'Most_frequent_trip_Car_as_Passenger', 'Most_frequent_trip_Train',
       'Most_frequent_trip_Underground_or_light_train',
       'Most_frequent_trip_Tram', 'Most_frequent_trip_Bus',
       'Most_frequent_trip_Motorcycle_or_moped',
       'Destination_most_frequent_trip', 'Frequency_most_frequent_trip',
       'Problem_most_frequent_trip_Congestion',
       'Problem_most_frequent_trip_Parking',
       'Problem_most_frequent_trip_Lack_of_bicycle_lanes',
       

In [None]:
# Latvia: value counts of Region, corrected_Region and InternetUsers.
print(df.loc[df["Country"].eq("Latvia"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Latvia"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Latvia"), "InternetUsers"].value_counts()))

Riga       322
Pieriga    163
Latgale    149
Kurzeme    131
Zemgale    127
Vidzeme    108
Name: Region, dtype: int64

Latvija    1000
Name: corrected_Region, dtype: int64

66    1000
Name: InternetUsers, dtype: int64


In [None]:
# Estonia: value counts of Region, corrected_Region and grouped_Region_3.
print(df.loc[df["Country"].eq("Estonia"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Estonia"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Estonia"), "grouped_Region_3"].value_counts()))

Pőhja-Eesti    407
Lőuna-Eesti    266
Lääne-Eesti    124
Kirde-Eesti    110
Kesk-Eesti      98
Name: Region, dtype: int64

Eesti    1005
Name: corrected_Region, dtype: int64

EE0    1005
Name: grouped_Region_3, dtype: int64


In [None]:
# Lithuania: value counts of Region, corrected_Region and grouped_Region_3.
print(df.loc[df["Country"].eq("Lithuania"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Lithuania"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Lithuania"), "grouped_Region_3"].value_counts()))

Vilniaus apskritis        269
Kauno apskritis           200
Klaipedos apskritis       111
Siauliu apskritis          96
Panevezio apskritis        84
Telsiu apskritis           57
Marijampoles apskritis     57
Utenos apskritis           48
Alytaus apskritis          48
Taurages apskritis         30
Name: Region, dtype: int64

Vidurio ir vakaru Lietuvos regionas    731
Sostines regionas                      269
Name: corrected_Region, dtype: int64

LT0    1000
Name: grouped_Region_3, dtype: int64


In [None]:
# Sweden: value counts of Region, corrected_Region and grouped_Region_3.
# The filter previously said "Estonia", which duplicated the Estonia cell above,
# while the output recorded for this cell lists Swedish regions.
print(df.loc[df["Country"] == "Sweden", "Region"].value_counts())
print("\n", df.loc[df["Country"] == "Sweden", "corrected_Region"].value_counts(), sep="")
print("\n", df.loc[df["Country"] == "Sweden", "grouped_Region_3"].value_counts(), sep="")

Stockholm              228
Västsverige            202
Sydsverige             173
Östra Mellansverige    155
Smĺland med öarna       67
Norra Mellansverige     66
Övre Norrland           59
Mellersta Norrland      54
Name: Region, dtype: int64

Stockholm              228
Västsverige            202
Sydsverige             173
Östra Mellansverige    155
Småland med öarna       67
Norra Mellansverige     66
Övre Norrland           59
Mellersta Norrland      54
Name: corrected_Region, dtype: int64

SE2    442
SE1    383
SE3    179
Name: grouped_Region_3, dtype: int64


In [None]:
# Sweden: value counts of Region, grouped_Region_4 and grouped_Region_3.
print(df.loc[df["Country"].eq("Sweden"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Sweden"), "grouped_Region_4"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Sweden"), "grouped_Region_3"].value_counts()))

Stockholm              228
Västsverige            202
Sydsverige             173
Östra Mellansverige    155
Smĺland med öarna       67
Norra Mellansverige     66
Övre Norrland           59
Mellersta Norrland      54
Name: Region, dtype: int64

SE11    228
SE23    202
SE22    173
SE12    155
SE21     67
SE31     66
SE33     59
SE32     54
Name: grouped_Region_4, dtype: int64

SE2    442
SE1    383
SE3    179
Name: grouped_Region_3, dtype: int64


In [None]:
# Finland: value counts of Region, corrected_Region and InternetUsers.
print(df.loc[df["Country"].eq("Finland"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Finland"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Finland"), "InternetUsers"].value_counts()))

Länsi-Suomi              279
Helsinki-Uusimaa         267
Pohjois- ja Itä-Suomi    234
Etelä-Suomi              225
Name: Region, dtype: int64

Länsi-Suomi              279
Helsinki-Uusimaa         267
Pohjois- ja Itä-Suomi    234
Etelä-Suomi              225
Name: corrected_Region, dtype: int64

84    1005
Name: InternetUsers, dtype: int64


In [None]:
# Belgium: value counts of Region and corrected_Region.
print(df.loc[df["Country"].eq("Belgium"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Belgium"), "corrected_Region"].value_counts()))

Prov. Antwerpen                                                  161
Prov. Oost-Vlaanderen                                            133
Prov. Hainaut                                                    120
Prov. West-Vlaanderen                                            107
Région de Bruxelles-Capitale / Brussels Hoofdstedelijk Gewest    105
Prov. Ličge                                                       98
Prov. Vlaams-Brabant                                              96
Prov. Limburg (BE)                                                81
Prov. Namur                                                       51
Prov. Brabant Wallon                                              27
Prov. Luxembourg (BE)                                             21
Name: Region, dtype: int64

Prov. Antwerpen                                                161
Prov. Oost-Vlaanderen                                          133
Prov. Hainaut                                                  120
Prov. West-V

In [None]:
# Austria: value counts of Region and corrected_Region.
# NOTE(review): the output recorded for this cell also shows grouped_Region_3,
# which only the later Austria cell prints - confirm which version is intended.
print(df.loc[df["Country"].eq("Austria"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Austria"), "corrected_Region"].value_counts()))

Wien                206
Niederösterreich    197
Oberösterreich      169
Steiermark          162
Tirol                78
Salzburg             78
Kärnten              52
Vorarlberg           37
Burgenland (AT)      29
Name: Region, dtype: int64

Wien                206
Niederösterreich    197
Oberösterreich      169
Steiermark          162
Tirol                78
Salzburg             78
Kärnten              52
Vorarlberg           37
Burgenland (AT)      29
Name: corrected_Region, dtype: int64

AT1    432
AT3    362
AT2    214
Name: grouped_Region_3, dtype: int64


In [None]:
# Italy: value counts of Region, grouped_Region_4 and grouped_Region_3.
print(df.loc[df["Country"].eq("Italy"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Italy"), "grouped_Region_4"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Italy"), "grouped_Region_3"].value_counts()))

Lombardia                              158
Lazio                                  108
Piemonte                                89
Puglia                                  88
Emilia-Romagna                          88
Campania                                77
Sicilia                                 74
Veneto                                  65
Toscana                                 55
Sardegna                                38
Friuli-Venezia Giulia                   31
Calabria                                28
Abruzzo                                 28
Marche                                  21
Liguria                                 18
Basilicata                              12
Umbria                                  10
Provincia Autonoma di Bolzano/Bozen      5
Provincia Autonoma di Trento             5
Molise                                   1
Valle d'Aosta/Vallée d'Aoste             1
Name: Region, dtype: int64

ITC4    158
ITI4    108
ITC1     89
ITH5     88
ITF4     88
ITF3     

In [None]:
# Greece: value counts of Region, corrected_Region and grouped_Region_3.
print(df.loc[df["Country"].eq("Greece"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Greece"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Greece"), "grouped_Region_3"].value_counts()))

?tt???                        394
???e?a ???ada                 182
???t?                          68
??at????? ?a?ed???a, T?a??     58
?e??p????s??                   57
Tessa??a                       56
??t??? ???ada                  46
??t??? ?a?ed???a               35
Ste?ea ???ada                  33
?pe????                        28
??t?? ???a??                   28
????a ??s?a                    17
???e?? ???a??                  10
Name: Region, dtype: int64

Attiki                  394
Voreia Ellada           303
Kentriki Ellada         209
Nisia Aigaiou, Kriti    106
Name: corrected_Region, dtype: int64

EL3    394
EL5    303
EL6    209
EL4    106
Name: grouped_Region_3, dtype: int64


In [None]:
# Spain: value counts of Region, grouped_Region_4 and grouped_Region_3.
print(df.loc[df["Country"].eq("Spain"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Spain"), "grouped_Region_4"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Spain"), "grouped_Region_3"].value_counts()))

Andalucía                     179
Cataluńa                      159
Comunidad de Madrid           138
Comunidad Valenciana          118
Castilla y León                59
Galicia                        56
Canarias (ES)                  54
País Vasco                     46
Castilla-la Mancha             41
Principado de Asturias         33
Aragón                         33
Extremadura                    22
Región de Murcia               21
Illes Balears                  19
Comunidad Foral de Navarra     10
Cantabria                       6
La Rioja                        6
Name: Region, dtype: int64

ES61    179
ES51    159
ES3     138
ES52    118
ES41     59
ES11     56
ES7      54
ES21     46
ES42     41
ES12     33
ES24     33
ES43     22
ES62     21
ES53     19
ES22     10
ES23      6
ES13      6
Name: grouped_Region_4, dtype: int64

ES5    296
ES6    200
ES3    138
ES4    122
ES1     95
ES2     95
ES7     54
Name: grouped_Region_3, dtype: int64


In [None]:
# Portugal: value counts of Region, grouped_Region_4 and grouped_Region_3.
print(df.loc[df["Country"].eq("Portugal"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Portugal"), "grouped_Region_4"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Portugal"), "grouped_Region_3"].value_counts()))

Norte                              370
Lisboa                             306
Centro (PT)                        242
Algarve                             31
Alentejo                            29
Regiăo Autónoma da Madeira (PT)     24
Regiăo Autónoma dos Açores (PT)     12
Name: Region, dtype: int64

PT11    370
PT17    306
PT16    242
PT15     31
PT18     29
PT3      24
PT2      12
Name: grouped_Region_4, dtype: int64

PT1    978
PT3     24
PT2     12
Name: grouped_Region_3, dtype: int64


In [None]:
# Germany: value counts of Region, corrected_Region and grouped_Region_3.
print(df.loc[df["Country"].eq("Germany"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Germany"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Germany"), "grouped_Region_3"].value_counts()))

Düsseldorf                94
Thüringen                 71
Oberbayern                58
Köln                      57
Stuttgart                 50
Hannover                  45
Berlin                    45
Karlsruhe                 41
Mecklenburg-Vorpommern    37
Münster                   36
Rheinhessen-Pfalz         31
Schwaben                  31
Freiburg                  30
Dresden                   30
Leipzig                   27
Chemnitz                  27
Bremen                    23
Hamburg                   22
Weser-Ems                 20
Arnsberg                  19
Mittelfranken             18
Schleswig-Holstein        18
Saarland                  16
Detmold                   15
Unterfranken              15
Sachsen-Anhalt            14
Niederbayern              14
Kassel                    13
Darmstadt                 13
Braunschweig              12
Brandenburg               12
Oberfranken               11
Trier                      9
Oberpfalz                  7
Lüneburg      

In [None]:
# Austria: value counts of Region, corrected_Region and grouped_Region_3.
print(df.loc[df["Country"].eq("Austria"), "Region"].value_counts())
print("\n" + str(df.loc[df["Country"].eq("Austria"), "corrected_Region"].value_counts()))
print("\n" + str(df.loc[df["Country"].eq("Austria"), "grouped_Region_3"].value_counts()))

Wien                206
Niederösterreich    197
Oberösterreich      169
Steiermark          162
Tirol                78
Salzburg             78
Kärnten              52
Vorarlberg           37
Burgenland (AT)      29
Name: Region, dtype: int64

Wien                206
Niederösterreich    197
Oberösterreich      169
Steiermark          162
Tirol                78
Salzburg             78
Kärnten              52
Vorarlberg           37
Burgenland (AT)      29
Name: corrected_Region, dtype: int64

AT1    432
AT3    362
AT2    214
Name: grouped_Region_3, dtype: int64


## Prepare VDM

In [None]:
# Metric identifier; later cells branch on it and embed it in file names.
metric = 'VDM'

In [None]:
# Load the filtered survey dataset.
ds_path = dataset_path + '3_filtered_values.csv'
df = pd.read_csv(ds_path)

In [None]:
# Display the column labels of the filtered dataset.
df.columns

Index(['Country', 'Gender', 'Education', 'Profession', 'Work_status',
       'Household_members', 'Income_level', 'Location_of_resudence',
       'Centre_or_suburbs', 'Public_transport_service', 'Car_driving_license',
       'Considering_electric_or_hybrid_vehicle_next_purchase',
       'Know_what_car_sharing_is', 'Would_subsribe_car_sharing_if_available',
       'Most_frequent_trip_Walk', 'Most_frequent_trip_Bicycle',
       'Most_frequent_trip_Car_as_Driver',
       'Most_frequent_trip_Car_as_Passenger', 'Most_frequent_trip_Train',
       'Most_frequent_trip_Underground_or_light_train',
       'Most_frequent_trip_Tram', 'Most_frequent_trip_Bus',
       'Most_frequent_trip_Motorcycle_or_moped',
       'Destination_most_frequent_trip', 'Frequency_most_frequent_trip',
       'Frequent_trip_distance', 'Concern_environmental_impacts',
       'Preference_tolls_or_traffic_limitation',
       'grouped_Frequent_trip_duration_in_minutes', 'grouped_Region_3',
       'InternetUsers', 'grouped_Nu

In [None]:
# Columns that make up the "DNA" selection; the last entry is the target column.
columns = [
  "InternetUsers",
  "Concern_environmental_impacts",
  "Would_subsribe_car_sharing_if_available",
  "Preference_tolls_or_traffic_limitation",
  "Country",
  "Location_of_resudence",
  "Considering_electric_or_hybrid_vehicle_next_purchase",
]

metric = 'VDM'
save_ds = True
# Number of selected columns, not counting the last one.
vars = len(columns) - 1

# Column positions treated as categorical: every position for VDM,
# otherwise all positions from index 2 onwards.
first_categorical = 0 if metric == 'VDM' else 2
categorical_ix = list(range(first_categorical, vars))

df_DNA = df[columns]

In [None]:
# Display the selected DNA columns.
df_DNA

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,Considering_electric_or_hybrid_vehicle_next_purchase
0,75,5,"Maybe yes, maybe not. I would need to test the...",No preferences,Belgium,Metropolitan area of a big city with more than...,Maybe yes maybe not
1,71,8,"Maybe yes, maybe not. I would need to test the...",Probably more acceptable to limit road traffic,France,Small or medium town (less than 250.000 inhabi...,Probably not
2,63,6,"No, I would not be interested in this service",No preferences,Czech Republic,Large city (from 250.000 to 1.000.000 inhabita...,Certainly not
3,85,8,Don't know / No answer,Probably more acceptable to pay for less conge...,Sweden,Metropolitan area of a big city with more than...,Maybe yes maybe not
4,50,8,"Maybe yes, maybe not. I would need to test the...",No preferences,Poland,Metropolitan area of a big city with more than...,Probably yes
...,...,...,...,...,...,...,...
26600,63,3,"Yes, instead of purchasing a new car",Probably more acceptable to pay for less conge...,Cyprus,Rural area,Don't know/no answer
26601,63,5,Don't know / No answer,No preferences,Cyprus,Small or medium town (less than 250.000 inhabi...,Don't know/no answer
26602,63,7,"Maybe yes, maybe not. I would need to test the...",No preferences,Cyprus,Small or medium town (less than 250.000 inhabi...,Maybe yes maybe not
26603,63,7,"Yes, instead of purchasing a new car",Probably more acceptable to limit road traffic,Cyprus,Small or medium town (less than 250.000 inhabi...,Probably yes


# Radar Chart variable value mappings

In [None]:
# Per-column {answer: number} mappings, keyed by column name, used to turn the
# survey answers into numbers for the radar charts.
cleanup_nums = dict(
    # Answers '1'..'5' map to 0..4, "Don't know" takes 5 and '6'..'10' keep
    # their own value.
    Concern_environmental_impacts={
        **{str(score): score - 1 for score in range(1, 6)},
        "Don't know": 5,
        **{str(score): score for score in range(6, 11)},
    },
    # NOTE(review): the original note here read "no should weigh 4"; its
    # meaning is unclear - confirm the intended weight of the "No" answer.
    Would_subsribe_car_sharing_if_available={
        'No, I would not be interested in this service': 0.0,
        "Don't know / No answer": 0.45,
        'Maybe yes, maybe not. I would need to test the service before taking a decision': 0.55,
        'Yes without any influence on my car ownership': 0.7,
        'Yes, instead of purchasing a new car': 0.8,
        'Yes and I would give up one car I currently own': 0.9,
        "Yes I'm already client of a car sharing service": 1.0,
    },
    Preference_tolls_or_traffic_limitation={
        'Definitely more acceptable to pay for less congestion': 0.0,
        'Probably more acceptable to pay for less congestion': 0.25,
        'No preferences': 0.5,
        'Probably more acceptable to limit road traffic': 0.75,
        'Definitely more acceptable to limit road traffic': 1.0,
    },
    # The target column is deliberately left unmapped. Disabled ordinal mapping:
    #   "Considering_electric_or_hybrid_vehicle_next_purchase":
    #     'Certainly not': 0, 'Probably not': 1, "Don't know/no answer": 2,
    #     'Maybe yes maybe not': 3, 'Probably yes': 4, 'Certainly yes': 5
    Location_of_resudence={
        'Rural area': 0.0,
        'Small or medium town (less than 250.000 inhabitants)': 0.33,
        'Large city (from 250.000 to 1.000.000 inhabitants)': 0.66,
        'Metropolitan area of a big city with more than 1.000.000  inhabitants': 1.0,
    },
)

In [None]:
# Columns whose answers are turned into numbers.
numerical_cols = [
  "InternetUsers",
  "Concern_environmental_impacts",
  "Would_subsribe_car_sharing_if_available",
  "Preference_tolls_or_traffic_limitation",
]
# Location_of_resudence is added only when the DNA selection has 7 columns.
if df_DNA.shape[1] == 7:
  numerical_cols += ["Location_of_resudence"]

categorical_cols = ["Country"]
target_col = "Considering_electric_or_hybrid_vehicle_next_purchase"
df_numerical = df_DNA[numerical_cols]

In [None]:
# Work on a copy of the DNA selection, then overwrite its numerical columns
# with the values mapped through cleanup_nums.
df = df_DNA.copy()
df_numerical = df_numerical.replace(to_replace=cleanup_nums)
df[numerical_cols] = df_numerical
df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,Considering_electric_or_hybrid_vehicle_next_purchase
0,75,4,0.55,0.50,Belgium,1.00,Maybe yes maybe not
1,71,8,0.55,0.75,France,0.33,Probably not
2,63,6,0.00,0.50,Czech Republic,0.66,Certainly not
3,85,8,0.45,0.25,Sweden,1.00,Maybe yes maybe not
4,50,8,0.55,0.50,Poland,1.00,Probably yes
...,...,...,...,...,...,...,...
26600,63,2,0.80,0.25,Cyprus,0.00,Don't know/no answer
26601,63,4,0.45,0.50,Cyprus,0.33,Don't know/no answer
26602,63,7,0.55,0.50,Cyprus,0.33,Maybe yes maybe not
26603,63,7,0.80,0.75,Cyprus,0.33,Probably yes


In [None]:
# Min-max scale the two columns listed in true_numerical_cols.
true_numerical_cols = ["InternetUsers", "Concern_environmental_impacts"]
x = df[true_numerical_cols].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
df_numerical = pd.DataFrame(x_scaled, columns=true_numerical_cols)

In [None]:
# Write the min-max scaled values back into df (the assignment aligns rows by index label).
df[true_numerical_cols] = df_numerical
df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,Considering_electric_or_hybrid_vehicle_next_purchase
0,0.758621,0.4,0.55,0.50,Belgium,1.00,Maybe yes maybe not
1,0.689655,0.8,0.55,0.75,France,0.33,Probably not
2,0.551724,0.6,0.00,0.50,Czech Republic,0.66,Certainly not
3,0.931034,0.8,0.45,0.25,Sweden,1.00,Maybe yes maybe not
4,0.327586,0.8,0.55,0.50,Poland,1.00,Probably yes
...,...,...,...,...,...,...,...
26600,0.551724,0.2,0.80,0.25,Cyprus,0.00,Don't know/no answer
26601,0.551724,0.4,0.45,0.50,Cyprus,0.33,Don't know/no answer
26602,0.551724,0.7,0.55,0.50,Cyprus,0.33,Maybe yes maybe not
26603,0.551724,0.7,0.80,0.75,Cyprus,0.33,Probably yes


In [None]:
# Load the normalized DNA dataset saved for the current number of variables.
metric_df_path = dataset_path + '4_DNA_' + str(vars) + 'values_normalized.csv'
metric_df = pd.read_csv(metric_df_path)

In [None]:
# Radar dataset: everything except the target column, with Country taken from
# the normalized label encoding in metric_df.
df_radar_normalized = df.drop(columns=[target_col])
df_radar_normalized["Country"] = metric_df["Country"]
# `vars` is redefined as the number of radar columns before building the file name.
vars = df_radar_normalized.shape[1]
df_radar_normalized_path = dataset_path + '4_DNA_' + str(vars) + 'values_radar_normalized.csv'
df_radar_normalized.to_csv(df_radar_normalized_path, index=False)

In [None]:
# Read the normalized DNA dataset back from disk and display it.
some_path = dataset_path + '4_DNA_' + str(vars) + 'values_normalized.csv'
pd.read_csv(some_path)

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence
0,0.758621,0.4,0.333333,0.50,0.037037,1.000000
1,0.689655,0.8,0.333333,0.75,0.333333,0.333333
2,0.551724,0.6,0.000000,0.50,0.185185,0.666667
3,0.931034,0.8,0.166667,0.25,1.000000,1.000000
4,0.327586,0.8,0.333333,0.50,0.777778,1.000000
...,...,...,...,...,...,...
26600,0.551724,0.2,0.666667,0.25,0.148148,0.000000
26601,0.551724,0.4,0.166667,0.50,0.148148,0.333333
26602,0.551724,0.7,0.333333,0.50,0.148148,0.333333
26603,0.551724,0.7,0.666667,0.75,0.148148,0.333333


# Plot all medoids of a merge

In [9]:
%%bash
# Activate the scipydev conda environment for the commands below.
source activate scipydev
# Change into the Code/scipy folder of the project on Drive.
cd /content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/scipy

# Start the Python interpreter.
# NOTE(review): the Python source that follows in this cell presumably reaches
# the interpreter through the cell's standard input - confirm.
python
# --- import packages
import pandas as pd
import numpy as np
import sys
import os

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist, squareform

def vdm_prepare_df(df, cat_ix):
  """Return a copy of ``df`` whose non-categorical columns are rank-encoded.

  For every column that is NOT selected by ``cat_ix``, each distinct value is
  replaced by its position in the sorted array of that column's unique values.
  Columns selected by ``cat_ix`` are left unchanged.

  Parameters
  ----------
  df : pandas.DataFrame
      Input frame; it is not modified.
  cat_ix : array-like
      Either a boolean mask with one entry per column (True = categorical) or
      a sequence of integer column positions. Plain Python lists are accepted.

  Returns
  -------
  pandas.DataFrame
      New frame with the replacements applied.
  """
  cat_ix = np.asarray(cat_ix)
  if cat_ix.dtype != bool:
    # Integer positions: build the equivalent mask. Applying ``~`` directly to
    # integers would be a bitwise NOT, not a logical complement.
    mask = np.zeros(len(df.columns), dtype=bool)
    mask[cat_ix.astype(np.intp)] = True
    cat_ix = mask

  replace_arr = {}
  for col in df.columns[~cat_ix]:
    ordered_values = np.sort(df[col].unique())
    replace_arr[col] = {value: rank for rank, value in enumerate(ordered_values)}
  return df.replace(replace_arr)

# ------- config vars ---------
# Algorithm identifier; it is also the name of the results sub-folder.
algo = "HC"
# Project folders on the mounted Drive (every path keeps its trailing slash).
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = base_path + "Pictures/"
radars_path = pictures_path + 'RadarCharts/'
dataset_path = base_path + "Survey+dataset/"
results_path = base_path + "Code/Data/"
HC_base_path = results_path + algo + "/"

sys.path.append(f"{base_path}Code")
from postprocessing.merge_cuts import merge_multiple_cuts
from plots.radar_charts import plot_all_medoids_radar
from metrics.silhouette import getSilhouette
from metrics.wss_bss import get_ith_medoid, getMedoidVDM

# Identifiers that are embedded in the input and output file names.
target_class = "multi"
metric = "VDM"
file_type = wss_file_type = ".npy"

# Which variable counts, linkages and post-processing settings to run.
vars_values = [6]
# All linkages: ["single", "complete", "average", "weighted"]
linkages = ["average"]
postprocessing_values = [False, True]

# Lists of cut values to evaluate, indexed by number of variables and linkage.
vars_linkage_cuts = {}
vars_linkage_cuts[5] = {
  "average": [[2, 3, 5, 7, 9], [2, 3, 5, 7, 9, 12]],
  "complete": [[2, 4], [2, 4, 6], [2, 4, 6, 8]],
  "weighted": [[2, 4, 6, 8], [2, 4, 6, 8, 11]],
  "single": [[2, 4, 6, 14, 19, 24, 28, 33, 49, 86]],
}
vars_linkage_cuts[6] = {
  "average": [[2, 3, 5, 7, 9]],
  # Disabled entries:
  #   "complete": [[2,3,5]]
  #   "weighted": [[2,4,6,8,11,13]]
  #   "single":   [[2,4,6,15,19,25,29,31,41,59,75,85,124]]
}

# Values passed as `ith` to get_ith_medoid for every cluster.
iths_medoid = list(range(5))

# ------- compute medoid -------
# For every (number of variables, linkage, cut list, post-processing) combination:
# build the cluster labels, save them, pick the requested medoids of each
# cluster, save them as CSV and plot them as radar charts.

def _ensure_folder(folder_path):
  # Create folder_path (including parents) when it does not exist yet.
  if not os.path.isdir(folder_path):
    os.makedirs(folder_path)
    print("created folder : ", folder_path)

for vars in vars_values:
  print(f"vars: {vars} ...")
  # -------- load dataset ---------
  ds_path = f'{dataset_path}4_DNA_{vars}values_radar_normalized.csv'
  df = pd.read_csv(ds_path)

  # Only the VDM metric prepares the encoded frame used below; fail early
  # (before the large distance matrix is loaded) instead of hitting a
  # NameError on `vdm_df` later.
  if metric != 'VDM':
    raise NotImplementedError(f"metric {metric!r} is not supported: only 'VDM' prepares vdm_df")
  # Mask over the columns: False for the first two, True from index 2 onwards.
  cat_ix = np.arange(vars) >= 2
  vdm_df = vdm_prepare_df(df, cat_ix)

  # Drop the reference to the previous iteration's matrix before loading the
  # next one, so two matrices are never kept alive at the same time.
  distance_matrix = None
  distance_matrix = np.load(f"{results_path}distance_matrix_{metric}{vars}_{target_class}.npy")

  for link in linkages:
    print(f"Link: {link} ...")
    # -------- load linkage and compute matrix --------
    linkage_folder = f"{HC_base_path}{metric}{vars}/{target_class}/"
    HC_path = f"{linkage_folder}{algo}_{metric}_{link}{file_type}"
    linkage_matrix = np.load(HC_path)

    for cut_values in vars_linkage_cuts[vars][link]:
      print(f"cut_values: {cut_values} ...")
      for postprocessing in postprocessing_values:
        print(f"postprocessing: {postprocessing} ...")
        is_postprocessing = "_fix" if postprocessing else ""
        print("Z:", linkage_matrix)
        print("cut_values:", cut_values)

        cluster_labels = merge_multiple_cuts(linkage_matrix, cut_values)
        if postprocessing:
          _, _, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)

        labels_folder = f"{HC_base_path}{metric}{vars}/{target_class}/ClusterLabels/"
        _ensure_folder(labels_folder)
        # NOTE(review): this file name ends in "_.npy", unlike the medoid file
        # names below; kept as is because other code may read it by this name.
        cluster_labels_path = f"{labels_folder}ClusterLabels_{metric}_{link}_{str(cut_values)}{is_postprocessing}_.npy"
        np.save(cluster_labels_path, cluster_labels)
        vdm_df["cluster"] = cluster_labels
        df["cluster"] = cluster_labels

        cluster_values, cluster_labels_counts = np.unique(cluster_labels, return_counts=True)
        # do not get outlier label
        is_regular_cluster = cluster_values > 0
        cluster_values = cluster_values[is_regular_cluster]
        cluster_labels_counts = cluster_labels_counts[is_regular_cluster]
        print(cluster_values, cluster_labels_counts)

        for ith in iths_medoid:
          medoids = []
          for i, cluster in enumerate(cluster_values):
            vdm_df_cluster = vdm_df.loc[vdm_df["cluster"]==cluster].drop("cluster", axis=1)
            df_cluster = df.loc[df["cluster"]==cluster]
            df_cluster = df_cluster.assign(cluster_count=cluster_labels_counts[i])
            medoid = get_ith_medoid(vdm_df_cluster, df_cluster, metric, ith)
            medoids.append(medoid)
          medoids_df = pd.DataFrame(medoids)
          medoids_df["cluster"] = medoids_df["cluster"].astype(np.int64)
          medoids_df["cluster_count"] = medoids_df["cluster_count"].astype(np.int64)
          # Largest clusters first.
          medoids_df = medoids_df.sort_values(by='cluster_count', ascending=False)

          medoids_folder = f"{HC_base_path}{metric}{vars}/{target_class}/Medoids/"
          _ensure_folder(medoids_folder)
          medoids_paths = f"{medoids_folder}Medoids{ith}_{metric}_{link}_{str(cut_values)}{is_postprocessing}.csv"
          medoids_df.to_csv(medoids_paths, index=False)

          radar_folder = f"{radars_path}{metric}{vars}/{target_class}/Medoids/"
          _ensure_folder(radar_folder)
          medoids_radar_paths = f"{radar_folder}Medoids{ith}_{metric}_{link}_{str(cut_values)}{is_postprocessing}.html"
          plot_all_medoids_radar(medoids_df.drop("cluster_count", axis=1), medoids_df["cluster_count"].values, save_path=medoids_radar_paths)

vars: 6 ...
Link: average ...
cut_values: [2, 3, 5, 7, 9] ...
postprocessing: False ...
Z: [[0.00000000e+00 1.02220000e+04 0.00000000e+00 2.00000000e+00]
 [1.00000000e+00 1.23390000e+04 0.00000000e+00 2.00000000e+00]
 [2.00000000e+00 7.88200000e+03 0.00000000e+00 2.00000000e+00]
 ...
 [5.31990000e+04 5.32040000e+04 1.10520397e-01 1.48600000e+03]
 [5.31960000e+04 5.32050000e+04 1.27305737e-01 2.51190000e+04]
 [5.32060000e+04 5.32070000e+04 2.29885010e-01 2.66050000e+04]]
cut_values: [2, 3, 5, 7, 9]
max_split: 25119
[[    2   514]
 [    3 24605]]
[2 3]
k1(2): [1 2]
k2(3): [2 3]
[2] [2 3]
[2] [5 6]
max_split: 24605
[[    4   966]
 [    5 23639]]
[4 5]
k1(2): [1 5 6]
k2(5): [4 5]
[6] [4 5]
[6] [11 12]
max_split: 23639
[[    6 20110]
 [    7  3529]]
[6 7]
k1(2): [ 1  5 11 12]
k2(7): [6 7]
[12] [6 7]
[12] [19 20]
max_split: 20110
[[    6 10709]
 [    7  9401]]
[6 7]
k1(2): [ 1  5 11 19 20]
k2(9): [6 7]
[19] [6 7]
[19] [27 28]
[ 1  5 11 20 27 28] [ 1486   514   966  3529 10709  9401]
[ 60.737

tcmalloc: large alloc 5662613504 bytes == 0x55cec8304000 @  0x7f31aa0a31e7 0x7f31a7afc98d 0x7f31a7afca07 0x7f31a7b3ceaa 0x7f31a7b411c4 0x7f31a7bd636f 0x55cec2cfbb5c 0x55cec2d32240 0x55cec2d761bc 0x55cec2ccabb3 0x55cec2ceb223 0x55cec2d320c5 0x55cec2d761bc 0x55cec2ccabb3 0x55cec2ceb1c1 0x55cec2d320c5 0x55cec2d79602 0x55cec2ccabb3 0x55cec2ccbee3 0x55cec2dd9802 0x55cec2de394e 0x55cec2de3b3b 0x55cec2ca2b53 0x55cec2de4ccc 0x7f31a92dcbf7 0x55cec2d89555


## Test plot medoid radar

In [None]:
# Pick one saved medoid table (VDM, 5 variables, single linkage) and display it.
metric = "VDM"
vars = 5
link = "single"
cut_values = [2, 4, 6, 14, 19, 24, 28, 33, 49, 86]
medoids_paths = "{}{}{}/Medoids/Medoids_{}_{}_{}.csv".format(
    HC_base_path, metric, vars, metric, link, str(cut_values)
)
medoids_df = pd.read_csv(medoids_paths)
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,cluster,cluster_count
0,0.5,0.7,0.55,0.5,0.888889,83,16680.0
1,0.5,0.8,0.0,1.0,0.888889,60,4046.0
2,0.5,0.8,0.8,0.0,0.888889,2,1486.0
3,0.603448,0.7,0.0,0.5,0.333333,13,1401.0
4,0.448276,0.5,0.0,0.5,0.481481,4,966.0
5,0.793103,0.7,0.0,0.5,0.259259,42,882.0
6,0.448276,0.0,0.0,0.5,0.481481,9,514.0
7,0.758621,0.7,0.0,0.5,0.037037,59,479.0
8,0.724138,0.8,0.55,0.5,0.037037,14,92.0
9,0.87931,0.7,0.0,0.5,0.407407,21,59.0


In [None]:
import plotly.express as px
def plot_individual_radar(fig, polar_args, r, theta, index, row=1, col=1):
  """Add one filled radar trace to a subplot cell and register its radial axis.

  Args:
    fig: figure that receives the trace.
    polar_args: dict collecting per-subplot layout settings; it is updated in
      place under the key "polar{index}".
    r: radial values of the trace.
    theta: axis labels of the trace.
    index: subplot number; it names the layout key and picks the line colour,
      cycling through px.colors.qualitative.Plotly.
    row, col: grid position handed to fig.add_trace.

  Returns:
    The same (fig, polar_args) pair, after both were updated.
  """
  palette = px.colors.qualitative.Plotly
  radar_trace = go.Scatterpolar(
      r=r,
      theta=theta,
      showlegend=False,
      fill='toself',
      line_color=palette[index % len(palette)],
  )
  fig.add_trace(radar_trace, row=row, col=col)

  # Every radar shares the same fixed radial scale.
  polar_args[f"polar{index}"] = {"radialaxis": {"visible": True, "range": [0.0, 1.0]}}

  return fig, polar_args

In [None]:
def plot_all_medoids_radar(medoids_df, cluster_labels_counts, plot_cols=3,row_height=300, save_path=None):
  """Draw one radar chart per medoid on a grid of polar subplots.

  Args:
    medoids_df: DataFrame with one row per medoid. It must contain a "cluster"
      column and the plotted feature columns must come first. Two layouts are
      supported, identified by the number of columns left after removing "cluster":
        * 5 columns: the first 4 columns are plotted.
        * 6 columns: "Country" is removed and the first 5 remaining columns are plotted.
    cluster_labels_counts: iterable whose items, converted with str(), become the
      subplot titles (the calls in this notebook pass cluster sizes).
    plot_cols: number of subplot columns in the grid.
    row_height: height in pixels given to each grid row.
    save_path: when truthy, the figure is also written to this path as HTML.

  Raises:
    ValueError: if medoids_df matches neither supported layout.
  """
  cols = plot_cols
  n_medoids = medoids_df.shape[0]
  rows = int(np.ceil(n_medoids/cols))
  print(f"rows: {rows}, cols: {cols}")

  # The layout is a property of the whole table, so resolve it once instead of
  # once per subplot, and fail loudly instead of plotting with unbound or stale values.
  n_value_cols = medoids_df.drop("cluster", axis=1).shape[1]
  if n_value_cols == 5:
    source_df = medoids_df
    theta = ["IUsers", "EnvImpact", "CarShare", "TollsTraffic"]
  elif n_value_cols == 6:
    source_df = medoids_df.drop("Country", axis=1)
    theta = ["IUsers", "EnvImpact", "CarShare", "TollsTraffic", "Residence"]
  else:
    raise ValueError(
        f"Unsupported medoid layout: expected 5 or 6 columns besides 'cluster', got {n_value_cols}")

  # Keep exactly one value per axis label; the trailing columns (labels, counts)
  # are not radar values and must not be handed to the trace.
  radar_values = source_df.iloc[:, :len(theta)]

  #radar_title = f"Radars of the most representative person in each cluster"
  titles = [str(i) for i in cluster_labels_counts] # get cluster size as title
  # Build an independent spec dict per cell rather than repeating one shared object.
  specs = [[{'type': 'polar'} for _ in range(cols)] for _ in range(rows)]
  fig = make_subplots(rows=rows, cols=cols,
                            specs=specs,
                            horizontal_spacing=0.3/cols,
                            vertical_spacing=0.4/rows,
                            subplot_titles=titles,
                            )
  polar_args = {}
  for idx in range(n_medoids):
    # Fill the grid row by row; subplot numbering is 1-based.
    fig, polar_args = plot_individual_radar(fig, polar_args,
                                            radar_values.iloc[idx], theta,
                                            index=idx+1,
                                            row=idx//cols+1, col=idx%cols+1)
  fig.update_layout(
    height=row_height*rows,
    **polar_args)

  # Fixing subplot title positioning:
  # https://stackoverflow.com/questions/65775407/can-you-alter-a-subplot-title-location-in-plotly
  fig.update_annotations(patch=dict(yshift=20))
  fig.show()

  if save_path:
    fig.write_html(save_path)

In [None]:
# Inspect the medoid table that the radar-plot call further down receives.
# NOTE(review): the saved output lists six feature columns and no cluster_count,
# unlike the table read in the first cell of this section — presumably medoids_df
# was reassigned by a cell run out of order; confirm on a fresh kernel.
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,cluster
0,0.5,0.8,0.8,0.0,0.888889,0.33,1.0
1,0.448276,0.0,0.0,0.5,0.481481,0.33,5.0
2,0.448276,0.5,0.0,0.5,0.481481,0.0,11.0
3,0.5,1.0,0.45,0.75,0.925926,0.33,20.0
4,0.568966,0.7,0.55,0.5,0.962963,0.0,27.0
5,0.689655,0.7,0.45,0.5,0.333333,0.0,28.0


In [None]:
from postprocessing.merge_cuts import merge_multiple_cuts

In [None]:
# Load the array saved for this metric/linkage (presumably the hierarchical-clustering
# linkage matrix — TODO confirm) and derive cluster labels from several cuts at once.
HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}.npy"
Z = {}  # loaded arrays keyed by metric name
Z[metric] = np.load(HC_path)
# merge_multiple_cuts is a project helper; presumably it returns one label per sample — TODO confirm.
cluster_labels = merge_multiple_cuts(Z[metric], cut_values)
# Only the per-label counts are kept; they are passed on as subplot titles.
_, cluster_labels_counts = np.unique(cluster_labels, return_counts=True)

k1: [1 2]
k2: [2, 3]
k1: [1 5 6]
k2: [4, 5]
k1: [ 1  5 11 12]
k2: [6, 7]
k1: [ 1  5 11 19 20]
k2: [6, 7]


In [None]:
# Destination of the HTML figure; str(cut_values) puts the list repr in the file name.
medoids_radar_paths = f"{radars_path}{metric}{vars}/Medoids/Medoids_{metric}_{link}_{str(cut_values)}.html"
plot_all_medoids_radar(medoids_df, cluster_labels_counts, save_path=medoids_radar_paths)  # shows the figure and writes the HTML file

rows: 2, cols: 3


# Plot medoids with respect to the target

In [None]:
%%bash
# Run the analysis with the interpreter of the scipydev conda environment
# instead of the notebook kernel.
source activate scipydev
cd /content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/scipy

# NOTE(review): python is started without a script argument; the Python source
# that follows in this cell is presumably consumed from the inherited standard
# input — confirm before restructuring the cell.
python
# --- import packages
import pandas as pd
import numpy as np
import sys
import os

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist, squareform

def vdm_prepare_df(df, cat_ix):
  """Return df with every non-categorical column recoded as ordinal ranks.

  For each column whose entry in the boolean array cat_ix is False, every
  distinct value is replaced by its position in the ascending order of that
  column's distinct values (0 for the smallest). Columns flagged True are
  left as they are. The input frame is not modified.
  """
  ranked_columns = df.columns[~cat_ix]
  rank_by_column = {
      name: {val: rank for rank, val in enumerate(np.sort(df[name].unique()))}
      for name in ranked_columns
  }
  return df.replace(rank_by_column)


def target_sorter(column):
  """Sort key: translate each answer label into its rank on the no-to-yes scale.

  Labels that are not part of the scale map to NaN.
  """
  ordered_answers = (
      "Certainly not",
      "Probably not",
      "Don't know/no answer",
      'Maybe yes maybe not',
      "Probably yes",
      "Certainly yes",
  )
  rank_of = dict(zip(ordered_answers, range(len(ordered_answers))))
  return column.map(rank_of)

# ------- config vars ---------
# This script runs in its own Python process, so the path configuration of the
# notebook is declared again here.
algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
radars_path = f'{pictures_path}RadarCharts/'

# Make the project packages importable (presumably they live under Code/ — TODO confirm).
sys.path.append(f"{base_path}Code")
# NOTE(review): merge_multiple_cuts, getSilhouette and getMedoidVDM are not called
# in the visible part of this script.
from postprocessing.merge_cuts import merge_multiple_cuts
from plots.radar_charts import plot_all_medoids_radar
from metrics.silhouette import getSilhouette
from metrics.wss_bss import get_ith_medoid, getMedoidVDM

metric="VDM"
# NOTE(review): file_type and wss_file_type are not referenced in the visible part of this script.
file_type = ".npy"
wss_file_type = ".npy"
# Each value selects the dataset file 4_DNA_{vars}values_radar_normalized.csv
# and the {metric}{vars} output sub-folder.
vars_values = [5,6]
# Column of 3_filtered_values.csv whose answers define the groups.
target = "Considering_electric_or_hybrid_vehicle_next_purchase"
# "multi" keeps every answer as its own group; "binary" drops the two undecided
# answers and collapses the remaining ones into YES / NO.
target_class_types = ["multi","binary"]
# Ranks handed one at a time to get_ith_medoid; output folders are numbered ith+1.
iths_medoid = [0,1,2,3,4,5,6,7,8,9]

# ------- compute medoid -------
def _ensure_dir(folder_path):
  """Create folder_path (including parents) when it is missing and report it."""
  if not os.path.isdir(folder_path):
    os.makedirs(folder_path)
    print("created folder : ", folder_path)


# For every dataset size and every target encoding, group the respondents by
# their target answer, pick the ith medoid of each group, then save the medoid
# table as CSV and its radar charts as HTML.
for n_vars in vars_values:
  print(f"vars: {n_vars} ...")
  # -------- load dataset ---------
  ds_path = f'{dataset_path}4_DNA_{n_vars}values_radar_normalized.csv'
  df = pd.read_csv(ds_path)

  ds_target_path = f'{dataset_path}3_filtered_values.csv'
  df_target = pd.read_csv(ds_target_path)

  if metric == 'VDM':
    # Positions 0 and 1 are flagged non-categorical, positions 2..n_vars-1 categorical.
    cat_ix = np.arange(n_vars) >= 2
    vdm_df = vdm_prepare_df(df, cat_ix)

  for target_class in target_class_types:
    print(f"******* target_class: {target_class}")
    # Derive per-encoding frames instead of overwriting df / vdm_df / df_target,
    # so each encoding starts from the full data regardless of the order of
    # target_class_types, and no value is assigned into a filtered slice.
    target_values = df_target[target]
    df_tc = df
    vdm_tc = vdm_df
    if target_class == "binary":
      mask = ~target_values.isin(['Maybe yes maybe not', "Don't know/no answer"])
      target_values = target_values[mask]
      vdm_tc = vdm_df[mask]
      df_tc = df[mask]
      print("******No more maybes")
      target_map = {
          "Probably yes": "YES",
          "Certainly yes": "YES",
          "Probably not": "NO",
          "Certainly not": "NO",
      }
      print(target_values.unique())
      target_values = target_values.replace(target_map)

    # The target answer plays the role of the cluster label.
    cluster_labels = target_values.values
    vdm_tc = vdm_tc.assign(cluster=cluster_labels)
    df_tc = df_tc.assign(cluster=cluster_labels)

    cluster_values, cluster_labels_counts = np.unique(cluster_labels, return_counts=True)
    print(cluster_values, cluster_labels_counts)

    for ith in iths_medoid:
      medoids = []
      for cluster, cluster_count in zip(cluster_values, cluster_labels_counts):
        in_cluster = df_tc["cluster"] == cluster
        vdm_df_cluster = vdm_tc.loc[vdm_tc["cluster"]==cluster].drop("cluster", axis=1)
        df_cluster = df_tc.loc[in_cluster].assign(cluster_count=cluster_count)
        medoid = get_ith_medoid(vdm_df_cluster, df_cluster, metric, ith)
        medoids.append(medoid)
      medoids_df = pd.DataFrame(medoids)
      if target_class == "multi":
        print(f"******* sorting multi")
        # Order the rows along the no-to-yes answer scale instead of alphabetically.
        medoids_df = medoids_df.sort_values(by='cluster', key=target_sorter)

      folder_path = f"{HC_base_path}{metric}{n_vars}/{target_class}/Medoids/Target/Medoids{ith+1}/"
      _ensure_dir(folder_path)
      medoids_paths = f"{folder_path}Medoids_{metric}_{target_class}.csv"
      medoids_df.to_csv(medoids_paths, index=False)

      folder_path = f"{radars_path}{metric}{n_vars}/{target_class}/Medoids/Target/Medoids{ith+1}/"
      _ensure_dir(folder_path)
      medoids_radar_paths = f"{folder_path}Medoids_{metric}_{target_class}.html"
      # The group sizes are split off the table and used as subplot titles.
      plot_all_medoids_radar(medoids_df.drop("cluster_count", axis=1), medoids_df["cluster_count"].values, save_path=medoids_radar_paths)

vars: 5 ...
******* target_class: multi
['Certainly not' 'Certainly yes' "Don't know/no answer"
 'Maybe yes maybe not' 'Probably not' 'Probably yes'] [2647 2781 2139 8042 5312 5684]
[145.85585    145.85585    145.85585    ... 701.48219924 720.14120713
 723.25939894]
[172.26396065 172.74489058 174.23777236 ... 712.02026672 733.75861049
 765.35123956]
[116.93916916 119.00578156 119.00578156 ... 543.34399035 549.61787509
 580.0102343 ]
[ 383.50065047  383.50065047  383.50065047 ... 2150.22337024 2153.81593366
 2210.1084314 ]
[ 254.54822508  254.54822508  254.54822508 ... 1467.70995727 1481.79221687
 1543.97460969]
[ 297.63736628  297.63736628  297.63736628 ... 1531.28606373 1546.5269177
 1555.33614919]
******* sorting multi
rows: 2, cols: 3
[145.85585    145.85585    145.85585    ... 701.48219924 720.14120713
 723.25939894]
[172.26396065 172.74489058 174.23777236 ... 712.02026672 733.75861049
 765.35123956]
[116.93916916 119.00578156 119.00578156 ... 543.34399035 549.61787509
 580.0102343

## Test radar plots with respect to the target

In [None]:
# Load a saved target-wise medoid table (vars = 6, target_class "multi") for inspection.
# NOTE(review): the generator script in the previous section writes to
# {metric}{vars}/{target_class}/Medoids/Target/Medoids{ith+1}/, which is a different
# layout from the path built here — presumably this file comes from an earlier run;
# confirm it still exists.
metric = "VDM"
vars = 6  # NOTE: shadows the builtin vars()
target_class = "multi"
medoids_paths = f"{HC_base_path}{metric}{vars}/Medoids/Medoids_{metric}_{target_class}.csv"
medoids_df = pd.read_csv(medoids_paths)
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,cluster,cluster_count
0,0.448276,0.4,0.45,0.5,0.481481,0.0,Certainly not,2647
1,0.448276,0.7,0.0,0.5,0.481481,0.0,Probably not,5312
2,0.448276,0.7,0.0,0.5,0.481481,0.33,Don't know/no answer,2139
3,0.448276,0.7,0.0,0.5,0.481481,0.33,Maybe yes maybe not,8042
4,0.448276,0.8,0.55,0.5,0.481481,0.0,Probably yes,5684
5,0.5,0.8,0.55,0.5,0.925926,0.0,Certainly yes,2781


In [None]:
# Same settings as the preceding cell, but the table is read from the Medoids2
# folder (presumably the second-ranked medoid of each group — TODO confirm).
# Running this cell replaces the medoids_df loaded by the preceding cell.
metric = "VDM"
vars = 6  # NOTE: shadows the builtin vars()
target_class = "multi"
medoids_paths = f"{HC_base_path}{metric}{vars}/Medoids2/Medoids_{metric}_{target_class}.csv"
medoids_df = pd.read_csv(medoids_paths)
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,cluster,cluster_count
0,0.448276,0.4,0.55,0.5,0.481481,0.33,Certainly not,2647
1,0.448276,0.7,0.7,0.5,0.481481,0.0,Probably not,5312
2,0.741379,0.7,0.45,0.5,0.407407,0.33,Don't know/no answer,2139
3,0.448276,0.7,0.55,0.5,0.481481,0.33,Maybe yes maybe not,8042
4,0.5,0.8,0.55,0.5,0.888889,0.0,Probably yes,5684
5,0.5,0.8,0.55,0.5,0.888889,0.33,Certainly yes,2781


In [None]:
# Scratch check: index label of the second row.
medoids_df.index[1]

1

In [None]:
# Scratch check: index label of the first row that remains once the first row
# is dropped (drop returns a new frame; medoids_df itself is unchanged).
medoids_df.drop(medoids_df.index[0]).index[0]

1

In [None]:
# Flag the rows whose six feature values all equal those of the first row.
(
    medoids_df
    .loc[:, [
        "InternetUsers",
        "Concern_environmental_impacts",
        "Would_subsribe_car_sharing_if_available",
        "Preference_tolls_or_traffic_limitation",
        "Country",
        "Location_of_resudence",
    ]]
    .pipe(lambda features: features.eq(features.iloc[0], axis="columns"))
    .all(axis="columns")
)

0     True
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [None]:
# Display the current medoid table before it is re-sorted further down.
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,cluster,cluster_count
0,0.448276,0.4,0.45,0.5,0.481481,0.0,Certainly not,2647
1,0.448276,0.7,0.0,0.5,0.481481,0.0,Probably not,5312
2,0.448276,0.7,0.0,0.5,0.481481,0.33,Don't know/no answer,2139
3,0.448276,0.7,0.0,0.5,0.481481,0.33,Maybe yes maybe not,8042
4,0.5,0.8,0.55,0.5,0.888889,0.0,Probably yes,5684
5,0.5,0.8,0.55,0.5,0.925926,0.0,Certainly yes,2781


In [None]:
def target_sorter(column):
    """Key function for sort_values: rank answer labels from "Certainly not" up to "Certainly yes".

    Labels outside the scale map to NaN.
    """
    scale = ["Certainly not","Probably not", "Don't know/no answer",'Maybe yes maybe not',"Probably yes","Certainly yes"]
    positions = {}
    for position, answer in enumerate(scale):
        positions[answer] = position
    return column.map(positions)


In [None]:
# Display order of the target labels for each target encoding.
sorters = {
  "multi": ["Certainly not","Probably not", "Don't know/no answer",'Maybe yes maybe not',"Probably yes","Certainly yes"],
  "binary": ["NO", "YES"]
}


def tm_sorter(column):
  """Sort key: rank each label by its position in sorters[target_class].

  Defined in this cell so the sort below no longer depends on a tm_sorter left
  over in kernel state. Labels missing from the selected order map to NaN.
  """
  rank_of = {label: rank for rank, label in enumerate(sorters[target_class])}
  return column.map(rank_of)


medoids_df = medoids_df.sort_values(by='cluster', key=tm_sorter)
medoids_df

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Country,Location_of_resudence,cluster
0,0.448276,0.4,0.45,0.5,0.481481,0.0,Certainly not
4,0.448276,0.7,0.0,0.5,0.481481,0.0,Probably not
2,0.448276,0.7,0.0,0.5,0.481481,0.33,Don't know/no answer
3,0.448276,0.7,0.0,0.5,0.481481,0.33,Maybe yes maybe not
5,0.5,0.8,0.55,0.5,0.888889,0.0,Probably yes
1,0.5,0.8,0.55,0.5,0.925926,0.0,Certainly yes
