In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
%matplotlib inline
import sys
sys.path.append("/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code")
np.set_printoptions(precision=5, suppress=True)

In [3]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/3_filtered_values.csv'

df = pd.read_csv(ds_path)

# Clustering


## DNA selection

In [4]:
columns = [
             "InternetUsers",
             "Concern_environmental_impacts",
             "Would_subsribe_car_sharing_if_available", 
             "Preference_tolls_or_traffic_limitation",
             "Gender",
             "grouped_Age",
             #"grouped_Region_3", 
             #"Country", 
             ]
df_DNA = df[columns]
#categorical_ix = [0, 1, 2, 4, 5, 6, 7]
categorical_ix = [2,3,4,5]

In [5]:
df_DNA["Would_subsribe_car_sharing_if_available"].unique()

array(['Maybe yes, maybe not. I would need to test the service before taking a decision',
       'No, I would not be interested in this service',
       "Don't know / No answer",
       'Yes without any influence on my car ownership',
       'Yes, instead of purchasing a new car',
       'Yes and I would give up one car I currently own',
       "Yes I'm already client of a car sharing service"], dtype=object)

In [6]:
df_DNA["grouped_Age"].unique()

array(['47:51', '23:27', '54:85', '35:39', '39:43', '27:30', '16:23',
       '43:47', '30:35', '51:54'], dtype=object)

In [7]:
df_DNA.loc[df_DNA["grouped_Age"]==2]

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Age


In [8]:
cleanup_nums = {"Concern_environmental_impacts":
                  {
                      '1': 1,
                      '2': 2,
                      '3': 3,
                      '4': 4,
                      '5': 5,
                      '6': 6,
                      '7': 7,
                      '8': 8,
                      '9': 9,
                      '10': 10,
                      "Don't know": 0
                  },
                "Would_subsribe_car_sharing_if_available":
                  {
                      "Don't know / No answer": 0,
                      'No, I would not be interested in this service': 1,
                      'Maybe yes, maybe not. I would need to test the service before taking a decision': 2,
                      'Yes without any influence on my car ownership': 3,
                      'Yes, instead of purchasing a new car': 4,
                      'Yes and I would give up one car I currently own': 5,
                      "Yes I'm already client of a car sharing service": 6
                  },
                "Preference_tolls_or_traffic_limitation":
                  {
                      'No preferences': 0,
                      'Probably more acceptable to limit road traffic': 1,
                      'Probably more acceptable to pay for less congestion': 2,
                      'Definitely more acceptable to pay for less congestion': 3,
                      'Definitely more acceptable to limit road traffic': 4
                  },
                "Gender":
                  {
                      'Female': 0,
                      'Male': 1
                  },
                "grouped_Age":
                  {
                      '16:23': 0,
                      '23:27': 1,
                      '23:27': 2,
                      '27:30': 3,
                      '30:35': 4,
                      '35:39': 5, 
                      '39:43': 6,
                      '43:47': 7,
                      '47:51': 8,
                      '51:54': 9,
                      '54:85': 10,
                  }
                }

In [9]:
df_DNA = df_DNA.replace(cleanup_nums)

In [10]:
df_DNA.dtypes

InternetUsers                              int64
Concern_environmental_impacts              int64
Would_subsribe_car_sharing_if_available    int64
Preference_tolls_or_traffic_limitation     int64
Gender                                     int64
grouped_Age                                int64
dtype: object

In [11]:
df_DNA

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Age
0,75,5,2,0,0,8
1,71,8,2,1,1,2
2,63,6,1,0,1,8
3,85,8,0,2,1,10
4,50,8,2,0,1,5
...,...,...,...,...,...,...
26600,63,3,4,2,0,3
26601,63,5,0,0,0,4
26602,63,7,2,0,1,4
26603,63,7,4,1,1,5


### Normalize variables

In [12]:
x = df_DNA.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_DNA = pd.DataFrame(x_scaled)
df_DNA.columns = columns

In [13]:
x_original = min_max_scaler.inverse_transform(x_scaled)
df_DNA_original = pd.DataFrame(x_original)

In [14]:
df_DNA

Unnamed: 0,InternetUsers,Concern_environmental_impacts,Would_subsribe_car_sharing_if_available,Preference_tolls_or_traffic_limitation,Gender,grouped_Age
0,0.758621,0.5,0.333333,0.00,0.0,0.8
1,0.689655,0.8,0.333333,0.25,1.0,0.2
2,0.551724,0.6,0.166667,0.00,1.0,0.8
3,0.931034,0.8,0.000000,0.50,1.0,1.0
4,0.327586,0.8,0.333333,0.00,1.0,0.5
...,...,...,...,...,...,...
26600,0.551724,0.3,0.666667,0.50,0.0,0.3
26601,0.551724,0.5,0.000000,0.00,0.0,0.4
26602,0.551724,0.7,0.333333,0.00,1.0,0.4
26603,0.551724,0.7,0.666667,0.25,1.0,0.5


In [15]:
ds_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Survey+dataset/4_DNA_values.csv'

df_DNA.to_csv(ds_path)

## Load cluster data

In [None]:
from scipy.cluster.hierarchy import fcluster
HC_path = '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/Data/HC_complete_heom.csv'

In [None]:
Z = {}
Z['HEOM'] = np.loadtxt(HC_path, delimiter=',')
cluster = fcluster(Z['HEOM'], 3, criterion='maxclust')

## Distance Metrics


In [None]:
from DistanceMetrics.heom_c import HEOM_C
from DistanceMetrics.euclidean import euclidean
heom_metric = HEOM_C(df_DNA, categorical_ix)

In [None]:
import timeit
def heom_f():
  heom_metric.heom(df_DNA.iloc[0], df_DNA.iloc[1])
def euclidean_f():
  euclidean(df_DNA.iloc[0].values, df_DNA.iloc[1].values)
num_runs = 10000
duration_heom = timeit.Timer(heom_f).timeit(number = num_runs)
duration_euclidean = timeit.Timer(euclidean_f).timeit(number = num_runs)
avg_duration_heom = duration_heom/num_runs
avg_duration_euclidean = duration_euclidean/num_runs
print(f'HEOM on average it took {avg_duration_heom} seconds')
print(f'Euclidean on average it took {avg_duration_euclidean} seconds')


HEOM on average it took 0.00021941126229994552 seconds
Euclidean on average it took 0.00015571328300002278 seconds


## Plot grids

### TSNE

In [None]:
from sklearn.manifold import TSNE

perplexity=50
n_components=3
tsne = TSNE(n_components=n_components, 
            verbose=1, 
            perplexity=perplexity, 
            random_state=2867976,
            metric=euclidean
            )
# tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=300)
tsne_result = tsne.fit_transform(df_DNA)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 26605 samples in 0.513s...


In [None]:
df_dict = {}
for i in df_DNA.columns:
  df_dict[i] = df_DNA[i].values
df_dict

In [None]:
if n_components == 2:
  components = {'x':tsne_result[:,0], 'y':tsne_result[:,1]}
else:
  components = {'x':tsne_result[:,0], 'y':tsne_result[:,1], 'z':tsne_result[:,2]}

df_tsne = pd.DataFrame({**components, **df_dict})
df_tsne.shape

In [None]:
df_tsne["cluster"] = cluster

In [None]:
plt.rcParams['figure.figsize'] = (20.0, 10.0)
sns.scatterplot(data=df_tsne, 
            x="x", 
            y="y", 
            hue="cluster",
            palette="icefire"       
            )

In [None]:
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def plotInteractive(df, column, dims=2):
  data = []
  for value in df[column].unique():
    if dims == 2:
      args ={"x":df.loc[df[column]==value,"x"], "y":df.loc[df[column]==value,"y"]}
      trace = go.Scatter(
                      mode = "markers",
                      name = "Cluster " + str(value),
                      #marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                      text = None,
                      **args)
    else:
      args = {"x":df.loc[df[column]==value,"x"], "y":df.loc[df[column]==value,"y"], "z":df.loc[df[column]==value,"z"]}
      trace = go.Scatter3d(
                      mode = "markers",
                      name = "Cluster " + str(value),
                      #marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                      text = None,
                      **args)
    
    
    data.append(trace)

  title = "Visualizing Clusters in" + str(dims) + " Dimensions Using T-SNE"

  args = {"xaxis": dict(title= 'x',ticklen= 5,zeroline= False), "yaxis":dict(title= 'y',ticklen= 5,zeroline= False)}
  layout = dict(title = title,
                **args)

  fig = dict(data = data, layout = layout)

  iplot(fig)

In [None]:
column = "cluster"
dims = 2
plotInteractive(df_tsne, column, dims)

In [None]:
column = "cluster"
dims = 3
plotInteractive(df_tsne, column, dims)