In [1]:
# mount drive 
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# import 
import numpy as np
from scipy.stats import norm
import pandas as pd

import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.metrics import confusion_matrix, accuracy_score

from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

pio.templates.default = "plotly_white"


  import pandas.util.testing as tm


In [3]:
# loading data
data = pd.read_table("/content/drive/My Drive/Colab Notebooks/project stat/very_final_table_paper.csv", sep=";", header=0)

In [4]:
# transform drugs, diagnostic, gender to a binary features

# 1 : if drugs 1 otherwise 0
def create_drugs_col(entry):
  if entry == "['-1']":
    return 0

  return 1


# 1 : if diagnostic 1 otherwise 0
def create_diagnostic_col(entry):
  if entry == "['Aucun']":
    return 0

  return 1

data["take_drugs"] = data["drugs"].apply(create_drugs_col)
data["has_symptom"] = data["diagnostic"].apply(create_diagnostic_col)
data["sex"] = data["gender"].map({"female": 0, "male": 1})

In [5]:
clusters = {
  "cluster_0": [
    "FEATURE_maximum_value_ML_opened_eyes",
    "FEATURE_maximum_value_AP_opened_eyes",
    "FEATURE_maximum_value_Radius_opened_eyes",
    "FEATURE_mean_velocity_AP_opened_eyes",
    "FEATURE_mean_velocity_ML_AND_AP_opened_eyes",
    "FEATURE_mean_distance_ML_opened_eyes",
    "FEATURE_mean_distance_AP_opened_eyes",
    "FEATURE_mean_distance_Radius_opened_eyes",
    "FEATURE_RMS_ML_opened_eyes",
    "FEATURE_RMS_AP_opened_eyes",
    "FEATURE_RMS_Radius_opened_eyes",
    "FEATURE_amplitude_ML_opened_eyes",
    "FEATURE_amplitude_AP_opened_eyes",
    "FEATURE_amplitude_ML_AND_AP_opened_eyes",
    "FEATURE_sway_length_AP_opened_eyes",
    "FEATURE_sway_length_ML_AND_AP_opened_eyes",
    "FEATURE_Quotient_both_direction_ML_AND_AP_opened_eyes",
    "FEATURE_planar_deviation_ML_AND_AP_opened_eyes",
    "FEATURE_peak_velocity_all_SPD_AP_opened_eyes",
    "FEATURE_peak_velocity_pos_SPD_AP_opened_eyes",
    "FEATURE_peak_velocity_neg_SPD_AP_opened_eyes",
    "FEATURE_mean_distance_peak_Sway_Density_opened_eyes",
    "FEATURE_phase_plane_parameters_AP_opened_eyes",
    "FEATURE_critical_time_Diffusion_AP_opened_eyes",
    "FEATURE_maximum_value_AP_closed_eyes",
    "FEATURE_mean_distance_AP_closed_eyes",
    "FEATURE_mean_distance_Radius_closed_eyes",
    "FEATURE_RMS_AP_closed_eyes",
    "FEATURE_RMS_Radius_closed_eyes",
    "FEATURE_amplitude_AP_closed_eyes",
    "FEATURE_amplitude_ML_AND_AP_closed_eyes",
    "FEATURE_Quotient_both_direction_ML_AND_AP_closed_eyes",
    "FEATURE_planar_deviation_ML_AND_AP_closed_eyes",
    "FEATURE_mean_distance_peak_Sway_Density_closed_eyes",
    "FEATURE_critical_time_Diffusion_ML_closed_eyes",
    "FEATURE_long_time_diffusion_Diffusion_AP_closed_eyes",
    "FEATURE_critical_time_Diffusion_AP_closed_eyes"
  ],
  "cluster_1": [
    "FEATURE_frequency_dispersion_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_frequency_dispersion_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_principal_sway_direction_ML_AND_AP_opened_eyes",
    "FEATURE_long_time_diffusion_Diffusion_ML_opened_eyes",
    "FEATURE_long_time_diffusion_Diffusion_AP_opened_eyes",
    "FEATURE_long_time_scaling_Diffusion_AP_opened_eyes",
    "FEATURE_zero_crossing_SPD_ML_closed_eyes",
    "FEATURE_frequency_dispersion_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_frequency_dispersion_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_principal_sway_direction_ML_AND_AP_closed_eyes",
    "FEATURE_long_time_diffusion_Diffusion_ML_closed_eyes",
    "FEATURE_long_time_scaling_Diffusion_ML_closed_eyes",
    "FEATURE_long_time_scaling_Diffusion_AP_closed_eyes"
  ],
  "cluster_2": [
    "FEATURE_frequency_mode_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_total_power_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_total_power_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_energy_content_0_05_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_energy_content_0_05_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_energy_content_05_2_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_energy_content_05_2_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_energy_content_2_inf_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_energy_content_2_inf_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_frequency_quotient_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_frequency_quotient_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_confidence_ellipse_area_ML_AND_AP_opened_eyes",
    "FEATURE_mean_velocity_ML_opened_eyes",
    "FEATURE_sway_length_ML_opened_eyes",
    "FEATURE_Coefficient_sway_direction_ML_AND_AP_opened_eyes",
    "FEATURE_peak_velocity_all_SPD_ML_opened_eyes",
    "FEATURE_peak_velocity_pos_SPD_ML_opened_eyes",
    "FEATURE_peak_velocity_neg_SPD_ML_opened_eyes",
    "FEATURE_sway_area_per_second_ML_AND_AP_opened_eyes",
    "FEATURE_phase_plane_parameters_ML_opened_eyes",
    "FEATURE_short_time_diffusion_Diffusion_ML_opened_eyes",
    "FEATURE_critical_time_Diffusion_ML_opened_eyes",
    "FEATURE_critical_displacement_Diffusion_ML_opened_eyes",
    "FEATURE_short_time_diffusion_Diffusion_AP_opened_eyes",
    "FEATURE_critical_displacement_Diffusion_AP_opened_eyes",
    "FEATURE_frequency_mode_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_total_power_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_total_power_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_energy_content_0_05_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_energy_content_0_05_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_energy_content_05_2_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_energy_content_05_2_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_energy_content_2_inf_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_energy_content_2_inf_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_frequency_quotient_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_frequency_quotient_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_confidence_ellipse_area_ML_AND_AP_closed_eyes",
    "FEATURE_maximum_value_ML_closed_eyes",
    "FEATURE_maximum_value_Radius_closed_eyes",
    "FEATURE_mean_velocity_ML_closed_eyes",
    "FEATURE_mean_velocity_AP_closed_eyes",
    "FEATURE_mean_velocity_ML_AND_AP_closed_eyes",
    "FEATURE_mean_distance_ML_closed_eyes",
    "FEATURE_RMS_ML_closed_eyes",
    "FEATURE_amplitude_ML_closed_eyes",
    "FEATURE_sway_length_ML_closed_eyes",
    "FEATURE_sway_length_AP_closed_eyes",
    "FEATURE_sway_length_ML_AND_AP_closed_eyes",
    "FEATURE_Coefficient_sway_direction_ML_AND_AP_closed_eyes",
    "FEATURE_peak_velocity_all_SPD_ML_closed_eyes",
    "FEATURE_peak_velocity_all_SPD_AP_closed_eyes",
    "FEATURE_peak_velocity_pos_SPD_ML_closed_eyes",
    "FEATURE_peak_velocity_pos_SPD_AP_closed_eyes",
    "FEATURE_peak_velocity_neg_SPD_ML_closed_eyes",
    "FEATURE_peak_velocity_neg_SPD_AP_closed_eyes",
    "FEATURE_sway_area_per_second_ML_AND_AP_closed_eyes",
    "FEATURE_phase_plane_parameters_ML_closed_eyes",
    "FEATURE_phase_plane_parameters_AP_closed_eyes",
    "FEATURE_short_time_diffusion_Diffusion_ML_closed_eyes",
    "FEATURE_critical_displacement_Diffusion_ML_closed_eyes",
    "FEATURE_short_time_diffusion_Diffusion_AP_closed_eyes",
    "FEATURE_critical_displacement_Diffusion_AP_closed_eyes"
  ],
  "cluster_3": [
    "FEATURE_zero_crossing_SPD_ML_opened_eyes",
    "FEATURE_zero_crossing_SPD_AP_opened_eyes",
    "FEATURE_frequency_mode_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_power_frequency_50_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_power_frequency_50_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_power_frequency_95_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_power_frequency_95_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_centroid_frequency_Power_Spectrum_Density_ML_opened_eyes",
    "FEATURE_centroid_frequency_Power_Spectrum_Density_AP_opened_eyes",
    "FEATURE_mean_peak_Sway_Density_opened_eyes",
    "FEATURE_length_over_area_ML_AND_AP_opened_eyes",
    "FEATURE_mean_frequency_ML_opened_eyes",
    "FEATURE_mean_frequency_AP_opened_eyes",
    "FEATURE_mean_frequency_ML_AND_AP_opened_eyes",
    "FEATURE_fractal_dimension_pd_ML_AND_AP_opened_eyes",
    "FEATURE_fractal_dimension_cc_ML_AND_AP_opened_eyes",
    "FEATURE_fractal_dimension_ce_ML_AND_AP_opened_eyes",
    "FEATURE_long_time_scaling_Diffusion_ML_opened_eyes",
    "FEATURE_zero_crossing_SPD_AP_closed_eyes",
    "FEATURE_frequency_mode_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_power_frequency_50_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_power_frequency_50_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_power_frequency_95_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_power_frequency_95_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_centroid_frequency_Power_Spectrum_Density_ML_closed_eyes",
    "FEATURE_centroid_frequency_Power_Spectrum_Density_AP_closed_eyes",
    "FEATURE_mean_peak_Sway_Density_closed_eyes",
    "FEATURE_length_over_area_ML_AND_AP_closed_eyes",
    "FEATURE_mean_frequency_ML_closed_eyes",
    "FEATURE_mean_frequency_AP_closed_eyes",
    "FEATURE_mean_frequency_ML_AND_AP_closed_eyes",
    "FEATURE_fractal_dimension_pd_ML_AND_AP_closed_eyes",
    "FEATURE_fractal_dimension_cc_ML_AND_AP_closed_eyes",
    "FEATURE_fractal_dimension_ce_ML_AND_AP_closed_eyes"
  ]
}

In [11]:
for cluster_name, cluster in clusters.items():
  print("{0}: {1}".format(cluster_name, len(cluster)))

cluster_0: 37
cluster_1: 13
cluster_2: 62
cluster_3: 34


In [6]:
clusters_vif_scores = dict()

for cluster_name, cols in clusters.items():
  # select columns from data
  X = data[cols]
  
  # create dataframe that will hold
  # features name and corresponding vif_score
  vif_data = pd.DataFrame()
  vif_data["features"] = cols
  vif_data["VIF"] = [VIF(X.values, i)
                          for i in range(len(X.columns))]

  # save a copy of this dataframe
  clusters_vif_scores[cluster_name] = vif_data.copy()


divide by zero encountered in double_scalars



In [7]:
for cluster_name in clusters_vif_scores.keys():
  # sort columns asc by vif score
  clusters_vif_scores[cluster_name].sort_values(by="VIF", inplace=True)

In [8]:
for cluster_name in clusters_vif_scores.keys():
  print("{0}: {1:.2f}".format(cluster_name, clusters_vif_scores[cluster_name]["VIF"].mean()))

cluster_0: 243440539519425.62
cluster_1: 43.37
cluster_2: inf
cluster_3: 1149.02


In [14]:
# keep variables whose threshold is less than
threshold = 100

for cluster_name in clusters_vif_scores.keys():
  cluster = clusters_vif_scores[cluster_name]

  cluster = cluster[cluster["VIF"] <= threshold]

  clusters_vif_scores[cluster_name] = cluster


In [16]:
# count the nb of selected features
nb_selected_features = 0
for cluster in clusters_vif_scores.values():
  nb_selected_features += len(cluster)

nb_selected_features

45

In [17]:
# how much feature were selected in each cluster
final_clusters_proportions = pd.DataFrame(columns=["cluster", "proportion"])


for cluster_name in clusters.keys():
  # compute proportion
  selection_proportion = len(clusters_vif_scores[cluster_name]) / len(clusters[cluster_name])
  
  final_clusters_proportions = final_clusters_proportions.append({'cluster': cluster_name, "proportion": selection_proportion}, ignore_index=True)

In [18]:
fig = px.bar(final_clusters_proportions, x="cluster", y="proportion", title="Selected proportion in each cluster")

fig.update_layout(
    yaxis=dict(
        tickformat=',.0%',
        range=[0, 1]
    )
)
fig.show()


In [19]:
# proportion of each cluster in the final select 
final_clusters_proportions = pd.DataFrame(columns=["cluster", "proportion"])

# compute total nb of featues
total_nb_featues = 0
for cluster_name in clusters.keys():
  total_nb_featues += len(clusters_vif_scores[cluster_name])

for cluster_name in clusters.keys():
  # compute proportion
  proportion = len(clusters_vif_scores[cluster_name]) / total_nb_featues
  
  final_clusters_proportions = final_clusters_proportions.append({'cluster': cluster_name, "proportion": proportion}, ignore_index=True)


In [20]:
# in the final bucket of variables, here is the proportion of each cluster
fig = px.pie(final_clusters_proportions, names="cluster", values="proportion", title="Proportion of each cluster after VIF selection")

fig.show()

In [21]:
# final set of variables
arr_final_selected_features = []

for cluster in clusters_vif_scores.values():
  for feat in cluster["features"].values:
    arr_final_selected_features.append(feat)

In [22]:
arr_final_selected_features

['FEATURE_long_time_diffusion_Diffusion_AP_closed_eyes',
 'FEATURE_critical_time_Diffusion_AP_opened_eyes',
 'FEATURE_critical_time_Diffusion_ML_closed_eyes',
 'FEATURE_critical_time_Diffusion_AP_closed_eyes',
 'FEATURE_Quotient_both_direction_ML_AND_AP_closed_eyes',
 'FEATURE_Quotient_both_direction_ML_AND_AP_opened_eyes',
 'FEATURE_mean_distance_peak_Sway_Density_closed_eyes',
 'FEATURE_mean_distance_peak_Sway_Density_opened_eyes',
 'FEATURE_long_time_diffusion_Diffusion_ML_opened_eyes',
 'FEATURE_long_time_diffusion_Diffusion_ML_closed_eyes',
 'FEATURE_long_time_scaling_Diffusion_ML_closed_eyes',
 'FEATURE_long_time_diffusion_Diffusion_AP_opened_eyes',
 'FEATURE_long_time_scaling_Diffusion_AP_closed_eyes',
 'FEATURE_principal_sway_direction_ML_AND_AP_closed_eyes',
 'FEATURE_principal_sway_direction_ML_AND_AP_opened_eyes',
 'FEATURE_long_time_scaling_Diffusion_AP_opened_eyes',
 'FEATURE_zero_crossing_SPD_ML_closed_eyes',
 'FEATURE_frequency_quotient_Power_Spectrum_Density_ML_opened_e