In [2]:
# mount drive 
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# import 
import numpy as np
from scipy.stats import norm
import pandas as pd

import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.metrics import confusion_matrix, accuracy_score

from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

pio.templates.default = "plotly_white"


  import pandas.util.testing as tm


In [4]:
# loading data
data = pd.read_table("/content/drive/My Drive/Colab Notebooks/project stat/very_final_table_paper.csv", sep=";", header=0)

In [5]:
# transform drugs, diagnostic, gender to a binary features

# 1 : if drugs 1 otherwise 0
def create_drugs_col(entry):
  if entry == "['-1']":
    return 0

  return 1


# 1 : if diagnostic 1 otherwise 0
def create_diagnostic_col(entry):
  if entry == "['Aucun']":
    return 0

  return 1

data["take_drugs"] = data["drugs"].apply(create_drugs_col)
data["has_symptom"] = data["diagnostic"].apply(create_diagnostic_col)
data["sex"] = data["gender"].map({"female": 0, "male": 1})

In [6]:
# features by cluster
patient = [
           "height",
           "weight",
           "sex",
           "age",
           "take_drugs",
           "has_symptom"
]


position = [
            "FEATURE_confidence_ellipse_area_ML_AND_AP_opened_eyes",
            "FEATURE_maximum_value_ML_opened_eyes","FEATURE_maximum_value_AP_opened_eyes",
            "FEATURE_maximum_value_Radius_opened_eyes","FEATURE_mean_distance_ML_opened_eyes",
            "FEATURE_mean_distance_AP_opened_eyes",
            "FEATURE_mean_distance_Radius_opened_eyes",
            "FEATURE_RMS_ML_opened_eyes",
            "FEATURE_RMS_AP_opened_eyes",
            "FEATURE_RMS_Radius_opened_eyes",
            "FEATURE_amplitude_ML_opened_eyes",
            "FEATURE_amplitude_AP_opened_eyes",
            "FEATURE_amplitude_ML_AND_AP_opened_eyes",
            "FEATURE_sway_length_ML_opened_eyes",
            "FEATURE_sway_length_AP_opened_eyes",
            "FEATURE_sway_length_ML_AND_AP_opened_eyes",
            "FEATURE_Coefficient_sway_direction_ML_AND_AP_opened_eyes",
                      
            'FEATURE_maximum_value_AP_closed_eyes',
            'FEATURE_maximum_value_Radius_closed_eyes', 
            'FEATURE_mean_distance_ML_closed_eyes', 
            'FEATURE_mean_distance_AP_closed_eyes', 
            'FEATURE_mean_distance_Radius_closed_eyes', 
            'FEATURE_RMS_ML_closed_eyes', 
            'FEATURE_RMS_AP_closed_eyes', 
            'FEATURE_RMS_Radius_closed_eyes',
            'FEATURE_amplitude_ML_closed_eyes', 
            'FEATURE_amplitude_AP_closed_eyes', 
            'FEATURE_amplitude_ML_AND_AP_closed_eyes', 
            'FEATURE_sway_length_ML_closed_eyes', 
            'FEATURE_sway_length_AP_closed_eyes', 
            'FEATURE_sway_length_ML_AND_AP_closed_eyes', 
            'FEATURE_length_over_area_ML_AND_AP_closed_eyes', 
            'FEATURE_fractal_dimension_pd_ML_AND_AP_closed_eyes',
          
            'FEATURE_fractal_dimension_pd_ML_AND_AP_opened_eyes',
            'FEATURE_length_over_area_ML_AND_AP_opened_eyes',
            'FEATURE_maximum_value_ML_closed_eyes',
            'FEATURE_confidence_ellipse_area_ML_AND_AP_closed_eyes',
            'FEATURE_length_over_area_ML_AND_AP_opened_eyes'
            ]

dynamic = [
        
           "FEATURE_zero_crossing_SPD_ML_opened_eyes",
           "FEATURE_zero_crossing_SPD_AP_opened_eyes",
           "FEATURE_principal_sway_direction_ML_AND_AP_opened_eyes",
           "FEATURE_mean_velocity_ML_opened_eyes",
           "FEATURE_mean_velocity_AP_opened_eyes",
           "FEATURE_mean_velocity_ML_AND_AP_opened_eyes",
           "FEATURE_Coefficient_sway_direction_ML_AND_AP_opened_eyes",
           "FEATURE_planar_deviation_ML_AND_AP_opened_eyes",
           "FEATURE_peak_velocity_all_SPD_ML_opened_eyes",
           "FEATURE_peak_velocity_all_SPD_AP_opened_eyes",
           "FEATURE_peak_velocity_pos_SPD_ML_opened_eyes",
           "FEATURE_peak_velocity_pos_SPD_AP_opened_eyes",
 
           'FEATURE_mean_velocity_ML_closed_eyes', 
           'FEATURE_mean_velocity_AP_closed_eyes', 
           'FEATURE_mean_velocity_ML_AND_AP_closed_eyes', 
           'FEATURE_Coefficient_sway_direction_ML_AND_AP_closed_eyes', 
           'FEATURE_Quotient_both_direction_ML_AND_AP_closed_eyes', 
           'FEATURE_planar_deviation_ML_AND_AP_closed_eyes', 
           'FEATURE_peak_velocity_all_SPD_ML_closed_eyes', 
           'FEATURE_peak_velocity_all_SPD_AP_closed_eyes', 
           'FEATURE_peak_velocity_pos_SPD_ML_closed_eyes', 
           'FEATURE_peak_velocity_pos_SPD_AP_closed_eyes', 
           'FEATURE_peak_velocity_neg_SPD_ML_closed_eyes', 
           'FEATURE_peak_velocity_neg_SPD_AP_closed_eyes', 
           'FEATURE_mean_peak_Sway_Density_closed_eyes', 
           'FEATURE_mean_distance_peak_Sway_Density_closed_eyes', 
           'FEATURE_sway_area_per_second_ML_AND_AP_closed_eyes', 
           'FEATURE_phase_plane_parameters_ML_closed_eyes', 
           'FEATURE_phase_plane_parameters_AP_closed_eyes', 
           'FEATURE_fractal_dimension_cc_ML_AND_AP_closed_eyes', 
           'FEATURE_fractal_dimension_ce_ML_AND_AP_closed_eyes',
      
           'FEATURE_mean_frequency_ML_closed_eyes', 
           'FEATURE_mean_frequency_AP_closed_eyes', 
           'FEATURE_mean_frequency_ML_AND_AP_closed_eyes',
        
           'FEATURE_mean_frequency_ML_opened_eyes',
           'FEATURE_mean_frequency_AP_opened_eyes',
           'FEATURE_mean_frequency_ML_AND_AP_opened_eyes',
           'FEATURE_peak_velocity_neg_SPD_ML_opened_eyes',
           'FEATURE_peak_velocity_neg_SPD_AP_opened_eyes',
           'FEATURE_fractal_dimension_cc_ML_AND_AP_opened_eyes',
           'FEATURE_fractal_dimension_ce_ML_AND_AP_opened_eyes',
           'FEATURE_mean_peak_Sway_Density_opened_eyes',
           'FEATURE_phase_plane_parameters_AP_opened_eyes',
           'FEATURE_phase_plane_parameters_ML_opened_eyes',
           'FEATURE_sway_area_per_second_ML_AND_AP_opened_eyes',
           'FEATURE_zero_crossing_SPD_AP_closed_eyes',
           'FEATURE_zero_crossing_SPD_ML_closed_eyes',
           'FEATURE_principal_sway_direction_ML_AND_AP_closed_eyes'
           ]

frequency = [
             "FEATURE_frequency_mode_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_frequency_mode_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_total_power_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_total_power_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_power_frequency_50_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_power_frequency_50_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_power_frequency_95_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_power_frequency_95_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_centroid_frequency_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_centroid_frequency_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_frequency_dispersion_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_frequency_dispersion_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_energy_content_0_05_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_energy_content_0_05_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_energy_content_05_2_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_energy_content_05_2_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_energy_content_2_inf_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_energy_content_2_inf_Power_Spectrum_Density_AP_opened_eyes",
             "FEATURE_frequency_quotient_Power_Spectrum_Density_ML_opened_eyes",
             "FEATURE_frequency_quotient_Power_Spectrum_Density_AP_opened_eyes",
        
             'FEATURE_frequency_mode_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_frequency_mode_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_power_frequency_50_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_power_frequency_50_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_power_frequency_95_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_power_frequency_95_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_centroid_frequency_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_centroid_frequency_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_frequency_dispersion_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_frequency_dispersion_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_energy_content_0_05_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_energy_content_0_05_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_energy_content_05_2_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_energy_content_05_2_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_energy_content_2_inf_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_energy_content_2_inf_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_frequency_quotient_Power_Spectrum_Density_ML_closed_eyes',
             'FEATURE_frequency_quotient_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_total_power_Power_Spectrum_Density_AP_closed_eyes',
             'FEATURE_total_power_Power_Spectrum_Density_ML_closed_eyes'
             ]

stochastic = [
     
              'FEATURE_short_time_diffusion_Diffusion_ML_closed_eyes', 
              'FEATURE_long_time_diffusion_Diffusion_ML_closed_eyes', 
              'FEATURE_critical_time_Diffusion_ML_closed_eyes', 
              'FEATURE_long_time_scaling_Diffusion_ML_closed_eyes', 
              'FEATURE_short_time_diffusion_Diffusion_AP_closed_eyes', 
              'FEATURE_long_time_diffusion_Diffusion_AP_closed_eyes', 
              'FEATURE_critical_time_Diffusion_AP_closed_eyes', 
              'FEATURE_long_time_scaling_Diffusion_AP_closed_eyes', 
              'FEATURE_critical_displacement_Diffusion_ML_closed_eyes', 
              'FEATURE_critical_displacement_Diffusion_AP_closed_eyes',
  
              'FEATURE_short_time_diffusion_Diffusion_ML_opened_eyes',
              'FEATURE_long_time_diffusion_Diffusion_ML_opened_eyes',
              'FEATURE_critical_time_Diffusion_ML_opened_eyes',
              'FEATURE_critical_displacement_Diffusion_ML_opened_eyes',
              'FEATURE_long_time_scaling_Diffusion_ML_opened_eyes',
              'FEATURE_short_time_diffusion_Diffusion_AP_opened_eyes',
              'FEATURE_long_time_diffusion_Diffusion_AP_opened_eyes',
              'FEATURE_critical_time_Diffusion_AP_opened_eyes',
              'FEATURE_critical_displacement_Diffusion_AP_opened_eyes',
              'FEATURE_long_time_scaling_Diffusion_AP_opened_eyes'
              ]

#patient = pd.Index(patient).unique()
position = pd.Index(position).unique()
dynamic = pd.Index(dynamic).unique()
frequency = pd.Index(frequency).unique()
stochastic = pd.Index(stochastic).unique()

clusters = dict(
    #patient=patient,
    position= position,
    dynamic=dynamic,
    frequency=frequency,
    stochastic=stochastic
)

In [7]:
for arr in [patient, position, dynamic, frequency, stochastic]:
  print(len(arr))

6
37
48
40
20


In [8]:
clusters_vif_scores = dict()

for cluster_name, cols in clusters.items():
  # select columns from data
  X = data[cols]
  
  # create dataframe that will hold
  # features name and corresponding vif_score
  vif_data = pd.DataFrame()
  vif_data["features"] = cols
  vif_data["VIF"] = [VIF(X.values, i)
                          for i in range(len(X.columns))]

  # save a copy of this dataframe
  clusters_vif_scores[cluster_name] = vif_data.copy()

In [9]:
for cluster_name in clusters_vif_scores.keys():
  # sort columns asc by vif score
  clusters_vif_scores[cluster_name].sort_values(by="VIF", inplace=True)

In [10]:
for cluster_name in clusters_vif_scores.keys():
  print("{0}: {1:.2f}".format(cluster_name, clusters_vif_scores[cluster_name]["VIF"].mean()))

position: 5966.43
dynamic: 3990.92
frequency: 845.45
stochastic: 13.13


In [11]:
clusters_vif_scores["position"] # 4 + 13 + 20 + 18 = 22 + 38 = 55

Unnamed: 0,features,VIF
16,FEATURE_Coefficient_sway_direction_ML_AND_AP_o...,9.779151
31,FEATURE_length_over_area_ML_AND_AP_closed_eyes,18.136142
34,FEATURE_length_over_area_ML_AND_AP_opened_eyes,20.398937
36,FEATURE_confidence_ellipse_area_ML_AND_AP_clos...,53.079569
0,FEATURE_confidence_ellipse_area_ML_AND_AP_open...,142.192481
1,FEATURE_maximum_value_ML_opened_eyes,399.0869
10,FEATURE_amplitude_ML_opened_eyes,464.75689
28,FEATURE_sway_length_ML_closed_eyes,511.004784
35,FEATURE_maximum_value_ML_closed_eyes,593.382845
2,FEATURE_maximum_value_AP_opened_eyes,665.590042


In [12]:
# apply logistic regression on each cluster

# create target variable
data["faller"] = data["total_fall_count"].apply(lambda entry: 1 if entry > 1 else 0)

In [13]:
# scoring the model
def computeScore(model, Y, X):
  treshold = 0.3
  prediction = [1 if prob > treshold else 0 for prob in  model.predict(X)]

  return accuracy_score(Y, prediction)

In [14]:
log_score_by_cluster = dict()

for cluster_name, cols in clusters.items():
  # features / target variables
  X_train = data[cols]
  Y_train = data["faller"]

  # fit model
  log_reg = sm.Logit(Y_train, X_train).fit(disp=False)
  score = computeScore(log_reg, Y_train, X_train)

  log_score_by_cluster[cluster_name] = score

In [15]:
log_score_by_cluster

{'dynamic': 0.83, 'frequency': 0.83, 'position': 0.81, 'stochastic': 0.67}

In [16]:
0.67/0.8
40/3.87
clusters["position"][:10]

Index(['FEATURE_confidence_ellipse_area_ML_AND_AP_opened_eyes',
       'FEATURE_maximum_value_ML_opened_eyes',
       'FEATURE_maximum_value_AP_opened_eyes',
       'FEATURE_maximum_value_Radius_opened_eyes',
       'FEATURE_mean_distance_ML_opened_eyes',
       'FEATURE_mean_distance_AP_opened_eyes',
       'FEATURE_mean_distance_Radius_opened_eyes',
       'FEATURE_RMS_ML_opened_eyes', 'FEATURE_RMS_AP_opened_eyes',
       'FEATURE_RMS_Radius_opened_eyes'],
      dtype='object')

In [17]:
# features / target variables
cols = clusters["position"][:10].union(clusters["dynamic"][:10]).union(clusters["frequency"][:10]).union(clusters["stochastic"][:int(10 * 0.87)])

X_train = data[cols]
Y_train = data["faller"]

# fit model
log_reg = sm.Logit(Y_train, X_train).fit(disp=False)
score = computeScore(log_reg, Y_train, X_train)
score


Maximum Likelihood optimization failed to converge. Check mle_retvals



0.8

In [18]:
cluster = clusters_vif_scores["position"]

fig = px.box(cluster, y="VIF")
fig.show()

In [19]:
for cluster_name in clusters_vif_scores.keys():
  cluster = clusters_vif_scores[cluster_name]
  length = len(cluster[cluster["VIF"] <= cluster["VIF"].mean() / 2])

  print("{0}: {1}".format(cluster_name, length))

position: 20
dynamic: 26
frequency: 28
stochastic: 9


In [20]:
col_name = "frequency"
inverse_sum = 0
for val in clusters_vif_scores[col_name]["VIF"].values:
  inverse_sum += 1 / val

inverse_sum * len(clusters_vif_scores[col_name])

35.24298194173841

In [21]:
# keep variables whose threshold is less than
threshold = 100

for cluster_name in clusters_vif_scores.keys():
  cluster = clusters_vif_scores[cluster_name]

  cluster = cluster[cluster["VIF"] <= threshold]

  clusters_vif_scores[cluster_name] = cluster


In [22]:
clusters_vif_scores["stochastic"]

Unnamed: 0,features,VIF
14,FEATURE_long_time_scaling_Diffusion_ML_opened_...,2.390279
3,FEATURE_long_time_scaling_Diffusion_ML_closed_...,2.439369
7,FEATURE_long_time_scaling_Diffusion_AP_closed_...,2.95477
16,FEATURE_long_time_diffusion_Diffusion_AP_opene...,3.7623
19,FEATURE_long_time_scaling_Diffusion_AP_opened_...,4.007305
5,FEATURE_long_time_diffusion_Diffusion_AP_close...,4.144286
12,FEATURE_critical_time_Diffusion_ML_opened_eyes,4.6159
4,FEATURE_short_time_diffusion_Diffusion_AP_clos...,5.792585
2,FEATURE_critical_time_Diffusion_ML_closed_eyes,6.454194
15,FEATURE_short_time_diffusion_Diffusion_AP_open...,8.706411


In [23]:
fig = go.Figure(data=[
                     go.Table(
                         header=dict(values=[cluster_name for cluster_name in clusters_vif_scores.keys()]),
                         cells=dict(
                                  values=[cluster["features"].values for cluster in clusters_vif_scores.values()], 
                                  font_size=7, 
                                  align='left'
                                  ))
                     
])

fig.update_layout(height=650)
fig.show()

In [24]:
# how much feature were selected in each cluster
final_clusters_proportions = pd.DataFrame(columns=["cluster", "proportion"])


for cluster_name in clusters.keys():
  # compute proportion
  selection_proportion = len(clusters_vif_scores[cluster_name]) / len(clusters[cluster_name])
  
  final_clusters_proportions = final_clusters_proportions.append({'cluster': cluster_name, "proportion": selection_proportion}, ignore_index=True)

In [25]:
fig = px.bar(final_clusters_proportions, x="cluster", y="proportion", title="Selected proportion in each cluster")

fig.update_layout(
    yaxis=dict(
        tickformat=',.0%',
        range=[0, 1]
    )
)
fig.show()


In [26]:
# proportion of each cluster in the final select 
final_clusters_proportions = pd.DataFrame(columns=["cluster", "proportion"])

# compute total nb of featues
total_nb_featues = 0
for cluster_name in clusters.keys():
  total_nb_featues += len(clusters_vif_scores[cluster_name])

for cluster_name in clusters.keys():
  # compute proportion
  proportion = len(clusters_vif_scores[cluster_name]) / total_nb_featues
  
  final_clusters_proportions = final_clusters_proportions.append({'cluster': cluster_name, "proportion": proportion}, ignore_index=True)


In [27]:
final_clusters_proportions

Unnamed: 0,cluster,proportion
0,position,0.078431
1,dynamic,0.176471
2,frequency,0.352941
3,stochastic,0.392157


In [28]:
# in the final bucket of variables, here is the proportion of each cluster
fig = px.pie(final_clusters_proportions, names="cluster", values="proportion", title="Proportion of each cluster after VIF selection")

fig.show()

In [29]:
# final set of variables
arr_final_selected_features = []

for cluster in clusters_vif_scores.values():
  for feat in cluster["features"].values:
    arr_final_selected_features.append(feat)

In [30]:
arr_final_selected_features

['FEATURE_Coefficient_sway_direction_ML_AND_AP_opened_eyes',
 'FEATURE_length_over_area_ML_AND_AP_closed_eyes',
 'FEATURE_length_over_area_ML_AND_AP_opened_eyes',
 'FEATURE_confidence_ellipse_area_ML_AND_AP_closed_eyes',
 'FEATURE_principal_sway_direction_ML_AND_AP_closed_eyes',
 'FEATURE_principal_sway_direction_ML_AND_AP_opened_eyes',
 'FEATURE_Coefficient_sway_direction_ML_AND_AP_closed_eyes',
 'FEATURE_Coefficient_sway_direction_ML_AND_AP_opened_eyes',
 'FEATURE_mean_peak_Sway_Density_closed_eyes',
 'FEATURE_Quotient_both_direction_ML_AND_AP_closed_eyes',
 'FEATURE_mean_peak_Sway_Density_opened_eyes',
 'FEATURE_sway_area_per_second_ML_AND_AP_closed_eyes',
 'FEATURE_mean_distance_peak_Sway_Density_closed_eyes',
 'FEATURE_frequency_mode_Power_Spectrum_Density_AP_closed_eyes',
 'FEATURE_frequency_mode_Power_Spectrum_Density_ML_opened_eyes',
 'FEATURE_frequency_mode_Power_Spectrum_Density_ML_closed_eyes',
 'FEATURE_frequency_mode_Power_Spectrum_Density_AP_opened_eyes',
 'FEATURE_energy

In [31]:
len(arr_final_selected_features)

51

In [32]:
clusters_vif_scores

{'dynamic':                                              features        VIF
 47  FEATURE_principal_sway_direction_ML_AND_AP_clo...   4.732976
 2   FEATURE_principal_sway_direction_ML_AND_AP_ope...   6.366988
 15  FEATURE_Coefficient_sway_direction_ML_AND_AP_c...   7.344140
 6   FEATURE_Coefficient_sway_direction_ML_AND_AP_o...   8.724558
 24         FEATURE_mean_peak_Sway_Density_closed_eyes  26.075229
 16  FEATURE_Quotient_both_direction_ML_AND_AP_clos...  50.905653
 41         FEATURE_mean_peak_Sway_Density_opened_eyes  55.531788
 26  FEATURE_sway_area_per_second_ML_AND_AP_closed_...  85.119222
 25  FEATURE_mean_distance_peak_Sway_Density_closed...  97.610208,
 'frequency':                                              features        VIF
 21  FEATURE_frequency_mode_Power_Spectrum_Density_...   8.975784
 0   FEATURE_frequency_mode_Power_Spectrum_Density_...   9.861026
 20  FEATURE_frequency_mode_Power_Spectrum_Density_...   9.994511
 1   FEATURE_frequency_mode_Power_Spectrum_Density_