In [34]:
# Import librairies

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import plotly.figure_factory as ff

In [22]:
# URL of the CSV file of INSEE data
insee_url = 'https://medical-deserts-project.s3.eu-north-1.amazonaws.com/insee_clean.csv'

# Read the CSV file from the URL into a DataFrame
insee_df = pd.read_csv(insee_url, sep = ',', encoding='utf-8')
insee_df.shape

(38590, 90)

In [None]:
#variables = pd.read_csv("https://medical-deserts.s3.eu-west-3.amazonaws.com/variables_etude.csv")
#variables.shape

In [23]:
# Remove useless columns
insee_df = insee_df.drop(["APL aux médecins généralistes de 65 ans et moins", "APL aux médecins généralistes de 62 ans et moins"], axis=1)

# APL column at the end of dataset
APL_column = insee_df.pop("APL aux médecins généralistes (sans borne d'âge)")
insee_df["APL aux médecins généralistes (sans borne d'âge)"] = APL_column


In [None]:
# Select only numerical columns for the correlation matrix
# numeric_columns = insee_df.select_dtypes(include=['float64', 'int64'])

# # Calculate the correlation matrix
# corr_matrix = numeric_columns.corr().round(2)

# fig = ff.create_annotated_heatmap(corr_matrix.values,
#                                   x=corr_matrix.columns.tolist(),
#                                   y=corr_matrix.index.tolist())

# fig.update_layout(height=2400, width=3200)

# fig.show()

In [6]:
#to_drop = ["Nb Entreprises Secteur Services", "Nb Entreprises Secteur Commerce", "Nb Ménages", "Nb Résidences Principales", "Nb Occupants Résidence Principale", "Nb Création Commerces", "Nb Création Enteprises", "PIB Régionnal", "Nb de Commerce", "Nb Santé, action sociale", "Population en 2014 (princ)", "Pop 60-74 ans en 2014 (princ)", "Pop 75-89 ans en 2014 (princ)", "Nb Logement Secondaire et Occasionnel"]

In [None]:
# Remove columns to be dropped
# numeric_columns_filtered = insee_df.select_dtypes(include=['float64', 'int64']).drop(columns=to_drop)#no_keep)

# # Re-calculate the correlation matrix
# corr_matrix_filtered = numeric_columns_filtered.corr().round(2)

# print("New correlation matrix:")
# print(corr_matrix_filtered)

# fig = ff.create_annotated_heatmap(corr_matrix_filtered.values,
#                                   x=corr_matrix_filtered.columns.tolist(),
#                                   y=corr_matrix_filtered.index.tolist())

# fig.update_layout(height=1500, width=2800)

# fig.show()

In [4]:
insee_df.shape

(38590, 88)

In [24]:
# X, y split 
X = insee_df.loc[:, insee_df.columns != "APL aux médecins généralistes (sans borne d'âge)"]
y = insee_df.loc[:, "APL aux médecins généralistes (sans borne d'âge)"]

# X = X.select_dtypes(exclude=["object"])


In [25]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb Ménages', 'Nb Résidences Principales', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Occupants Résidence Principale', 'Nb Entreprises Secteur Services', 'Nb Entreprises Secteur Commerce', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Enteprises', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Commerces', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horai

In [26]:
#categorical_transformer = OneHotEncoder(drop='first')
from sklearn.pipeline import Pipeline

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [27]:
# Train_test_split 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


In [28]:
# Fit and transform X_train
X_train = preprocessor.fit_transform(X_train)
# Apply on X_test
X_test = preprocessor.transform(X_test)

# Visualize X_std_train
X_train



array([[-0.17436763, -0.1344679 , -0.26697432, ...,  0.        ,
         0.        ,  1.        ],
       [-0.16327625, -0.13658912, -0.16271739, ...,  0.        ,
         0.        ,  1.        ],
       [-0.15060039, -0.12492244, -0.17140547, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.16169177, -0.12174062, -0.3104147 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.17119867, -0.13658912, -0.27131835, ...,  0.        ,
         1.        ,  0.        ],
       [-0.15218487, -0.13128608, -0.20615778, ...,  0.        ,
         1.        ,  0.        ]])

In [33]:
# Instanciate RandomForestRegressor
rf = RandomForestRegressor()

In [35]:
rf.fit(X_train, y_train)

In [36]:
# Print R^2 scores
print("R2 score on training set : ", rf.score(X_train, y_train))
print("R2 score on test set : ", rf.score(X_test, y_test))

R2 score on training set :  0.9243368812005699
R2 score on test set :  0.45301035320888927


In [None]:
#R2 score on training set :  0.9243368812005699
#R2 score on test set :  0.45301035320888927

In [43]:
import joblib
joblib.dump(rf, "/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl")
#joblib.load("/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl")

['/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl']

In [38]:
# Perform grid search
rf = RandomForestRegressor()

print("Grid search...")

# Grid of values to be tested
params = {
    'max_depth': [10, 12, 14],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [80, 100, 120],
    'min_samples_leaf': [1, 2, 4]
    }

gridsearch = GridSearchCV(rf, param_grid = params, cv = 3, verbose = 2)
gridsearch.fit(X_train, y_train)

print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

Grid search...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=  35.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=  35.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=  35.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  42.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  43.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  42.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=120; total time=  51.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=120; total time=  51.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=120; total time=  50.8s
[CV] END max_dept

In [None]:
#Best hyperparameters :  {'max_depth': 14, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 80}
#Best validation accuracy :  0.3914783302489721

In [39]:
print("R2 score on training set : ", gridsearch.score(X_train, y_train))
print("R2 score on test set : ", gridsearch.score(X_test, y_test))

R2 score on training set :  0.6867990401991656
R2 score on test set :  0.40804501003395977


In [None]:
# 
#R2 score on training set :  0.6867990401991656
#R2 score on test set :  0.40804501003395977

In [42]:
import joblib
joblib.dump(gridsearch, "/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl")
#joblib.load("/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl")

['/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl']

In [47]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# Print MAE
print("Mean Absolute Error on training set : ", mean_absolute_error(y_train, Y_train_pred))
#print("Mean APL on training set : ", y_train.mean())
print()
print("Mean Absolute Error on test set : ", mean_absolute_error(y_test, Y_test_pred))
#print("Mean APL on test set : ", y_test.mean())
print("Standard-deviation on test set : ", y_test.std())

Predictions on training set...
...Done.
[3.49634229 2.54819827 3.28368151 ... 4.21846917 2.98841814 3.61371424]

Predictions on test set...
...Done.
[3.48604217 3.23038568 2.67382039 ... 5.13028786 3.13233631 2.69625144]

Mean Absolute Error on training set :  0.5439231476634226

Mean Absolute Error on test set :  0.7238389607374052
Standard-deviation on test set :  1.2814183580178466


ERROR of 0.72 on the prediction, NOT GOOD MODEL of PREDICTION

In [50]:
column_names = []
for name, step, features_list in preprocessor.transformers_: # loop over steps of ColumnTransformer
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = step.get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb Ménages', 'Nb Résidences Principales', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Occupants Résidence Principale', 'Nb Entreprises Secteur Services', 'Nb Entreprises Secteur Commerce', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Enteprises', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Commerces', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep M

In [51]:
# Create a pandas DataFrame
feature_importance = pd.DataFrame(index = column_names, data = gridsearch.best_estimator_.feature_importances_, columns=["feature_importances"])
feature_importance = feature_importance.sort_values(by = 'feature_importances')

In [66]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(height=2000, width=2500, showlegend = False, margin = {'l': 50}, ) # to avoid cropping of column names
fig.show()

According to the forward selection algorithm, the following features should be kept: 
['Dynamique Entrepreneuriale', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb Log Vacants', 'Moyenne Revenus Fiscaux Départementaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé Horaires', 'Valeur ajoutée régionale', 'Nb Hotel', 'Capacité Hotel', 'Nb Camping', 'Taux Propriété', 'Dynamique Démographique INSEE', 'Capacité Fisc', 'Moyenne Revnus fiscaux', 'Nb Education, santé, action sociale', 'Score Croissance Entrepreneuriale', 'latitude', 'longitude', 'Pop 15-29 ans en 2014 (princ)', 'Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl)', 'Pop 15 ans ou plus Ouvriers en 2014 (compl)', 'Po

In [None]:
from sklearn.feature_selection import  SequentialFeatureSelector
feature_selector =  SequentialFeatureSelector(rf, n_features_to_select = 30)
feature_selector.fit(X_train, y_train)

features_list = X.columns
best_features = features_list[feature_selector.support_]
print("According to the forward selection algorithm, the following features should be kept: ")
print(best_features.to_list())

In [26]:
X_best = X.loc[:, best_features]

In [None]:
# Divide dataset Train set & Test set

print("Dividing into train and test sets...")
X_train_best, X_test_best, Y_train_best, Y_test_best = train_test_split(X_best, y, test_size=0.2, random_state=0)
print("...Done.")

# Fit and transform X_train
X_train_best = preprocessor.fit_transform(X_train_best)

# Apply on X_test
X_test_best  = preprocessor.transform(X_test_best)

# Visualize X_train
X_train_best

In [None]:
# Perform grid search
rf_bestfeatures = RandomForestRegressor()

print("Grid search...")

# Grid of values to be tested
params = {
    'max_depth': [10, 12, 14],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [80, 100, 120],
    'min_samples_leaf': [1, 2, 4]
    }

gridsearch = GridSearchCV(rf_bestfeatures, param_grid = params, cv = 3, verbose = 2)
gridsearch.fit(X_train_best, Y_train_best)

print("...Done.")

print("Best hyperparameters : ",     gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

# Print R^2 scores
print("R2 score on training set : ", rf_bestfeatures.score(X_train_best, Y_train_best))
print("R2 score on test set : ",     rf_bestfeatures.score(X_test_best, Y_test_best))

In [217]:
from sklearn.decomposition import PCA

# Instanciate PCA with 3 components
pca = PCA(n_components=3)

# Fit transform X_std_train
X_opt_train = pca.fit_transform(X_train_best)

# Apply on X_std_test
X_opt_test = pca.transform(X_test_best)

In [None]:
PC1 = X_opt_train[:, 0]
PC2 = X_opt_train[:, 1]
PC3 = X_opt_train[:, 2]

# Convert PC into a DataFrame
PC = pd.DataFrame(data=X_opt_train, columns=["PC1", "PC2", "PC3"])
# PC Head
PC.head()

In [None]:
# Use pca.explained_variance_ratio_
print("Explained Variance ration per PC: {}".format(pca.explained_variance_ratio_))
print("Total explained variance ratio: {}%".format(pca.explained_variance_ratio_.sum()))

In [None]:
# Fit the RF bestfeatures on the train set where the PCA was applied and checkout the score on the test
rf_bestfeatures.fit(X_opt_train, Y_train_best)
# Print R^2 scores
print("R2 score on training set fit on PCA: ", rf_bestfeatures.score(X_opt_train, Y_train_best))
print("R2 score on test set fit on PCA: ",     rf_bestfeatures.score(X_opt_test, Y_test_best))

In [None]:
# Import plotly.express and plotly.graph_objects
import plotly.express as px 
import plotly.graph_objects as go

# Use plotly express to plot train data
fig = px.scatter_3d(PC, x="PC1", y="PC2", z="PC3")

# Add trace with test data 
fig.add_trace(go.Scatter3d(x=X_opt_test[:, 0], 
                           y=X_opt_test[:, 1], 
                           z=X_opt_test[:, 2],
                           mode="markers",
                           name="test"
                          ))

# Render on notebook
fig.show()

In [None]:
#Example on train set coloring by SalePrice
df_plot = pd.DataFrame(X_opt_train)
df_plot.columns = ['PC1', 'PC2', 'PC3']
df_plot["APL aux médecins généralistes (sans borne d'âge)"] = list(Y_train_best)

fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3', color="APL aux médecins généralistes (sans borne d'âge)")
fig.show()