In [1]:
# Import libraries

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Location of the INSEE data CSV (hosted on S3)
insee_url = 'https://medical-deserts-project.s3.eu-north-1.amazonaws.com/insee_clean.csv'

# Download and parse the file into the raw DataFrame; the trailing expression
# displays its (rows, columns).
insee_df_original = pd.read_csv(
    insee_url,
    sep=',',
    encoding='utf-8',
)
insee_df_original.shape

(38590, 90)

In [10]:
# Work on an independent (deep) copy so the raw frame stays untouched.
insee_df = insee_df_original.copy(deep=True)

In [11]:
# Derive the working frame from the raw frame rather than from `insee_df`
# itself, so that re-running this cell yields the same result instead of
# failing on columns that were already removed or renamed.
apl_target_col = "APL aux médecins généralistes (sans borne d'âge)"
apl_age_capped_cols = [
    "APL aux médecins généralistes de 65 ans et moins",
    "APL aux médecins généralistes de 62 ans et moins",
]

# Drop the age-capped APL variants, move the remaining APL column to the last
# position and shorten its name to "APL" (the prediction target).
insee_df = insee_df_original.drop(columns=apl_age_capped_cols)
insee_df = insee_df[
    [col for col in insee_df.columns if col != apl_target_col] + [apl_target_col]
].rename(columns={apl_target_col: "APL"})
insee_df.shape

(38590, 88)

In [12]:
# Remove fully duplicated rows (first occurrence is kept).
insee_df = insee_df.drop_duplicates()

In [13]:
# Columns excluded from the modelling frame
to_drop = [
    "Nb pharmaciens Libéraux BV",
    "Nb Entreprises Secteur Services",
    "Nb Entreprises Secteur Commerce",
    "Nb Ménages",
    "Nb Résidences Principales",
    "Nb Occupants Résidence Principale",
    "Nb Création Commerces",
    "Nb Création Enteprises",
    "PIB Régionnal",
    "Nb de Commerce",
    "Nb Santé, action sociale",
    "Population en 2014 (princ)",
    "Pop 60-74 ans en 2014 (princ)",
    "Pop 75-89 ans en 2014 (princ)",
    "Nb Logement Secondaire et Occasionnel",
]

# Discard them from the working frame
insee_df = insee_df.drop(labels=to_drop, axis="columns")

In [14]:
# Display (rows, columns) of the working frame at this point
insee_df.shape

(34760, 73)

## RF avec les meilleurs hyperparamètres sur toutes les features 

In [15]:
# Separate the target column "APL" from the predictors
target_name = "APL"
y = insee_df[target_name]
X = insee_df.drop(columns=target_name)

In [16]:
# Classify every column of X from the text of its dtype: names containing
# 'float' or 'int' are numeric, everything else is treated as categorical.
dtype_flags = [
    (column, any(marker in str(dtype) for marker in ('float', 'int')))
    for column, dtype in X.dtypes.items()
]
numeric_features = [column for column, is_numeric in dtype_flags if is_numeric]
categorical_features = [column for column, is_numeric in dtype_flags if not is_numeric]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé Horaires', 'Reg Moyenne Salaire

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [18]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): with handle_unknown='ignore', a category that was not seen
# during fit is encoded as all zeros, which is indistinguishable from the
# dropped first level.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

# Standardise the numeric columns
numeric_transformer = StandardScaler()

# Route each group of columns to its transformer; numeric outputs come first,
# followed by the encoded categorical outputs.
preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [19]:
# Learn the preprocessing on the training rows only, then apply the fitted
# transformer to the test rows.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

# Later cells read the transformed matrices under the original names
X_train, X_test = X_train_prepared, X_test_prepared

# Display the transformed training matrix
X_train

array([[-0.02231907,  0.0064759 , -0.00263147, ...,  0.        ,
         1.        ,  0.        ],
       [-0.19154912, -0.15064951, -0.09054271, ...,  0.        ,
         0.        ,  1.        ],
       [-0.202126  , -0.148864  , -0.25284038, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.14426677, -0.1274378 ,  0.95762974, ...,  0.        ,
         1.        ,  0.        ],
       [-0.16775114, -0.13100884,  0.18671581, ...,  0.        ,
         1.        ,  0.        ],
       [-0.18626068, -0.13993642, -0.3069396 , ...,  0.        ,
         1.        ,  0.        ]])

In [20]:
# Random forest with the selected hyperparameters. The seed is fixed because
# the forest otherwise draws new bootstrap samples on every fit, so the R2
# scores would change from one execution to the next.
rf = RandomForestRegressor(
    n_estimators=80,
    max_depth=14,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=0,
)

In [21]:
# Train the forest on the preprocessed training matrix
rf.fit(X_train, y_train)

In [22]:
# Report R^2 on the rows used for fitting, then on the held-out rows
train_r2 = rf.score(X_train, y_train)
print("R2 score on training set : ", train_r2)
test_r2 = rf.score(X_test, y_test)
print("R2 score on test set : ", test_r2)

R2 score on training set :  0.6724715138811237
R2 score on test set :  0.38561643343488716


In [23]:
# Rebuild the names of the columns produced by the ColumnTransformer, in output
# order, so they can be paired with the model's feature importances.
column_names = []
for name, step, features_list in preprocessor.transformers_: # loop over steps of ColumnTransformer
    if len(features_list) == 0 or (isinstance(step, str) and step == 'drop'):
        # An empty selection or a dropped group contributes no output column;
        # the transformer of an empty selection is not fitted, so asking it for
        # feature names would raise.
        continue
    if name == 'num' or isinstance(step, str):
        # Scaling (or 'passthrough') keeps one output column per input column
        features = features_list
    else:
        # Categorical pipeline: take the expanded names from the OneHotEncoder
        features = step.get_feature_names_out()
    column_names.extend(features) # concatenate features names

print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé Ho

In [25]:
# One row per transformed column, ordered from least to most important
feature_importance = (
    pd.DataFrame({"feature_importances": rf.feature_importances_}, index=column_names)
    .sort_values("feature_importances")
)

In [26]:
# Horizontal bar chart of the random-forest feature importances
fig = px.bar(feature_importance, orientation='h')
fig.update_layout(
    height=2000,
    width=2500,
    showlegend=False,
    margin={'l': 50},  # left margin, to avoid cropping of column names
)
fig.show()

## RF avec les best features de claudine 

In [55]:
# Hold-out split for the model restricted to Claudine's best features.
# NOTE(review): the feature list printed by the next cell is much shorter than
# in the first section, so X was presumably re-bound to that feature subset in
# a cell that is not part of this notebook. TODO: restore that selection here,
# otherwise Restart & Run All silently reuses the full X.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [57]:
# Sort the columns of X into numeric / categorical buckets based on the text of
# their dtype ('float' or 'int' in the name means numeric).
numeric_features, categorical_features = [], []
for column, dtype in X.dtypes.items():
    dtype_name = str(dtype)
    if 'float' in dtype_name or 'int' in dtype_name:
        bucket = numeric_features
    else:
        bucket = categorical_features
    bucket.append(column)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Dynamique Entrepreneuriale', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb Log Vacants', 'Moyenne Revenus Fiscaux Départementaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé Horaires', 'Valeur ajoutée régionale', 'Nb Hotel', 'Capacité Hotel', 'Nb Camping', 'Taux Propriété', 'Dynamique Démographique INSEE', 'Capacité Fisc', 'Moyenne Revnus fiscaux', 'Nb Education, santé, action sociale', 'Score Croissance Entrepreneuriale', 'latitude', 'longitude', 'Pop 15-29 ans en 2014 (princ)', 'Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl)', 'Pop 15 ans ou plus Ouvriers en 2014 (compl)', 'Pop 15 ans ou plus Retraités  en 2014 (compl)']
Found categorica

In [58]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): with handle_unknown='ignore', a category that was not seen
# during fit is encoded as all zeros, which is indistinguishable from the
# dropped first level.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

# Standardise the numeric columns
numeric_transformer = StandardScaler()

# Rebuild the ColumnTransformer from the feature lists detected for the
# current X.
preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [59]:
# Split the current X / y again (80% train, 20% test) with the same fixed
# seed, so X_train / X_test hold the untransformed rows before preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [60]:
# Fit the preprocessing on the training rows, reuse it unchanged on the test
# rows, and keep the results under the names the following cells expect.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)
X_train, X_test = X_train_scaled, X_test_scaled

# Display the transformed training matrix
X_train

array([[-0.17436763, -0.42047067,  1.11545852, ..., -0.34755044,
        -0.23230742, -0.19290351],
       [-0.16327625,  1.7950183 , -1.1616317 , ..., -0.38169628,
        -0.23324853, -0.20903099],
       [-0.15060039,  1.7950183 , -1.1616317 , ..., -0.0060921 ,
        -0.18619329, -0.16871228],
       ...,
       [-0.16169177, -0.65681283, -0.40260163, ..., -0.38169628,
        -0.16643008, -0.15470683],
       [-0.17119867, -0.6454334 ,  1.11545852, ..., -0.38169628,
        -0.22948411, -0.21454829],
       [-0.15218487,  1.7950183 , -1.1616317 , ..., -0.51827962,
        -0.15984235, -0.18781062]])

In [62]:
# Instantiate RandomForestRegressor. The seed is fixed because the forest
# otherwise draws new bootstrap samples on every fit, which is why repeated
# runs of this section reported different R2 scores.
rf = RandomForestRegressor(
    n_estimators=120,
    max_depth=14,
    min_samples_leaf=2,
    random_state=0,
)

In [63]:
# Train the forest on the preprocessed training matrix
rf.fit(X_train, y_train)

In [64]:
# Report R^2 for the training rows first, then for the held-out rows
for score_label, (X_part, y_part) in {
    "R2 score on training set : ": (X_train, y_train),
    "R2 score on test set : ": (X_test, y_test),
}.items():
    print(score_label, rf.score(X_part, y_part))

R2 score on training set :  0.6795696345202598
R2 score on test set :  0.4141589107435919


R2 score on training set :  0.9129209024538553 


R2 score on test set :  0.4037448584655441