In [11]:
# Import librairies

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

import plotly.express as px

In [12]:
# URL of the CSV file of INSEE data
insee_url = 'https://medical-deserts-project.s3.eu-north-1.amazonaws.com/insee_clean.csv'

# Read the CSV file from the URL into a DataFrame
insee_df = pd.read_csv(insee_url, sep = ',', encoding='utf-8')

In [13]:
pd.set_option("display.max_columns", None)

In [14]:
# Remove useless columns
insee_df = insee_df.drop(["APL aux médecins généralistes de 65 ans et moins", "APL aux médecins généralistes de 62 ans et moins"], axis=1)

# APL column at the end of dataset
APL_column = insee_df.pop("APL aux médecins généralistes (sans borne d'âge)")
insee_df["APL aux médecins généralistes (sans borne d'âge)"] = APL_column

# Rename APL column
insee_df.rename(columns={"APL aux médecins généralistes (sans borne d'âge)": "APL"}, inplace=True)


In [15]:
print('Number of rows :', insee_df.shape[0])
print('Number of columns :', insee_df.shape[1])
print()

# Show first rows of the dataset
print('First rows of the dataset :')
display(insee_df.head())
print()

# Dataset statistics
print('Basics statistics :')
summary_stats_all = insee_df.describe(include='all')
display(summary_stats_all)
print()

# Missing values percentage
missing_percentages = (insee_df.isna().mean() * 100).round(2)
print('Percentage of missing values: ')
print(missing_percentages)

Number of rows : 38576
Number of columns : 90

First rows of the dataset :


Unnamed: 0,Dynamique Entrepreneuriale,Dynamique Entrepreneuriale Service et Commerce,Synergie Médicale COMMUNE,Indice Synergie Médicale,SEG Croissance POP,Nb Omnipraticiens BV,Nb Infirmiers Libéraux BV,Nb dentistes Libéraux BV,Nb pharmaciens Libéraux BV,Densité Médicale BV,Score équipement de santé BV,Indice Démographique,Indice Ménages,Nb Ménages,Nb Résidences Principales,Nb propriétaire,Nb Logement,Nb Résidences Secondaires,Nb Log Vacants,Nb Occupants Résidence Principale,Nb Entreprises Secteur Services,Nb Entreprises Secteur Commerce,Nb Entreprises Secteur Construction,Nb Entreprises Secteur Industrie,Nb Création Enteprises,Nb Création Industrielles,Nb Création Construction,Nb Création Commerces,Nb Création Services,Moyenne Revenus Fiscaux Départementaux,Moyenne Revenus Fiscaux Régionaux,Dep Moyenne Salaires Horaires,Dep Moyenne Salaires Cadre Horaires,Dep Moyenne Salaires Prof Intermédiaire Horaires,Dep Moyenne Salaires Employé Horaires,Dep Moyenne Salaires Ouvrié Horaires,Reg Moyenne Salaires Horaires,Reg Moyenne Salaires Cadre Horaires,Reg Moyenne Salaires Prof Intermédiaire Horaires,Reg Moyenne Salaires Employé Horaires,Reg Moyenne Salaires Ouvrié Horaires,Valeur ajoutée régionale,Score Urbanité,Nb Atifs,Nb Actifs Salariés,Nb Actifs Non Salariés,Nb Logement Secondaire et Occasionnel,Nb Hotel,Capacité Hotel,Nb Camping,Capacité Camping,Dynamique Démographique BV,Taux Propriété,Dynamique Démographique INSEE,Capacité Fisc,Moyenne Revnus fiscaux,"Nb Education, santé, action sociale",Nb Services personnels et domestiques,"Nb Santé, action sociale",Nb Industries des biens intermédiaires,Nb de Commerce,Nb de Services aux particuliers,"Nb institution de Education, santé, action sociale, administration",PIB Régionnal,Score Croissance Entrepreneuriale,Environnement Démographique,Fidélité,Seg Cap Fiscale,Seg Dyn Entre,DYN SetC,latitude,longitude,Population en 2014 (princ),Pop 0-14 ans en 2014 (princ),Pop 15-29 ans en 2014 (princ),Pop 30-44 ans en 2014 (princ),Pop 45-59 ans en 2014 (princ),Pop 60-74 ans en 2014 (princ),Pop 75-89 ans en 2014 (princ),Pop 15 ans ou plus en 2014 (compl),Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl),"Pop 15 ans ou plus Artisans, Comm., Chefs entr. en 2014 (compl)","Pop 15 ans ou plus Cadres, Prof. intel. sup. en 2014 (compl)",Pop 15 ans ou plus Prof. intermédiaires en 2014 (compl),Pop 15 ans ou plus Employés en 2014 (compl),Pop 15 ans ou plus Ouvriers en 2014 (compl),Pop 15 ans ou plus Retraités en 2014 (compl),Pop 15 ans ou plus Autres en 2014 (compl),taux chômage(15-64 ans),APL
0,57.0,23.0,114,114.56713,en croissance démographique,9,14,7,7,0.09286,4,44.19769,37.22029,247,248,196,289,32,9,728,7.0,11.0,2.0,2.0,4.0,0.0,2.0,1.0,1.0,12509,10458,11.410345,21.963793,12.558621,8.743103,9.268966,11.873022,21.78783,12.704057,8.783164,9.30142,86957.45836,0.0,295.0,254.0,41.0,32.0,0.0,0.0,0.0,0.0,1.Accroissement par excédent naturel et migrat...,67,-1,117,11483.5,3.0,1.0,0.0,9364,9350,3372,15105,173681,0.01585,Bassin Industriel en croissance démographique,Pop Sédentaire,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,46.153721,4.92585,767.0,161.0,102.0,132.0,189.0,125.0,53.0,605.0,15.0,20.0,75.0,95.0,100.0,125.0,145.0,30.0,8.776596,2.293
1,45.0,4.0,143,143.71141,en croissance démographique,31,36,18,18,0.099229,4,10.18071,10.09619,67,67,61,142,71,4,168,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,12509,10458,11.410345,21.963793,12.558621,8.743103,9.268966,11.873022,21.78783,12.704057,8.783164,9.30142,86957.45836,0.0,57.0,49.0,8.0,71.0,0.0,0.0,0.0,0.0,1.Accroissement par excédent naturel et migrat...,42,0,110,11483.5,0.0,0.0,0.0,9364,9350,3372,15105,173681,0.00173,Bassin Résidentiel en croissance démographique,Pop Sédentaire,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,46.009606,5.428088,239.0,54.0,28.0,69.0,36.0,33.0,17.0,192.0,0.0,20.0,15.0,25.0,39.0,10.0,64.0,20.0,8.130081,2.6
2,634.0,828.0,366,367.8208,en croissance démographique,31,36,18,18,0.099229,4,696.92134,699.19896,4640,4635,1968,5184,135,414,11015,342.0,301.0,58.0,108.0,83.0,4.0,14.0,27.0,38.0,12509,10458,11.410345,21.963793,12.558621,8.743103,9.268966,11.873022,21.78783,12.704057,8.783164,9.30142,86957.45836,37.5,4556.0,4203.0,353.0,135.0,2.0,52.0,0.0,0.0,1.Accroissement par excédent naturel et migrat...,37,-55,250,11483.5,113.0,41.0,118.0,9364,9350,3372,15105,173681,0.38471,Bassin Résidentiel en croissance démographique,Pop Mobile,Fiscalité moyenne,Dynamique Economique,Bonne Dynamique Entreprise Serv et Com,45.961049,5.372275,14022.0,2778.0,2958.0,2642.0,2603.0,1853.0,1045.0,11228.0,3.0,334.0,743.0,1777.0,1918.0,1818.0,2878.0,1757.0,15.859777,4.079
3,113.0,62.0,132,132.65668,en croissance démographique,12,12,6,6,1.0,1,85.774,71.2761,473,473,344,505,14,18,1406,22.0,26.0,17.0,10.0,6.0,0.0,4.0,2.0,0.0,12509,10458,11.410345,21.963793,12.558621,8.743103,9.268966,11.873022,21.78783,12.704057,8.783164,9.30142,86957.45836,0.0,621.0,535.0,86.0,14.0,2.0,17.0,1.0,72.0,Grande Ville,68,-3,127,11483.5,5.0,2.0,7.0,9364,9350,3372,15105,173681,0.02824,Bassin Urbain en croissance démographique,Pop Mobile,Fiscalité moyenne,Moyenne dynamique,Faible Dynamique Serv et Com,45.996164,4.911967,1627.0,336.0,251.0,323.0,376.0,232.0,99.0,1337.0,5.0,60.0,106.0,231.0,241.0,231.0,312.0,151.0,7.875895,4.378
4,42.0,1.0,121,121.60196,en croissance démographique,26,21,10,10,0.100905,3,5.24276,6.17827,41,41,28,57,13,3,86,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12509,10458,11.410345,21.963793,12.558621,8.743103,9.268966,11.873022,21.78783,12.704057,8.783164,9.30142,86957.45836,0.0,37.0,33.0,4.0,13.0,0.0,0.0,0.0,0.0,3.Accroissement par excédent migratoire,49,0,109,11483.5,0.0,0.0,0.0,9364,9350,3372,15105,173681,0.0,Bassin Résidentiel en croissance démographique,Pop Sédentaire,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,45.749886,5.594585,109.0,12.0,16.0,15.0,29.0,27.0,10.0,89.0,0.0,0.0,5.0,25.0,15.0,15.0,25.0,5.0,13.793103,1.069



Basics statistics :


Unnamed: 0,Dynamique Entrepreneuriale,Dynamique Entrepreneuriale Service et Commerce,Synergie Médicale COMMUNE,Indice Synergie Médicale,SEG Croissance POP,Nb Omnipraticiens BV,Nb Infirmiers Libéraux BV,Nb dentistes Libéraux BV,Nb pharmaciens Libéraux BV,Densité Médicale BV,Score équipement de santé BV,Indice Démographique,Indice Ménages,Nb Ménages,Nb Résidences Principales,Nb propriétaire,Nb Logement,Nb Résidences Secondaires,Nb Log Vacants,Nb Occupants Résidence Principale,Nb Entreprises Secteur Services,Nb Entreprises Secteur Commerce,Nb Entreprises Secteur Construction,Nb Entreprises Secteur Industrie,Nb Création Enteprises,Nb Création Industrielles,Nb Création Construction,Nb Création Commerces,Nb Création Services,Moyenne Revenus Fiscaux Départementaux,Moyenne Revenus Fiscaux Régionaux,Dep Moyenne Salaires Horaires,Dep Moyenne Salaires Cadre Horaires,Dep Moyenne Salaires Prof Intermédiaire Horaires,Dep Moyenne Salaires Employé Horaires,Dep Moyenne Salaires Ouvrié Horaires,Reg Moyenne Salaires Horaires,Reg Moyenne Salaires Cadre Horaires,Reg Moyenne Salaires Prof Intermédiaire Horaires,Reg Moyenne Salaires Employé Horaires,Reg Moyenne Salaires Ouvrié Horaires,Valeur ajoutée régionale,Score Urbanité,Nb Atifs,Nb Actifs Salariés,Nb Actifs Non Salariés,Nb Logement Secondaire et Occasionnel,Nb Hotel,Capacité Hotel,Nb Camping,Capacité Camping,Dynamique Démographique BV,Taux Propriété,Dynamique Démographique INSEE,Capacité Fisc,Moyenne Revnus fiscaux,"Nb Education, santé, action sociale",Nb Services personnels et domestiques,"Nb Santé, action sociale",Nb Industries des biens intermédiaires,Nb de Commerce,Nb de Services aux particuliers,"Nb institution de Education, santé, action sociale, administration",PIB Régionnal,Score Croissance Entrepreneuriale,Environnement Démographique,Fidélité,Seg Cap Fiscale,Seg Dyn Entre,DYN SetC,latitude,longitude,Population en 2014 (princ),Pop 0-14 ans en 2014 (princ),Pop 15-29 ans en 2014 (princ),Pop 30-44 ans en 2014 (princ),Pop 45-59 ans en 2014 (princ),Pop 60-74 ans en 2014 (princ),Pop 75-89 ans en 2014 (princ),Pop 15 ans ou plus en 2014 (compl),Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl),"Pop 15 ans ou plus Artisans, Comm., Chefs entr. en 2014 (compl)","Pop 15 ans ou plus Cadres, Prof. intel. sup. en 2014 (compl)",Pop 15 ans ou plus Prof. intermédiaires en 2014 (compl),Pop 15 ans ou plus Employés en 2014 (compl),Pop 15 ans ou plus Ouvriers en 2014 (compl),Pop 15 ans ou plus Retraités en 2014 (compl),Pop 15 ans ou plus Autres en 2014 (compl),taux chômage(15-64 ans),APL
count,38576.0,38576.0,38576.0,38576.0,38576,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576,38576,38576,38576,38576,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0,38576.0
unique,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7,,,,,,,,,,,,,,7,2,3,3,3,,,,,,,,,,,,,,,,,,,,
top,,,,,en croissance démographique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Grande Ville,,,,,,,,,,,,,,Bassin Résidentiel en croissance démographique,Pop Sédentaire,Fiscalité moyenne,Faible dynamique,Faible Dynamique Serv et Com,,,,,,,,,,,,,,,,,,,,
freq,,,,,38565,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9123,,,,,,,,,,,,,,17697,30438,30171,30467,32282,,,,,,,,,,,,,,,,,,,,
mean,122.093167,132.014024,107.735742,108.271714,,16.672646,16.543836,8.038988,8.038988,0.31314,2.528515,130.133013,133.704616,887.285955,887.175005,457.164455,1074.553453,110.615668,76.76278,2081.439678,53.873833,48.939781,12.889932,17.898045,11.404345,0.693981,1.876892,2.932601,5.900871,10349.106776,10253.619349,10.99644,21.204162,12.464463,8.662701,9.053762,11.167984,21.281109,12.519966,8.686431,9.115633,41735.138478,8.469968,822.849362,724.222651,98.626711,110.569914,0.672776,23.243156,0.279008,34.064963,,57.414792,-7.441,8.590782,10301.363063,14.455905,4.665569,15.33596,3259.855999,4631.189885,1881.930708,8076.808378,81334.919743,0.045917,,,,,,46.974716,2.774623,2404.173191,430.888247,453.318799,458.693203,467.897579,363.179982,203.003914,1973.350633,15.348066,67.01882,170.972029,276.058741,324.295417,253.348844,533.767135,332.536499,10.819374,3.292278
std,632.939857,949.082363,230.597328,231.744519,,12.97084,14.427645,6.930421,6.930421,0.383306,1.320722,706.132472,821.804825,5453.632782,5443.768855,2018.993544,6354.429882,683.663897,632.807508,11207.84642,388.525148,351.531235,62.464869,100.053537,82.528927,3.720384,13.867551,19.894781,46.369087,1564.611392,1086.862459,0.949745,0.807586,0.377534,0.244611,0.399423,0.750101,0.592146,0.298957,0.204756,0.295548,46200.312428,21.96032,4279.091054,3861.881555,433.520712,683.579165,4.142705,209.655629,1.204615,246.955424,,15.615377,65.959787,2267.892741,1052.72349,112.100745,30.748857,113.248105,2586.4039,5042.427225,3397.955004,6994.424096,89335.816967,0.319263,,,,,,2.206678,2.690385,12566.038376,2029.18458,3402.794341,2486.930668,2120.737944,1621.884032,983.467221,10561.713588,29.972771,310.62047,1410.582045,1566.382588,1708.314737,1066.107542,2372.517992,2401.960258,4.839887,1.279927
min,1.0,1.0,13.0,13.06467,,1.0,0.0,0.0,0.0,0.032518,0.0,0.06096,0.15069,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6282.0,7748.0,9.8625,19.75,11.74,8.261905,8.3,10.57451,20.509549,12.17538,8.453394,8.725,3021.411002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,-1929.0,-140594.0,7664.0,0.0,0.0,0.0,30.0,342.0,187.0,828.0,6358.0,0.0,,,,,,41.435023,-5.086014,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,6.0,52.0,52.25869,,9.0,8.0,4.0,4.0,0.083808,1.0,11.58284,11.15102,74.0,74.0,58.0,103.0,9.0,5.0,189.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,9523.0,9716.0,10.4,20.686207,12.220833,8.51875,8.790698,10.679167,21.017098,12.339819,8.569608,8.915652,20389.52153,0.0,72.0,53.0,16.0,9.0,0.0,0.0,0.0,0.0,,48.0,-1.0,72.0,9643.0,0.0,0.0,0.0,1935.0,2425.0,776.0,4406.0,40484.0,0.0,,,,,,45.183013,0.669909,214.0,37.0,27.0,39.0,47.0,37.0,17.0,175.0,0.0,5.0,5.0,20.0,25.0,25.0,52.0,20.0,7.636364,2.508
50%,33.0,13.0,72.0,72.35819,,12.0,12.0,6.0,6.0,0.104593,3.0,26.39669,25.01445,166.0,166.0,127.0,219.0,23.0,13.0,428.0,6.0,5.0,4.0,3.0,2.0,0.0,0.0,0.0,1.0,10159.0,10458.0,10.708276,21.0525,12.437931,8.622581,9.038462,10.992609,21.101282,12.435217,8.634906,9.082927,30848.56372,0.0,172.0,136.0,33.0,23.0,0.0,0.0,0.0,0.0,,59.0,0.0,81.0,10304.5,0.0,0.0,0.0,2576.0,3333.0,925.0,6903.0,55218.0,0.00346,,,,,,47.400258,2.68282,507.5,94.0,68.0,97.0,110.0,84.0,39.0,410.0,8.0,17.0,20.0,53.0,65.0,63.0,119.0,48.0,10.089762,3.229
75%,75.0,46.0,105.0,105.52236,,19.0,19.0,9.0,9.0,0.180355,4.0,68.09492,64.34439,427.0,428.0,310.0,533.0,56.0,32.0,1094.0,20.0,16.0,9.0,10.0,5.0,0.0,1.0,1.0,2.0,11072.0,11150.0,11.321212,21.695833,12.640404,8.767647,9.2925,11.295855,21.640496,12.673103,8.783164,9.239535,38930.86531,0.0,448.0,373.0,74.0,56.0,0.0,0.0,0.0,0.0,,69.0,0.0,96.0,10942.5,4.0,2.0,5.0,3268.0,4858.0,1552.0,8683.0,79920.0,0.01931,,,,,,48.826465,4.958818,1340.0,253.0,187.0,255.0,286.0,222.0,111.0,1084.0,18.0,46.0,61.0,150.0,176.0,161.0,324.0,133.25,13.300854,4.009



Percentage of missing values: 
Dynamique Entrepreneuriale                        0.0
Dynamique Entrepreneuriale Service et Commerce    0.0
Synergie Médicale COMMUNE                         0.0
Indice Synergie Médicale                          0.0
SEG Croissance POP                                0.0
                                                 ... 
Pop 15 ans ou plus Ouvriers en 2014 (compl)       0.0
Pop 15 ans ou plus Retraités  en 2014 (compl)     0.0
Pop 15 ans ou plus Autres en 2014 (compl)         0.0
taux chômage(15-64 ans)                           0.0
APL                                               0.0
Length: 90, dtype: float64


In [16]:
# Separate target variable Y from features X
target_name = "APL"
print("Separating labels from features...")
Y = insee_df.loc[:, target_name]
X = insee_df.drop(target_name, axis=1)  # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()

Separating labels from features...
...Done.
0    2.293
1    2.600
2    4.079
3    4.378
4    1.069
Name: APL, dtype: float64

   Dynamique Entrepreneuriale  Dynamique Entrepreneuriale Service et Commerce  \
0                        57.0                                            23.0   
1                        45.0                                             4.0   
2                       634.0                                           828.0   
3                       113.0                                            62.0   
4                        42.0                                             1.0   

   Synergie Médicale COMMUNE  Indice Synergie Médicale  \
0                        114                 114.56713   
1                        143                 143.71141   
2                        366                 367.82080   
3                        132                 132.65668   
4                        121                 121.60196   

            SEG Croissance POP  Nb Omn

In [17]:
#Train-test splitting

print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [18]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i, t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)):
        numeric_features.append(i)
    else:
        categorical_features.append(i)
print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Indice Synergie Médicale', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Indice Ménages', 'Nb Ménages', 'Nb Résidences Principales', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Occupants Résidence Principale', 'Nb Entreprises Secteur Services', 'Nb Entreprises Secteur Commerce', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Enteprises', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Commerces', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire 

In [19]:
#Multiple Transformations preprocessing

# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [20]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5,:])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test) # Don't fit again !!
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
       Dynamique Entrepreneuriale  \
25185                        14.0   
33579                       252.0   
13641                        18.0   
38500                       218.0   
2626                         12.0   

       Dynamique Entrepreneuriale Service et Commerce  \
25185                                             2.0   
33579                                           110.0   
13641                                             5.0   
38500                                           106.0   
2626                                              3.0   

       Synergie Médicale COMMUNE  Indice Synergie Médicale  \
25185                         76                  76.37809   
33579                        310                 311.54221   
13641                         60                  60.29849   
38500                        283                 284.40789   
2626                          52                  52.25869   

                SEG

ValueError: Found unknown categories ['Bassin diversifié en déclin démographique'] in column 2 during transform

In [None]:
# Train model
model = LinearRegression() # or LniearRegression pour regression lineaire par exemple
print("Training model...")
model.fit(X_train, Y_train)  # Training is always done on train set !!
print("...Done.")

Training model...


...Done.


In [None]:
# Print R^2 scores
print("R2 score on training set : ", model.score(X_train, Y_train))
print("R2 score on test set : ", model.score(X_test, Y_test))

R2 score on training set :  0.23950416394577834
R2 score on test set :  0.24198399764242406


In [None]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = model.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = model.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[3.58245964 2.91580922 3.53371472 ... 3.28474898 2.84877295 3.011329  ]

Predictions on test set...
...Done.
[2.9179048  3.16282363 3.45585878 ... 4.10110373 3.3192888  2.08720578]



In [None]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = model.coef_.transpose(), columns=["coefficients"])
coefs

# Compute abs() and sort values
feature_importance = abs(coefs).sort_values(by = 'coefficients')
feature_importance

# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()

Names of columns corresponding to each coefficient:  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Indice Synergie Médicale', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Indice Ménages', 'Nb Ménages', 'Nb Résidences Principales', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Occupants Résidence Principale', 'Nb Entreprises Secteur Services', 'Nb Entreprises Secteur Commerce', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Enteprises', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Commerces', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne