# Exploration — Démographie

Exploration des 3 fichiers de données démographiques :
- **Population historique** : `base-pop-historiques-1876-2023.xlsx`
- **Naissances** : `DS_ETAT_CIVIL_NAIS_COMMUNES_data.csv`
- **Décès** : `DS_ETAT_CIVIL_DECES_COMMUNES_data.csv`

In [1]:
import pandas as pd
import os

DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), '')  # dossier parent = demographie/
print(f"Dossier données : {DATA_DIR}")

Dossier données : /Users/cedricsanchez/Documents/MSPR/MSPR_1/data/input/demographie/


## 1. Population historique (1876–2023)

In [2]:
pop_file = os.path.join(DATA_DIR, 'base-pop-historiques-1876-2023.xlsx')
pop = pd.read_excel(pop_file)
print(f"Shape : {pop.shape}")
print(f"Colonnes : {list(pop.columns)}")
pop.head()

Shape : (34882, 41)
Colonnes : ['Chiffres détaillés - Séries historiques de population (1876 à 2023)', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40']


Unnamed: 0,Chiffres détaillés - Séries historiques de population (1876 à 2023),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,France hors Mayotte - Communes,,,,,,,,,,...,,,,,,,,,,
1,Mise en ligne : décembre 2025 - Géographie au ...,,,,,,,,,,...,,,,,,,,,,
2,"©Insee - Source(s) : Insee, recensements de la...",,,,,,,,,,...,,,,,,,,,,
3,Code géographique,Région,Département,Libellé géographique,Population en 2023,Population en 2022,Population en 2021,Population en 2020,Population en 2019,Population en 2018,...,Population en 1926,Population en 1921,Population en 1911,Population en 1906,Population en 1901,Population en 1896,Population en 1891,Population en 1886,Population en 1881,Population en 1876
4,CODGEO,REG,DEP,LIBGEO,PMUN2023,PMUN2022,PMUN2021,PMUN2020,PMUN2019,PMUN2018,...,PTOT1926,PTOT1921,PTOT1911,PTOT1906,PTOT1901,PTOT1896,PTOT1891,PTOT1886,PTOT1881,PTOT1876


In [3]:
pop.info()
print()
pop.describe()

<class 'pandas.DataFrame'>
RangeIndex: 34882 entries, 0 to 34881
Data columns (total 41 columns):
 #   Column                                                               Non-Null Count  Dtype 
---  ------                                                               --------------  ----- 
 0   Chiffres détaillés - Séries historiques de population (1876 à 2023)  34882 non-null  str   
 1   Unnamed: 1                                                           34879 non-null  str   
 2   Unnamed: 2                                                           34879 non-null  str   
 3   Unnamed: 3                                                           34880 non-null  str   
 4   Unnamed: 4                                                           34879 non-null  object
 5   Unnamed: 5                                                           34879 non-null  object
 6   Unnamed: 6                                                           34879 non-null  object
 7   Unnamed: 7              

Unnamed: 0,Chiffres détaillés - Séries historiques de population (1876 à 2023),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
count,34882,34879,34879,34880,34879,34879,34879,34879,34879,34879,...,34407,34407,34407,34407,34407,34407,34407,34407,34407,34407
unique,34882,19,102,32643,5887,5894,5873,5868,5878,5812,...,4043,3947,4132,4113,4101,4060,4043,4078,4020,4003
top,France hors Mayotte - Communes,44,62,Sainte-Colombe,143,99,144,122,139,87,...,166,215,330,208,200,405,200,502,300,300
freq,1,5115,887,12,81,78,72,78,76,78,...,75,79,71,73,79,70,69,75,72,64


## 2. Naissances par commune

In [4]:
nais_file = os.path.join(DATA_DIR, 'DS_ETAT_CIVIL_NAIS_COMMUNES_data.csv')
nais = pd.read_csv(nais_file, sep=';', low_memory=False)
print(f"Shape : {nais.shape}")
print(f"Colonnes : {list(nais.columns)}")
nais.head()

Shape : (710821, 7)
Colonnes : ['EC_MEASURE', 'FREQ', 'GEO', 'GEO_OBJECT', 'OBS_STATUS', 'TIME_PERIOD', 'OBS_VALUE']


Unnamed: 0,EC_MEASURE,FREQ,GEO,GEO_OBJECT,OBS_STATUS,TIME_PERIOD,OBS_VALUE
0,LVB,A,68170,COM,A,2018,5.0
1,LVB,A,68185,COM,A,2015,14.0
2,LVB,A,68181,COM,A,2018,1.0
3,LVB,A,68177,COM,A,2021,9.0
4,LVB,A,68191,COM,A,2011,4.0


In [5]:
nais.info()
print()
nais.describe()

<class 'pandas.DataFrame'>
RangeIndex: 710821 entries, 0 to 710820
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   EC_MEASURE   710821 non-null  str    
 1   FREQ         710821 non-null  str    
 2   GEO          710821 non-null  str    
 3   GEO_OBJECT   710821 non-null  str    
 4   OBS_STATUS   710821 non-null  str    
 5   TIME_PERIOD  710821 non-null  int64  
 6   OBS_VALUE    710593 non-null  float64
dtypes: float64(1), int64(1), str(5)
memory usage: 47.1 MB



Unnamed: 0,TIME_PERIOD,OBS_VALUE
count,710821.0,710593.0
mean,2016.0,210.711676
std,4.898983,6367.167088
min,2008.0,0.0
25%,2012.0,2.0
50%,2016.0,6.0
75%,2020.0,26.0
max,2024.0,831495.0


In [6]:
# Métadonnées
meta_nais = pd.read_csv(os.path.join(DATA_DIR, 'DS_ETAT_CIVIL_NAIS_COMMUNES_metadata.csv'), sep=';', low_memory=False)
meta_nais.head(20)

Unnamed: 0,COD_VAR,LIB_VAR,COD_MOD,LIB_MOD
0,EC_MEASURE,Mesure état civil,LVB,Nombre de naissances vivantes
1,FREQ,Fréquence,A,Annuel
2,OBS_STATUS,Statut de l'observation,A,Normale
3,OBS_STATUS,Statut de l'observation,M,Valeur manquante : les données n’existent pas ...
4,TIME_PERIOD,Période temporelle,2014,2014
5,TIME_PERIOD,Période temporelle,2015,2015
6,TIME_PERIOD,Période temporelle,2016,2016
7,TIME_PERIOD,Période temporelle,2017,2017
8,TIME_PERIOD,Période temporelle,2018,2018
9,TIME_PERIOD,Période temporelle,2019,2019


## 3. Décès par commune

In [7]:
deces_file = os.path.join(DATA_DIR, 'DS_ETAT_CIVIL_DECES_COMMUNES_data.csv')
deces = pd.read_csv(deces_file, sep=';', low_memory=False)
print(f"Shape : {deces.shape}")
print(f"Colonnes : {list(deces.columns)}")
deces.head()

Shape : (710821, 7)
Colonnes : ['EC_MEASURE', 'FREQ', 'GEO', 'GEO_OBJECT', 'OBS_STATUS', 'TIME_PERIOD', 'OBS_VALUE']


Unnamed: 0,EC_MEASURE,FREQ,GEO,GEO_OBJECT,OBS_STATUS,TIME_PERIOD,OBS_VALUE
0,DTH,A,97602,COM,M,2010,
1,DTH,A,97602,COM,M,2011,
2,DTH,A,97602,COM,M,2012,
3,DTH,A,97602,COM,M,2013,
4,DTH,A,97603,COM,M,2009,


In [8]:
deces.info()
print()
deces.describe()

<class 'pandas.DataFrame'>
RangeIndex: 710821 entries, 0 to 710820
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   EC_MEASURE   710821 non-null  str    
 1   FREQ         710821 non-null  str    
 2   GEO          710821 non-null  str    
 3   GEO_OBJECT   710821 non-null  str    
 4   OBS_STATUS   710821 non-null  str    
 5   TIME_PERIOD  710821 non-null  int64  
 6   OBS_VALUE    710593 non-null  float64
dtypes: float64(1), int64(1), str(5)
memory usage: 47.1 MB



Unnamed: 0,TIME_PERIOD,OBS_VALUE
count,710821.0,710593.0
mean,2016.0,162.572033
std,4.898983,4929.168571
min,2008.0,0.0
25%,2012.0,2.0
50%,2016.0,5.0
75%,2020.0,29.0
max,2024.0,673048.0


## 4. Qualité des données

In [9]:
for nom, df in [('Population', pop), ('Naissances', nais), ('Décès', deces)]:
    nulls = df.isnull().sum()
    total = nulls.sum()
    print(f"{nom} — {total} valeurs manquantes sur {df.shape[0] * df.shape[1]}")
    if total > 0:
        for col, n in nulls[nulls > 0].items():
            print(f"  {col}: {n} ({100*n/len(df):.1f}%)")

Population — 5449 valeurs manquantes sur 1430162
  Unnamed: 1: 3 (0.0%)
  Unnamed: 2: 3 (0.0%)
  Unnamed: 3: 2 (0.0%)
  Unnamed: 4: 3 (0.0%)
  Unnamed: 5: 3 (0.0%)
  Unnamed: 6: 3 (0.0%)
  Unnamed: 7: 3 (0.0%)
  Unnamed: 8: 3 (0.0%)
  Unnamed: 9: 3 (0.0%)
  Unnamed: 10: 3 (0.0%)
  Unnamed: 11: 3 (0.0%)
  Unnamed: 12: 3 (0.0%)
  Unnamed: 13: 3 (0.0%)
  Unnamed: 14: 3 (0.0%)
  Unnamed: 15: 3 (0.0%)
  Unnamed: 16: 3 (0.0%)
  Unnamed: 17: 3 (0.0%)
  Unnamed: 18: 3 (0.0%)
  Unnamed: 19: 3 (0.0%)
  Unnamed: 20: 3 (0.0%)
  Unnamed: 21: 3 (0.0%)
  Unnamed: 22: 3 (0.0%)
  Unnamed: 23: 3 (0.0%)
  Unnamed: 24: 3 (0.0%)
  Unnamed: 25: 3 (0.0%)
  Unnamed: 26: 3 (0.0%)
  Unnamed: 27: 3 (0.0%)
  Unnamed: 28: 29 (0.1%)
  Unnamed: 29: 115 (0.3%)
  Unnamed: 30: 475 (1.4%)
  Unnamed: 31: 475 (1.4%)
  Unnamed: 32: 475 (1.4%)
  Unnamed: 33: 475 (1.4%)
  Unnamed: 34: 475 (1.4%)
  Unnamed: 35: 475 (1.4%)
  Unnamed: 36: 475 (1.4%)
  Unnamed: 37: 475 (1.4%)
  Unnamed: 38: 475 (1.4%)
  Unnamed: 39: 475 (1.4%)
 