# Collecte des données

* Démographiques

https://hub.worldpop.org/project/categories?id=3

https://population.un.org/wpp/downloads?folder=Standard%20Projections&group=Most%20used

https://dhsprogram.com/data/available-datasets.cfm

* Économiques

https://data.imf.org/en/Datasets#t=coveo117bcfc4&sort=%40idata_publication_date%20descending

https://www.oecd.org/en/data.html

https://unctadstat.unctad.org/EN/

* Sociales

https://www.who.int/data/gho

https://hdr.undp.org/data-center

https://genderdata.worldbank.org/en/home

In [1]:
import pandas as pd
import re

## Données démographiques et sanitaires

In [2]:
dem = pd.read_excel("./WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT.xlsx", 
                    sheet_name="Estimates", skiprows=16, index_col="Index")

Extraire les données du BENIN

In [3]:
def get_one_value_var(database):
    entete = database.columns
    one_val_var = []
    for col in entete:
        if len(database[col].value_counts()) == 1 or database[col].isna().sum() == len(database):
            one_val_var.append(col)
    return one_val_var

def extract_region_data(dataset, region: str):
    extrated_df = dataset[dataset['Region, subregion, country or area *'] == region]
    extrated_df = extrated_df.drop(get_one_value_var(extrated_df), axis = 1)
    return extrated_df

In [4]:
benin_data = extract_region_data(dem, "Benin")

Véririfer qu'il n'y a pas de données manquantes

In [5]:
benin_data.isna().sum().sort_values(ascending=False)

Year                                                                                              0
Total Population, as of 1 January (thousands)                                                     0
Total Population, as of 1 July (thousands)                                                        0
Male Population, as of 1 July (thousands)                                                         0
Female Population, as of 1 July (thousands)                                                       0
Population Density, as of 1 July (persons per square km)                                          0
Population Sex Ratio, as of 1 July (males per 100 females)                                        0
Median Age, as of 1 July (years)                                                                  0
Natural Change, Births minus Deaths (thousands)                                                   0
Rate of Natural Change (per 1,000 population)                                                     0


In [6]:
benin_data.sample(5)

Unnamed: 0_level_0,Year,"Total Population, as of 1 January (thousands)","Total Population, as of 1 July (thousands)","Male Population, as of 1 July (thousands)","Female Population, as of 1 July (thousands)","Population Density, as of 1 July (persons per square km)","Population Sex Ratio, as of 1 July (males per 100 females)","Median Age, as of 1 July (years)","Natural Change, Births minus Deaths (thousands)","Rate of Natural Change (per 1,000 population)",...,"Male Mortality before Age 60 (deaths under age 60 per 1,000 male live births)","Female Mortality before Age 60 (deaths under age 60 per 1,000 female live births)","Mortality between Age 15 and 50, both sexes (deaths under age 50 per 1,000 alive at age 15)","Male Mortality between Age 15 and 50 (deaths under age 50 per 1,000 males alive at age 15)","Female Mortality between Age 15 and 50 (deaths under age 50 per 1,000 females alive at age 15)","Mortality between Age 15 and 60, both sexes (deaths under age 60 per 1,000 alive at age 15)","Male Mortality between Age 15 and 60 (deaths under age 60 per 1,000 males alive at age 15)","Female Mortality between Age 15 and 60 (deaths under age 60 per 1,000 females alive at age 15)",Net Number of Migrants (thousands),"Net Migration Rate (per 1,000 population)"
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5957,1982.0,4096.347,4156.596,2021.254,2135.343,36.862,94.657,16.082,122.747,29.531,...,559.263,462.27,231.943,269.63,196.824,349.968,403.589,300.638,-2.253,-0.542
5929,1954.0,2316.926,2328.782,1095.56,1233.222,20.653,88.837,21.501,28.32,12.16,...,696.956,633.562,312.22,338.858,287.432,450.108,490.058,413.258,-4.602,-1.976
5989,2014.0,10866.292,11030.004,5507.544,5522.459,97.818,99.73,16.999,323.756,29.352,...,383.134,351.373,172.021,179.334,164.757,273.881,288.357,259.876,3.667,0.332
5942,1967.0,2843.024,2872.997,1373.149,1499.848,25.479,91.553,18.216,64.706,22.523,...,635.268,566.412,276.969,304.837,252.096,407.293,448.377,369.621,-4.756,-1.655
5939,1964.0,2677.744,2704.003,1287.274,1416.729,23.98,90.862,18.86,57.353,21.21,...,653.048,580.913,285.518,315.257,258.835,418.027,461.277,377.882,-4.839,-1.79


In [7]:
selected_cols = ['Year', 'Total Population, as of 1 July (thousands)',
       'Male Population, as of 1 July (thousands)',
       'Female Population, as of 1 July (thousands)','Births (thousands)',
       'Total Deaths (thousands)', 
       'Life Expectancy at Birth, both sexes (years)',
       'Net Number of Migrants (thousands)',
       'Net Migration Rate (per 1,000 population)']

In [8]:
benin_data.columns

Index(['Year', 'Total Population, as of 1 January (thousands)',
       'Total Population, as of 1 July (thousands)',
       'Male Population, as of 1 July (thousands)',
       'Female Population, as of 1 July (thousands)',
       'Population Density, as of 1 July (persons per square km)',
       'Population Sex Ratio, as of 1 July (males per 100 females)',
       'Median Age, as of 1 July (years)',
       'Natural Change, Births minus Deaths (thousands)',
       'Rate of Natural Change (per 1,000 population)',
       'Population Change (thousands)', 'Population Growth Rate (percentage)',
       'Population Annual Doubling Time (years)', 'Births (thousands)',
       'Births by women aged 15 to 19 (thousands)',
       'Crude Birth Rate (births per 1,000 population)',
       'Total Fertility Rate (live births per woman)',
       'Net Reproduction Rate (surviving daughters per woman)',
       'Mean Age Childbearing (years)',
       'Sex Ratio at Birth (males per 100 female births)',
      

## Données démographique par âge

In [9]:
dem_age = pd.read_excel("./WPP_POP_5-YEAR_AGE_GROUPS.xlsx", sheet_name="Estimates", skiprows=16, index_col="Index")

In [10]:
benin_age = extract_region_data(dem_age, "Benin")

In [11]:
benin_age.tail()

Unnamed: 0_level_0,Year,0-4,5-9,10-14,15-19,20-24,25-29,30-34,35-39,40-44,...,55-59,60-64,65-69,70-74,75-79,80-84,85-89,90-94,95-99,100+
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5994,2019.0,2092.3305,1807.271,1557.247,1327.1545,1151.6015,980.569,832.724,693.7485,553.619,...,294.6915,217.8055,156.9945,111.1135,68.977,33.205,12.255,3.018,0.439,0.035
5995,2020.0,2116.112,1859.698,1599.0325,1365.8045,1177.034,1013.8375,856.8085,718.8575,574.4505,...,306.682,226.844,162.393,113.1305,71.357,34.4405,12.509,3.032,0.433,0.0335
5996,2021.0,2135.8595,1910.8,1640.477,1408.263,1206.1245,1043.758,881.233,742.482,597.6365,...,317.9935,236.4485,168.0235,115.467,73.373,35.6995,12.699,3.005,0.414,0.0305
5997,2022.0,2154.968,1957.322,1683.2985,1453.222,1236.6395,1073.106,905.375,765.456,622.3625,...,328.8635,246.8635,174.3795,118.432,75.0465,37.0095,13.0405,3.0435,0.4085,0.029
5998,2023.0,2176.4425,1996.616,1730.4715,1496.5645,1269.0645,1102.765,929.7095,788.971,647.879,...,339.823,257.763,181.4615,121.8785,76.7765,38.473,13.547,3.148,0.418,0.0295


## Fusion des deux tables de données

In [12]:
print(benin_data.shape)
print(benin_age.shape)


(74, 55)
(74, 22)


Fusion en utilisant l'année

In [13]:
benin_df = pd.merge(benin_data[selected_cols], benin_age, on="Year")
benin_df.reset_index(drop=True)

Unnamed: 0,Year,"Total Population, as of 1 July (thousands)","Male Population, as of 1 July (thousands)","Female Population, as of 1 July (thousands)",Births (thousands),Total Deaths (thousands),"Life Expectancy at Birth, both sexes (years)",Net Number of Migrants (thousands),"Net Migration Rate (per 1,000 population)",0-4,...,55-59,60-64,65-69,70-74,75-79,80-84,85-89,90-94,95-99,100+
0,1950.0,2250.476,1055.264,1195.212,92.031,71.297,35.396,-4.603,-2.045,326.9945,...,87.864,74.406,62.897,51.86,41.239,16.287,2.7755,0.229,0.007,0
1,1951.0,2267.308,1063.923,1203.385,93.173,71.04,35.695,-4.602,-2.03,330.945,...,88.1015,74.216,61.74,49.562,39.362,17.354,3.3815,0.2135,0.009,0
2,1952.0,2285.775,1073.427,1212.349,94.854,70.852,35.988,-4.603,-2.014,335.994,...,88.331,74.2695,60.792,47.8355,36.919,18.4065,3.8535,0.214,0.0115,0
3,1953.0,2306.202,1083.943,1222.259,96.677,70.627,36.315,-4.603,-1.996,341.5675,...,88.5395,74.496,60.061,46.534,34.3775,19.1325,4.2615,0.261,0.011,0
4,1954.0,2328.782,1095.56,1233.222,98.721,70.401,36.666,-4.602,-1.976,348.317,...,88.7355,74.7795,59.584,45.4775,32.2505,19.2105,4.677,0.3645,0.0085,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,2019.0,12726.755,6372.298,6354.457,458.619,118.81,59.888,3.366,0.264,2092.3305,...,294.6915,217.8055,156.9945,111.1135,68.977,33.205,12.255,3.018,0.439,0.035
70,2020.0,13070.169,6546.526,6523.643,461.781,119.849,60.154,1.729,0.132,2116.112,...,306.682,226.844,162.393,113.1305,71.357,34.4405,12.509,3.032,0.433,0.0335
71,2021.0,13413.417,6720.676,6692.742,466.51,125.401,59.61,1.729,0.129,2135.8595,...,317.9935,236.4485,168.0235,115.467,73.373,35.6995,12.699,3.005,0.414,0.0305
72,2022.0,13759.501,6896.458,6863.042,472.635,123.117,60.475,-0.192,-0.014,2154.968,...,328.8635,246.8635,174.3795,118.432,75.0465,37.0095,13.0405,3.0435,0.4085,0.029


In [14]:
benin_df['ID'] = benin_df.index

benin_df = benin_df[[benin_df.columns[-1]] + list(benin_df.columns[:-1])]

In [15]:
benin_df.columns[23:32]

Index(['65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95-99', '100+'], dtype='object')

Exporter les données en parquet ou Excel

In [16]:
benin_df["Population 0-14"] = benin_df[benin_df.columns[11:14]].sum(axis=1)
benin_df["Population 15-59"] = benin_df[benin_df.columns[14:23]].sum(axis=1)
benin_df["Population 60+"] = benin_df[benin_df.columns[23:32]].sum(axis=1)

benin_df = benin_df.drop(benin_df.columns[11:32], axis=1)


In [17]:
benin_df.to_excel("Donnees_ben_pop.xlsx", index=False)

In [18]:
benin_df.sample()

Unnamed: 0,ID,Year,"Total Population, as of 1 July (thousands)","Male Population, as of 1 July (thousands)","Female Population, as of 1 July (thousands)",Births (thousands),Total Deaths (thousands),"Life Expectancy at Birth, both sexes (years)",Net Number of Migrants (thousands),"Net Migration Rate (per 1,000 population)",0-4,Population 15-59,Population 60+
36,36,1986.0,4675.42,2280.915,2394.505,220.177,78.133,50.632,-1.899,-0.406,884.2315,1864.5415,1926.647


In [19]:
benin_df[:5].to_parquet("Donnees_ben_pop.parquet", engine="pyarrow")

In [20]:
benin_df.columns

Index(['ID', 'Year', 'Total Population, as of 1 July (thousands)',
       'Male Population, as of 1 July (thousands)',
       'Female Population, as of 1 July (thousands)', 'Births (thousands)',
       'Total Deaths (thousands)',
       'Life Expectancy at Birth, both sexes (years)',
       'Net Number of Migrants (thousands)',
       'Net Migration Rate (per 1,000 population)', '0-4', 'Population 15-59',
       'Population 60+'],
      dtype='object')

Glossaire des variables

In [28]:
glossaire = pd.DataFrame({
    'nom': ['ID', 'Year', 'Total Population, as of 1 July (thousands)',
       'Male Population, as of 1 July (thousands)', 'Female Population, as of 1 July (thousands)', 
       'Births (thousands)', 'Total Deaths (thousands)', 'Life Expectancy at Birth, both sexes (years)',
       'Net Number of Migrants (thousands)', 'Net Migration Rate (per 1,000 population)', 
       'Population 0-14', 'Population 15-59', 'Population 60+'],

    'définitions': ['Identifiant', "L'année de la donnée", "Effectif de la population au 1er juin",
                    "Effectif des Hommes au 1er juin", "Effectif des Femmes au 1er juin", 
                    "Naissances total", "Décès total", "Expérience de vie à la naissances",
                    "Solde migratoire",  "Taux migratoire net", "Population 0 - 14 ans",
                    "Population 15 - 59 ans", "Population 60+"],
    'unité': ['nombre', "nombre d'année", "1000 habitants", "1000 habitants", "1000 habitants",
              "1000 naissances", "1000 décès", "nombre d'années", "1000 personnes", "1000 habitants",
              "habitant", "habitant", "habitant" ],
     'source' : ["https://population.un.org/wpp/downloads?folder=Standard%20Projections&group=Most%20used"]*13,
     'periode': [str(min(benin_df.Year)) + ' - ' + str(max(benin_df.Year))]*13,
     'géographie': ["Benin"]*13})

In [30]:
glossaire.to_csv('Glossaire_des_variables.csv', index=False)

In [25]:
min(benin_df.Year)

1950.0