In [1]:
import polars as pl
import gc
from codes_mapping import *
from population_data import *

Aggregating by region...

In [2]:
# Monthly extracts are read one file at a time, reduced to per-region totals,
# and the twelve monthly totals are then summed into a yearly total per region.

# Amount columns read from every monthly file, plus the region-of-residence key.
indicators = ['FLT_REM_MNT', 'FLT_PAI_MNT']
x_columns = ['BEN_RES_REG']
relevant_columns = indicators + x_columns

# Grouping key and the measures to sum. These do not change from month to
# month, so they are defined once here instead of inside the loop.
keys_to_group_by = ['BEN_RES_REG']
all_indicators = indicators + ['RAC']
all_keys = all_indicators + keys_to_group_by

# Year -> list of zero-padded month strings ('01' .. '12').
dates = {2024: [f'{i:02d}' for i in range(1, 13)]}

dataset_path = '/Data'

# One small per-region frame per month; concatenated once after the loop
# rather than growing a DataFrame with repeated vstack calls.
monthly_aggregates = []

for year, months in dates.items():
    for month in months:
        # Current month: only the needed columns are loaded to limit memory use.
        data = pl.read_csv(
            f'{dataset_path}/A{year}/A{year}{month}.csv',
            separator=';',
            infer_schema_length=1000,
            columns=relevant_columns,
        )

        # RAC = amount paid minus amount reimbursed
        # (presumably the "reste à charge" / out-of-pocket amount -- confirm).
        data = data.with_columns((pl.col('FLT_PAI_MNT') - pl.col('FLT_REM_MNT')).alias('RAC'))

        # Aggregating by region. A single group_by is enough: re-grouping an
        # already grouped frame on the same key leaves it unchanged.
        monthly_aggregates.append(
            data.select(all_keys)
            .group_by(keys_to_group_by)
            .agg(pl.col(*all_indicators).sum())
        )

        # Release the raw monthly frame before reading the next file.
        del data
        gc.collect()

# Aggregating by region one last time (summing the monthly totals) and
# replacing the region code by its name.
df_region = (
    pl.concat(monthly_aggregates)
    .select(all_keys)
    .group_by(keys_to_group_by)
    .agg(pl.col(*all_indicators).sum())
)
df_region = df_region.with_columns(pl.col(*keys_to_group_by).replace_strict(dict_region))

# Drop the rows whose region name is 'Inconnu'.
df_region = df_region.filter(pl.col('BEN_RES_REG') != 'Inconnu')

pl.Config.set_tbl_rows(100)
print(df_region)

shape: (13, 4)
┌─────────────────────────────────┬─────────────┬─────────────┬──────────┐
│ BEN_RES_REG                     ┆ FLT_REM_MNT ┆ FLT_PAI_MNT ┆ RAC      │
│ ---                             ┆ ---         ┆ ---         ┆ ---      │
│ str                             ┆ f64         ┆ f64         ┆ f64      │
╞═════════════════════════════════╪═════════════╪═════════════╪══════════╡
│ Hauts-de-France - Nord-Pas-de-… ┆ 1.1957e10   ┆ 1.4140e10   ┆ 2.1830e9 │
│ Centre-Val de Loire             ┆ 4.6264e9    ┆ 5.6644e9    ┆ 1.0380e9 │
│ Normandie                       ┆ 6.3629e9    ┆ 7.4126e9    ┆ 1.0496e9 │
│ Aquitaine-Limousin-Poitou-Char… ┆ 1.2221e10   ┆ 1.4712e10   ┆ 2.4916e9 │
│ Bourgogne-Franche-Comté         ┆ 5.0850e9    ┆ 6.1677e9    ┆ 1.0827e9 │
│ Pays de la Loire                ┆ 6.3446e9    ┆ 7.6870e9    ┆ 1.3424e9 │
│ Provence-Alpes-Côte d'Azur et … ┆ 1.4145e10   ┆ 1.6820e10   ┆ 2.6756e9 │
│ Bretagne                        ┆ 6.0534e9    ┆ 7.1597e9    ┆ 1.1063e9 │
│ Régions 

Combining the regional data with population data

In [3]:
# Attach a population figure to every region by mapping the region name
# through dict_region_population_2023.
population_expr = pl.col('BEN_RES_REG').replace_strict(dict_region_population_2023)
df_region = df_region.with_columns(POPULATION=population_expr)

# Amount paid (FLT_PAI_MNT) per inhabitant of the region.
paid_per_person_expr = pl.col('FLT_PAI_MNT') / pl.col('POPULATION')
df_region = df_region.with_columns(PAI_MNT_PERS=paid_per_person_expr)

# National average per person: total amount paid over total population,
# both summed across the regions present in df_region.
total_paid = df_region['FLT_PAI_MNT'].sum()
total_population = df_region['POPULATION'].sum()
national_average_per_person = total_paid / total_population

# Deviation of each region from the national average, in percent,
# stored in the new column 'VARIATION_AROUND_NAT_AVG'.
variation_expr = (
    100 * (pl.col('PAI_MNT_PERS') - national_average_per_person)
    / national_average_per_person
)
df_region = df_region.with_columns(VARIATION_AROUND_NAT_AVG=variation_expr)

pl.Config.set_tbl_rows(200)
print(df_region)

shape: (13, 7)
┌───────────────┬─────────────┬─────────────┬──────────┬────────────┬──────────────┬───────────────┐
│ BEN_RES_REG   ┆ FLT_REM_MNT ┆ FLT_PAI_MNT ┆ RAC      ┆ POPULATION ┆ PAI_MNT_PERS ┆ VARIATION_ARO │
│ ---           ┆ ---         ┆ ---         ┆ ---      ┆ ---        ┆ ---          ┆ UND_NAT_AVG   │
│ str           ┆ f64         ┆ f64         ┆ f64      ┆ i64        ┆ f64          ┆ ---           │
│               ┆             ┆             ┆          ┆            ┆              ┆ f64           │
╞═══════════════╪═════════════╪═════════════╪══════════╪════════════╪══════════════╪═══════════════╡
│ Hauts-de-Fran ┆ 1.1957e10   ┆ 1.4140e10   ┆ 2.1830e9 ┆ 5983823    ┆ 2362.973099  ┆ 0.815459      │
│ ce -          ┆             ┆             ┆          ┆            ┆              ┆               │
│ Nord-Pas-de-… ┆             ┆             ┆          ┆            ┆              ┆               │
│ Centre-Val de ┆ 4.6264e9    ┆ 5.6644e9    ┆ 1.0380e9 ┆ 2573295    ┆ 2201.2

Saving the data to a file

In [4]:
# Output folder, located under the dataset root.
path_to_save = f'{dataset_path}/millman_barometre/page_2'

# Write the per-region table as a semicolon-separated CSV file.
output_file = f'{path_to_save}/2024_region.csv'
df_region.write_csv(output_file, separator=';')