# Transformation des données des tables raw_data vers la table indicateur pour le schéma public

In [None]:
# import des librairies nécessaires pour l'exploration
import pandas as pd
from pandas_profiling import ProfileReport

from src.postgresql_connector import PostgreSQLConnector

# Raise pandas' display limits to 100 columns and 350 rows.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 350

## Lire des données d'une table

In [None]:
def read_table(table: str, schema = 'public') -> pd.DataFrame:
    """Load every row of ``schema.table`` into a DataFrame.

    Args:
        table: Name of the table to read.
        schema: Name of the schema holding the table; defaults to ``'public'``.

    Returns:
        The result of ``SELECT *`` on the table, as a ``pandas.DataFrame``.

    Note:
        Both names are interpolated into the SQL text as-is, so they must
        come from trusted code, never from user input.
    """
    connector = PostgreSQLConnector()
    select_all = f'''
                SELECT *
                FROM {schema}.{table}
                '''
    return pd.read_sql_query(sql=select_all, con=connector.connection)

In [None]:
def execute_sql(sql_request: str) -> pd.DataFrame:
    """Run a SQL query and return its result set as a DataFrame.

    Args:
        sql_request: Full text of the SQL query to execute.

    Returns:
        The rows returned by the query, as a ``pandas.DataFrame``.
    """
    # A new connector is created for each call; its connection is handed to pandas.
    connector = PostgreSQLConnector()
    return pd.read_sql_query(sql=sql_request, con=connector.connection)

### Lecture des tables 

In [None]:
# Load the four source tables of the raw_data schema into DataFrames.
metadata_chantier = read_table('metadata_chantier', schema='raw_data')
fact_progress_chantier = read_table('fact_progress_chantier', schema='raw_data')
dim_structures = read_table('dim_structures', schema='raw_data')
dim_tree_nodes = read_table('dim_tree_nodes', schema='raw_data')

In [None]:
# Build a profiling report for metadata_chantier and export it as an HTML file.
prof = ProfileReport(metadata_chantier)
prof.to_file(output_file='rapport_metadata_chantier.html')

In [None]:
# Build a profiling report for fact_progress_chantier and export it as an HTML file.
prof = ProfileReport(fact_progress_chantier)
prof.to_file(output_file='rapport_fact_progress_chantier.html')

In [None]:
# Build a profiling report for dim_structures and export it as an HTML file.
prof = ProfileReport(dim_structures)
prof.to_file(output_file='rapport_dim_structures.html')

In [None]:
# Build a profiling report for dim_tree_nodes and export it as an HTML file.
prof = ProfileReport(dim_tree_nodes)
prof.to_file(output_file='rapport_dim_tree_nodes.html')

### Requête SQL

#### Dfakto chantier

Récupération des données des chantiers avec les données issues de dfakto

In [None]:
# Query joining the dfakto progress facts to their tree node and structure,
# restricted to the 'Réforme', 'Région' and 'Département' structures.
# tree_node_code is split on '-' into code_chantier (1st part) and
# code_region (2nd part).
# Plain string on purpose: the query has no placeholders, and an f-string
# would treat any brace later added to the SQL as a format field.
sql_request_dfakto = ''' 
    SELECT fpc.tree_node_id, 
        fpc.bounded_progress, 
        fpc.progress, 
        dtn.structure_id, 
        dtn.maturity_id, 
        dtn.tree_node_code,
        split_part(dtn.tree_node_code, '-', 1) as code_chantier,
        split_part(dtn.tree_node_code, '-', 2) as code_region, 
        dtn.tree_node_name, 
        dtn.tree_node_status, 
        ds.top_level_id, 
        ds.structure_name, 
        ds.structure_level 
    FROM raw_data.fact_progress_chantier fpc 
        JOIN raw_data.dim_tree_nodes dtn ON fpc.tree_node_id = dtn.tree_node_id 
        JOIN raw_data.dim_structures ds ON dtn.structure_id = ds.structure_id
        WHERE  ds.structure_name IN ('Réforme', 'Région', 'Département')
'''

In [None]:
# Run the dfakto query, then compare the total row count with the number of
# distinct tree_node_code values.
dfakto_chantiers = execute_sql(sql_request_dfakto)
# Disabled step kept for reference: drop duplicated columns.
# dfakto_chantiers = dfakto_chantiers.T.drop_duplicates().T
print(dfakto_chantiers.shape)
print(
    dfakto_chantiers[['tree_node_code']]
    .value_counts(ascending=True)
    .reset_index(name='count')
    .shape
)

In [None]:
# Number of rows per structure name in the dfakto result.
dfakto_chantiers['structure_name'].value_counts()

In [None]:
# Preview the first 200 rows of the dfakto result.
dfakto_chantiers.head(n=200)

#### Public chantier

Dans cette partie, on souhaite construire la table `public.chantier` pour les niveaux : réforme (national), régions et départements

##### Table public.chantier au niveau National 

In [None]:
# Query building the national-level rows of the chantier table.
# The chantier_dfakto CTE keeps the dfakto progress rows whose structure is
# 'Réforme'; each metadata_chantier row is LEFT JOINed to it, so chantiers
# without a match keep NULL progress, and then paired with the metadata_zone
# row(s) whose zone_id is 'FRANCE'.
# NOTE(review): the join compares mc.ch_perseverant with cd.code_region (the
# second '-' segment of tree_node_code), not cd.code_chantier. Confirm this
# is the intended key.
# Plain string on purpose: the query has no placeholders, and an f-string
# would treat any brace later added to the SQL as a format field.
sql_chantier_national = '''
    WITH chantier_dfakto AS (
        SELECT fpc.tree_node_id,
            fpc.bounded_progress,
            fpc.progress,
            dtn.structure_id,
            dtn.maturity_id ,
            dtn.tree_node_code, 
            split_part(dtn.tree_node_code, '-', 1) as code_chantier,
            split_part(dtn.tree_node_code, '-', 2) as code_region,
            dtn.tree_node_name, 
            dtn.tree_node_status,
            ds.top_level_id,
            ds.structure_name ,
            ds.structure_level
        FROM raw_data.fact_progress_chantier fpc 
            JOIN raw_data.dim_tree_nodes dtn ON fpc.tree_node_id = dtn.tree_node_id 
            JOIN raw_data.dim_structures ds ON dtn.structure_id = ds.structure_id 
        WHERE ds.structure_name='Réforme'
    )
    SELECT mc.chantier_id as id,
        mc.ch_nom as nom,
        mc.ch_per as ids_perimetre,
        mc."porteur_ids_noDAC" as porteurs_ids,
        mc."porteur_ids_DAC" as porteurs_dac_ids,
        cd.bounded_progress as taux_avancement,
        mz.zone_id as zone_id,
        mz.nom as zone_nom,
        mz.zone_code as code_insee,
        -- debug
        cd.code_chantier,
        cd.code_region
    FROM raw_data.metadata_chantier mc
        LEFT JOIN chantier_dfakto cd ON mc.ch_perseverant = cd.code_region
        JOIN raw_data.metadata_zone mz ON mz.zone_id = 'FRANCE';
'''

In [None]:
# Run the national-level query; the cell then evaluates the result's shape (rows, columns).
chantier = execute_sql(sql_chantier_national)
chantier.shape

In [None]:
# Summary statistics for every column of the national result, whatever its dtype.
chantier.describe(include='all')

In [None]:
# Count the non-null ch_perseverant values in the chantier metadata.
print(f'nombre de chantiers perséverants {metadata_chantier["ch_perseverant"].count()}')

In [None]:
# Count the national rows whose code_chantier is non-null, i.e. those matched by the LEFT JOIN.
print(f'nombre de chantiers perséverants au niveau national {chantier["code_chantier"].count()}')

Le chantier UQP est persévérant, mais n'a pas de réforme au niveau national.

##### Table public.chantier aux niveaux des Régions et Départements