In [1]:
import pandas as pd
import numpy as np
from openhexa.sdk import workspace
from datetime import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
data_path = '/home/hexa/workspace/PBF burundi extraction/data/'
tqdm.pandas()
from os import listdir, environ
from os.path import isfile, join
from sqlalchemy import create_engine

In [8]:
def valid_simulation_name(f):
    """Return True when the file name `f` mentions every simulation parameter tag.

    The check is a plain substring test for each tag; the position of the tags
    inside the name and the values attached to them are not inspected.
    """
    required_tags = (
        "Quality_risk",
        "p_high",
        "Paiement",
        "GAIN_VERIF_MEDIAN_MAX",
        "MIN_NB_TRIM_OBS",
        "MIN_NB_TRIM_WITH_VERIF",
        "p_low",
        "p_mod",
        "cout_verif",
        "seuil_max_moyen_risk",
        "seuil_max_bas_risk",
    )
    return all(tag in f for tag in required_tags)
def get_parameters(f):
    """Extract the simulation parameters encoded in a result file name.

    The name is read as a '-' separated list of ``<key>___<value>`` segments.
    A segment is attributed to a parameter when the parameter key is a
    substring of the segment; the value is the text between the first and the
    second ``___``, with any ``.csv`` removed.

    Parameters
    ----------
    f : str
        File name of a simulation output.

    Returns
    -------
    pandas.DataFrame
        One column per parameter (values kept as strings), followed by
        ``name`` (the file name itself) and ``model`` (``"defaut"`` when no
        segment mentions ``model``).

    Raises
    ------
    ValueError
        Raised by pandas when a parameter is missing from the name or matched
        by several segments, since the columns then have different lengths.
    """
    param_keys = [
        "p_high",
        "Quality_risk",
        "Paiement",
        "GAIN_VERIF_MEDIAN_MAX",
        "MIN_NB_TRIM_OBS",
        "MIN_NB_TRIM_WITH_VERIF",
        "p_low",
        "p_mod",
        "seuil_max_bas_risk",
        "seuil_max_moyen_risk",
        "cout_verif",
    ]
    values = {key: [] for key in param_keys}
    model = "defaut"

    for segment in f.split('-'):
        pieces = segment.split("___")
        if len(pieces) < 2:
            # The segment carries no value, so there is nothing to extract.
            continue
        # Strip the extension for every key: any segment, including the model
        # one, can be the last segment of the file name.
        value = pieces[1].replace('.csv', '')
        if "model" in segment:
            # When several segments mention the model, the last one wins.
            model = value
        for key in param_keys:
            if key in segment:
                values[key].append(value)

    # Only real parameters are matched against the segments; "name" and
    # "model" are added afterwards so they can never be appended to.
    values["name"] = f
    values["model"] = [model]
    return pd.DataFrame.from_dict(values)
def get_statistics(mypath, f):
    """Load the CSV file `f` located in folder `mypath`.

    The returned frame holds the CSV content plus a ``name`` column set to the
    file name `f` on every row.
    """
    csv_path = join(mypath, f)
    stats = pd.read_csv(csv_path)
    return stats.assign(name=f)


def str_to_date(datestr):
    """Convert a period code into a ``'YYYY-M-1'`` date string.

    Supported inputs:

    * a quarter string such as ``"2020Q2"``: the month is the quarter number
      times three (the last month of the quarter), e.g. ``"2020-6-1"``;
    * a ``YYYYMM`` month given as an int, a numpy integer, a float holding a
      whole number, or a digit-only string, e.g. ``202003`` gives
      ``"2020-3-1"``.

    The month is not zero-padded. ``None`` is returned for anything else
    (None, NaN, other strings) so the function can be mapped over a column
    that contains missing periods.
    """
    if isinstance(datestr, str) and "Q" in datestr:
        # The quarter digit is read at position 5, i.e. the "YYYYQn" layout.
        return datestr[:4] + '-' + str(int(datestr[5]) * 3) + '-1'

    # A float that holds a whole number (202003.0) is treated as that integer;
    # NaN and infinities fail is_integer() and fall through to None.
    if isinstance(datestr, float) and datestr.is_integer():
        datestr = int(datestr)

    is_integer_like = isinstance(datestr, (int, np.integer))
    is_digit_string = isinstance(datestr, str) and datestr.isdigit()
    if is_integer_like or is_digit_string:
        x = int(datestr)
        return f"{x//100}-{x%100}-1"

    return None

In [9]:
# Parameters
base_path = f"{workspace.files_path}/PBF burundi extraction/data/"
# Input folder -> name of the table / CSV file that receives its content.
DB_names = {f"{base_path}result_simulation": "VBR_results", f"{base_path}Selections_Verif" : "VBR_liste_detaillees"}

RENAMED_COLUMNS = {"level_2_name": "province", "period": "periode"}
# Keys of the per-district aggregation computed for the selection lists.
SELECTION_GROUP_KEYS = ["period", "model", "p_high", "p_mod", "p_low", "MIN_NB_TRIM_WITH_VERIF", "MIN_NB_TRIM_OBS", "GAIN_VERIF_MEDIAN_MAX", "Paiement", "name", "level_2_name", "level_3_name"]
# Indicator column -> values of `categorie_risque` counted by that column.
RISK_FLAGS = {
    "#_scores_risque_eleve": ["high", "uneligible"],
    "#_scores_risque_mod1": ["moderate_1"],
    "#_scores_risque_mod2": ["moderate_2"],
    "#_scores_risque_mod3": ["moderate_3"],
    "#_scores_risque_faible": ["low"],
}
# Columns summed per group, in the order they appear in the exported table.
# A total centre count is not part of the aggregate.
SELECTION_SUMMED_COLUMNS = ["nb_centers_verified", "#_scores_risque_faible", "#_scores_risque_mod1", "#_scores_risque_mod2", "#_scores_risque_mod3", "#_scores_risque_eleve"]

# A single engine serves every export below.
engine = create_engine(environ["WORKSPACE_DATABASE_URL"])

for mypath in DB_names:
    is_selection = "Selections" in mypath
    file_results = [f for f in listdir(mypath) if isfile(join(mypath, f)) and valid_simulation_name(f)]

    if not file_results:
        # Nothing to concatenate or export; keep both names defined for the
        # inspection cells that follow.
        print(f"No simulation file found in {mypath}; nothing exported.")
        dfs = pd.DataFrame()
        dfs_detailled = pd.DataFrame()
        continue

    # Frames are collected in lists and concatenated once, instead of growing
    # a DataFrame with pd.concat on every iteration.
    summary_frames = []
    detailed_frames = []
    for f in file_results:
        df = get_parameters(f).merge(get_statistics(mypath, f), on = "name")
        if is_selection:
            # Keep the row-level list before it is aggregated.
            detailed_frames.append(df)

            risk = df["categorie_risque"]
            df = df.assign(
                nb_centers_verified=df["verified"].map(lambda x: 1 if x else 0),
                **{column: risk.isin(levels).astype(int) for column, levels in RISK_FLAGS.items()},
            )
            df = df.groupby(SELECTION_GROUP_KEYS, as_index=False)[SELECTION_SUMMED_COLUMNS].sum()
        summary_frames.append(df)

    dfs = pd.concat(summary_frames, ignore_index = True).rename(columns = RENAMED_COLUMNS)
    dfs = dfs.sort_values(["province","periode"]).drop('name', axis=1)
    if "result_simulation" in mypath:
        print(dfs.columns)
        # Difference between the total cost of the systematic scheme and the VBR one.
        dfs["gain_vbr"] = dfs['cout total (syst)'] - dfs["cout total (VBR)"]
    dfs.to_csv(join(data_path, f"{DB_names[mypath]}.csv"))
    dfs.to_sql(DB_names[mypath],con=engine, if_exists="replace")

    if is_selection:
        dfs_detailled = pd.concat(detailed_frames, ignore_index = True).rename(columns = RENAMED_COLUMNS)
        dfs_detailled["verified"] = dfs_detailled["verified"].astype(int)
        dfs_detailled["date"] = dfs_detailled["periode"].map(str_to_date)
        dfs_detailled = dfs_detailled.sort_values(["province","periode"]).drop('name', axis=1)
        dfs_detailled.to_sql("VBR_liste_verification",con=engine, if_exists="replace")
        dfs_detailled.to_csv(join(data_path, "VBR_liste_verification.csv"))
    else:
        dfs_detailled = pd.DataFrame()

Index(['p_high', 'Quality_risk', 'Paiement', 'GAIN_VERIF_MEDIAN_MAX',
       'MIN_NB_TRIM_OBS', 'MIN_NB_TRIM_WITH_VERIF', 'p_low', 'p_mod',
       'seuil_max_bas_risk', 'seuil_max_moyen_risk', 'cout_verif', 'model',
       'province', 'periode', '#centres', '#risque élevé', '#risque modéré',
       '#risque faible', '# vérifiés', 'gain moyen', 'taux validation moyen',
       'cout vérif (VBR)', 'cout vérif (syst)', 'subsides santé (VBR)',
       'subsides santé (syst)', 'cout total (VBR)', 'cout total (syst)',
       'cout verif sur cout total (VBR)', 'cout verif sur cout total (syst)',
       '#centres lese par taux validation',
       '#centres favorise par taux validation', 'Total subsides sous évalués',
       'Total subsides sur-évalués', 'perte médiane pour centres non-vérifiés',
       'gain médian pour centres vérifiés',
       '#_scores_qualite_risqués (centres vérifiés)',
       '#_scores_qualite_risqués (centres non-vérifiés)',
       '#_scores_qualite_non-risqués (centres v

In [4]:
# Inspect the columns of the row-level verification list built by the export cell.
dfs_detailled.columns

Index(['p_high', 'Quality_risk', 'Paiement', 'GAIN_VERIF_MEDIAN_MAX',
       'MIN_NB_TRIM_OBS', 'MIN_NB_TRIM_WITH_VERIF', 'p_low', 'p_mod',
       'seuil_max_bas_risk', 'seuil_max_moyen_risk', 'cout_verif', 'model',
       'periode', 'ou', 'level_2_uid', 'province', 'level_3_uid',
       'level_3_name', 'level_4_uid', 'level_4_name', 'level_5_uid',
       'level_5_name', 'level_6_uid', 'level_6_name', 'verified',
       'gain_verif_median_precedent', 'gain_verif_actuel',
       'benefice_net_verification', 'gain_perte_subside_taux_val',
       'taux_validation', 'subside_dec_period_verif',
       'subside_val_period_verif', 'subside_period_verif', 'ecart_median',
       'categorie_risque', 'indicateurs_qualite_risque_eleve',
       'indicateurs_qualite_risque_mod', 'indicateurs_qualite_risque_faible',
       'Administration du centre de sante', 'Bloc opératoire et Chirurgie',
       'Consultations Externes et Urgences',
       'Gestion des médicaments et produits pharmaceutiques',
  

In [5]:
# Inspect the columns of `dfs`, which holds the frame of the last folder processed by the export cell.
dfs.columns

Index(['periode', 'model', 'p_high', 'p_mod', 'p_low',
       'MIN_NB_TRIM_WITH_VERIF', 'MIN_NB_TRIM_OBS', 'GAIN_VERIF_MEDIAN_MAX',
       'Paiement', 'province', 'level_3_name', 'nb_centers_verified',
       '#_scores_risque_faible', '#_scores_risque_mod1',
       '#_scores_risque_mod2', '#_scores_risque_mod3',
       '#_scores_risque_eleve'],
      dtype='object')