# I have extracted the data for Q2 and Q3 2025

I want to have a look at the extracted data.
- I want to see that the processing makes sense, that the calculations of the taux, ecart and subsidies make sense.
- I want to have a look at weird dec/val values that maybe I want to clean
- I want to have a look at how good the dec/val values are:
    - Do we have enough data for Q2 and Q3?
    - Is what the centers are reporting consistent?
    - Are the dec and val values very different?

## 1: Initialization
Read the files

In [None]:
import polars as pl

In [105]:
not_cleaned = pl.read_csv(
    "/home/jovyan/workspace/pipelines/initialize_vbr/data/quantity_data/not_cleaned_11_17_cleaned.csv"
)
cleaned = pl.read_csv(
    "/home/jovyan/workspace/pipelines/initialize_vbr/data/quantity_data/cleaned_11_17_cleaned.csv"
)

## Check the calculations

I had a look and it looks nice

In [106]:
not_cleaned.head()

ou,month,service,dec,val,tarif,payment_mode,contract_start_date,contract_end_date,level_1_name,level_2_name,level_3_name,level_4_name,level_5_name,level_6_name,level_7_name,level_1_uid,level_2_uid,level_3_uid,level_4_uid,level_5_uid,level_6_uid,level_7_uid,level,taux_validation,ecart_dec_val,gain_verif,subside_sans_verification,subside_avec_verification,quarter
str,str,str,f64,f64,f64,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,f64,f64,f64,f64,str
"""K8OzbPQDUwM""","""2025Q2""","""19.- PMA - Nouvelles ou ancie…",5.0,5.0,300.0,"""paiement_pma""",202504,202612,"""Mali""","""Ségou""","""District Markala""","""Dlonkèbougou""","""CSCom Dlonkèbougou""",,,"""cKcuEmNzNF2""","""w9OYIUXaZuO""","""cQI1f7VjjqU""","""gkew0Rv0a5S""","""K8OzbPQDUwM""",,,5,1.0,0.0,0.0,1500.0,1500.0,"""2025Q2"""
"""qJzsim9hrUU""","""2025Q2""","""15.- PMA - Nbre de Femmes aya…",15.0,14.0,225.0,"""paiement_pma""",202504,202612,"""Mali""","""Sikasso""","""District Sikasso""","""Kaboila""","""CSCom de Kaboila""",,,"""cKcuEmNzNF2""","""YT0vnpwoUTV""","""WRcK01zDuy9""","""Lt9QHQkqx9y""","""qJzsim9hrUU""",,,5,0.933333,0.066667,225.0,3375.0,3150.0,"""2025Q2"""
"""R5ZEnH3PdFO""","""2025Q2""","""11.- PMA - Nbre d'accouchemen…",51.0,51.0,600.0,"""paiement_pma""",202504,202612,"""Mali""","""Koulikoro""","""District Banamba""","""Kondo""","""Cscom Kondo""",,,"""cKcuEmNzNF2""","""GTkaSxaspwx""","""Z2xvsAt2o5a""","""h6Daaxs0D51""","""R5ZEnH3PdFO""",,,5,1.0,0.0,0.0,30600.0,30600.0,"""2025Q2"""
"""sqjTIjUFWhQ""","""2025Q2""","""17.- PMA - Nbre de Nouvelles …",9.0,5.0,600.0,"""paiement_pma""",202504,202612,"""Mali""","""Mopti""","""District Douentza""","""Mondoro""","""CSCom Mondoro""",,,"""cKcuEmNzNF2""","""clmk5ZDaCyP""","""E62kOlA1JXK""","""ZjNy0DsHDWS""","""sqjTIjUFWhQ""",,,5,0.555556,0.444444,2400.0,5400.0,3000.0,"""2025Q2"""
"""mipdnQbWyl5""","""2025Q2""","""22.- PMA - Nbre de victimes de…",196.0,0.0,900.0,"""paiement_pma""",202504,202612,"""Mali""","""Ségou""","""District Baroueli""","""Baroueli Central""","""CSCom Baroueli central""",,,"""cKcuEmNzNF2""","""w9OYIUXaZuO""","""lTZ4jOg22OM""","""s97kRyeWttQ""","""mipdnQbWyl5""",,,5,0.0,1.0,176400.0,176400.0,0.0,"""2025Q2"""


## See weird values

In general,
- There are not many rows where the validated is a lot bigger than the declared 
- There are not many rows that have 0 declared and non-zero validated

But,
- There are a lot of rows with 0 validated... And I struggle to see if this is because we did not collect the data or because indeed there were 0 validated.

In [110]:
distributions = not_cleaned.select(
    [
        (((pl.col("dec") == pl.col("val")) & (pl.col("val") != 0)).mean() * 100).alias(
            "pct_all_validated"
        ),
        (((pl.col("val") == 0) & (pl.col("dec") != 0)).mean() * 100).alias("pct_none_validated"),
        (((pl.col("val") != 0) & (pl.col("dec") == 0)).mean() * 100).alias("pct_weird_0_dec"),
        ((pl.col("taux_validation") > 2).mean() * 100).alias("pct_weirdly_high_validated"),
    ]
)
not_cleaned = not_cleaned.with_columns(
    pl.when((pl.col("dec") == pl.col("val")) & (pl.col("val") != 0))
    .then(pl.lit("all_validated"))
    .when((pl.col("val") == 0) & (pl.col("dec") != 0))
    .then(pl.lit("none_validated"))
    .when((pl.col("val") != 0) & (pl.col("dec") == 0))
    .then(pl.lit("weird_0_dec"))
    .when(pl.col("taux_validation") > 2)
    .then(pl.lit("weirdly_high_validated"))
    .otherwise(pl.lit("not_classified"))
    .alias("classification")
)
display(distributions)

pct_all_validated,pct_none_validated,pct_weird_0_dec,pct_weirdly_high_validated
f64,f64,f64,f64
50.513602,20.490897,0.616997,0.560446


## Check if we have enough data
- The number of rows per payment mode has not changed much between Q2 and Q3
- The number of centers that have data in Q3 but not in Q2 is small
- The number of centers that have data in Q2 but not in Q3 is small
- The services do not change a lot...
    - There are some ous that use to have a fair amount of services and now have one -- that is a bit odd, but it does not seem to affect that many of them

In [116]:
rows_per_payment = (
    not_cleaned.group_by(["payment_mode", "month"])
    .agg(pl.len().alias("rows"))
    .pivot(index="payment_mode", on="month", values="rows")
)
rows_per_payment = rows_per_payment.with_columns(
    ((pl.col("2025Q3") - pl.col("2025Q2")) / pl.col("2025Q2") * 100).alias("pct_change")
)
rows_per_payment.sort("pct_change", descending=True)

payment_mode,2025Q2,2025Q3,pct_change
str,u32,u32,f64
"""paiement_infsco""",79,14,5436700000.0
"""paiement_prive""",764,911,19.240838
"""paiement_cabsf""",420,492,17.142857
"""paiement_cabmed""",1860,2095,12.634409
"""paiement_cabinf""",287,320,11.498258
"""paiement_site""",23458,25529,8.828545
"""paiement_maternite""",6697,7268,8.526206
"""paiement_pma""",22496,23470,4.329659
"""paiement_eph""",130,133,2.307692
"""paiement_pca""",1024,1030,0.5859375


In [123]:
ous_in_months = (
    not_cleaned.group_by(["ou", "month"])
    .agg(pl.len().alias("rows"))
    .pivot(index="ou", on="month", values="rows")
)
ous_in_months = ous_in_months.with_columns(
    pl.when(pl.col("2025Q2").is_null())
    .then(pl.lit("2025Q3"))
    .when(pl.col("2025Q3").is_null())
    .then(pl.lit("2025Q2"))
    .otherwise(pl.lit("both_months"))
    .alias("data_available")
)
ous_in_months["data_available"].value_counts(normalize=True)

data_available,proportion
str,f64
"""2025Q3""",0.040194
"""both_months""",0.924809
"""2025Q2""",0.034997


In [None]:
both_ous = ous_in_months.filter(pl.col("data_available") == "both_months")
both_ous = both_ous.with_columns(
    [pl.col("2025Q2").cast(pl.Float64), pl.col("2025Q3").cast(pl.Float64)]
)
both_ous = both_ous.with_columns(
    ((pl.col("2025Q3") - pl.col("2025Q2")) / pl.col("2025Q2") * 100).alias("pct_change"),
    (abs(pl.col("2025Q3") - pl.col("2025Q2"))).alias("abs_change"),
)
both_ous.sort("pct_change", descending=True)
summary_changes = both_ous.select(
    [
        pl.col("pct_change").min().alias("min_pct_change"),
        pl.col("pct_change").max().alias("max_pct_change"),
        pl.col("pct_change").mean().alias("avg_pct_change"),
        pl.col("pct_change").median().alias("median_pct_change"),
        pl.col("pct_change").quantile(0.75).alias("75th_pct_change"),
        pl.col("abs_change").min().alias("min_abs_change"),
        pl.col("abs_change").max().alias("max_abs_change"),
        pl.col("abs_change").mean().alias("avg_abs_change"),
        pl.col("abs_change").median().alias("median_abs_change"),
        pl.col("abs_change").quantile(0.75).alias("75th_pct_abs_change"),
    ]
)
display(summary_changes)

min_pct_change,max_pct_change,avg_pct_change,median_pct_change,75th_pct_change,min_abs_change,max_abs_change,avg_abs_change,median_abs_change,75th_pct_abs_change
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-88.888889,1800.0,19.717445,0.0,25.0,0.0,18.0,1.906894,1.0,3.0


## Check how good is the data
Just like before, the data is okey,
except for the big amount of rows that have 0 validated

This percetange goes from 20 to 15 if we ignore the rows that have validated null, but it is still fairly high.
I am not sure how much this will distort the election -- we should probably test it once we can.

In [138]:
taux_not_cleaned = not_cleaned.select(
    [
        pl.col("taux_validation").mean().alias("taux_mean"),
        pl.col("taux_validation").min().alias("taux_min"),
        pl.col("taux_validation").max().alias("taux_max"),
        pl.col("taux_validation").median().alias("taux_median"),
        pl.col("taux_validation").quantile(0.25).alias("taux_q25"),
        pl.col("taux_validation").quantile(0.33).alias("taux_q33"),
        pl.col("taux_validation").quantile(0.75).alias("taux_q75"),
        ((pl.col("taux_validation") > 2).mean() * 100).alias("pct_weirdly_high_validated"),
        ((pl.col("taux_validation") == 1).mean() * 100).alias("pct_all_validated"),
        ((pl.col("taux_validation") == 0).mean() * 100).alias("pct_none_validated"),
    ]
)
taux_cleaned = cleaned.select(
    [
        pl.col("taux_validation").mean().alias("taux_mean"),
        pl.col("taux_validation").min().alias("taux_min"),
        pl.col("taux_validation").max().alias("taux_max"),
        pl.col("taux_validation").median().alias("taux_median"),
        pl.col("taux_validation").quantile(0.25).alias("taux_q25"),
        pl.col("taux_validation").quantile(0.33).alias("taux_q33"),
        pl.col("taux_validation").quantile(0.75).alias("taux_q75"),
        ((pl.col("taux_validation") > 2).mean() * 100).alias("pct_weirdly_high_validated"),
        ((pl.col("taux_validation") == 1).mean() * 100).alias("pct_all_validated"),
        ((pl.col("taux_validation") == 0).mean() * 100).alias("pct_none_validated"),
    ]
)
taux = pl.concat(
    [
        taux_not_cleaned.with_columns(pl.lit("not_cleaned").alias("dataset")),
        taux_cleaned.with_columns(pl.lit("cleaned").alias("dataset")),
    ]
)
display(taux)

taux_mean,taux_min,taux_max,taux_median,taux_q25,taux_q33,taux_q75,pct_weirdly_high_validated,pct_all_validated,pct_none_validated,dataset
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.77679,0.0,282.0,1.0,0.506329,0.935484,1.0,0.560446,51.130599,20.490897,"""not_cleaned"""
0.803385,0.0,3.0,1.0,0.906977,0.958333,1.0,0.323802,54.135871,15.978401,"""cleaned"""
