In [3]:
import traceback
from typing import List, Optional
import logging
from bokeh.io import output_notebook
import polars as pl
from cfa_analysis.data_retrieval import (
    get_country_mapping,
    get_indicators_data,
    get_all_duplicate_dfs,
    get_imf_data_df,
    get_cfa_and_noncfa_data,
)
from cfa_analysis.data_cleanup import (
    find_outliers_IQR,
    clean_up_indicators_dict,
    find_duplicate_indicators,
    merge_duplicate_dfs,
)
from cfa_analysis.analysis import process_single_indicator
from cfa_analysis.data_classes import Indicator
from cfa_analysis.constants import CFA_FRANC_ZONE, WEST_AFRICA, MIDDLE_AFRICA, SKIP_INDICATORS

In [5]:
all_dfs = []
all_countries, countries,  = get_country_mapping()
indicators = clean_up_indicators_dict(get_indicators_data())
duplicate_indicators = find_duplicate_indicators(indicators)
processed_dupes = set() # after processing duplicates, also can use to set indicators in general that I want to skip
for indicator_abbrv in indicators:
    # Wang Jahan ends in 2015 and its an index, requires more research to see if useful
    # ommitting Capital Flows because data appears very incomplete
    if (indicator_abbrv in SKIP_INDICATORS) or (indicator_abbrv in processed_dupes)  or (indicators[indicator_abbrv]['source'] in ['Wang-Jahan Index', 'Capital Flows in Developing Economies']): 
        continue
    indicator_info = indicators.get(indicator_abbrv, {})
    indicator_label = indicator_info.get("label", "")
    indicator_label = indicator_label.strip("\n") if indicator_label is not None else None
    indicator_unit = indicator_info.get("unit", "")
    indicator_unit = indicator_unit.strip("\n") if indicator_unit is not None else None
    indicator = Indicator(indicator_abbrv, indicator_info.get("description", ""), indicator_label, indicator_unit)
    try:
        if (indicator.label, indicator.unit) in duplicate_indicators:
            all_data_df = merge_duplicate_dfs(get_all_duplicate_dfs(duplicate_indicators, indicator.label, indicator.unit, processed_dupes, countries, all_countries), indicator.label)
        else:
            all_data_df = get_imf_data_df(
                get_cfa_and_noncfa_data(
                    indicator.abbrv, 
                    countries, 
                    all_countries
                ), 
                indicator.label
            )    
        all_dfs.append(all_data_df)
    except Exception as e:
        logging.debug(
            f"issue with indicator {indicator.label}, abbrv: {indicator.abbrv}, exception: {e}"
        )

In [34]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [38]:

indicator_label = 'GDP, Current Prices'

In [1]:

all_dfs[0].group_by(["Year"], maintain_order=True).agg(
    pl.col(indicator_label)
    .where(pl.col("Country").is_in(CFA_FRANC_ZONE))
    .median()
).join(
    all_dfs[0].group_by(["Year"], maintain_order=True).agg(
        pl.col(indicator_label)
        .where(
            pl.col("Country").is_in(WEST_AFRICA)
            | pl.col("Country").is_in(MIDDLE_AFRICA)
        )
        .median()
    ),
    on="Year",
)

    # ).drop_nulls().rename({indicator_label: 'cfa_median', f"{indicator_label}_right": 'noncfa_median'}).with_columns(pl.col("cfa_median").abs().alias('abs_cfa_median'),
    #             pl.col("noncfa_median").abs().alias('abs_noncfa_median')
    #             )

NameError: name 'all_dfs' is not defined

In [31]:
new_all_dfs = []
for df in all_dfs:
    new_all_dfs.append(df.with_columns(
        pl.lit(df.columns[2]).alias(indicator_label)
        pl.col('Country')
            .where(pl.col("Country").is_in(CFA_FRANC_ZONE))
            .median()
    ))

In [33]:
new_all_dfs[0]

Country,Year,"GDP, Current Prices",Fiscal Council Indicator
str,i32,f32,str
"""Benin""",1980,2.302,"""GDP, Current P…"
"""Burkina Faso""",1980,2.386,"""GDP, Current P…"
"""Central Africa…",1980,0.714,"""GDP, Current P…"
"""Côte d'Ivoire""",1980,13.877,"""GDP, Current P…"
"""Cameroon""",1980,8.85,"""GDP, Current P…"
"""Congo, Republi…",1980,2.165,"""GDP, Current P…"
"""Gabon""",1980,4.559,"""GDP, Current P…"
"""Guinea-Bissau""",1980,0.289,"""GDP, Current P…"
"""Equatorial Gui…",1980,0.032,"""GDP, Current P…"
"""Mali""",1980,2.031,"""GDP, Current P…"


In [26]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [20]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [None]:
# i'll likely wanna add column economic zone, indicator, and when they joined cfa zone

In [None]:
DataFrame.to_json(lines=True, 