In [58]:
import traceback
from typing import List, Optional
import logging
from bokeh.io import output_notebook
import polars as pl
from cfa_analysis.data_retrieval import (
    get_country_mapping,
    get_indicators_data,
    get_all_duplicate_dfs,
    get_imf_data_df,
    get_cfa_and_noncfa_data,
    get_imf_data_df
)
from cfa_analysis.data_cleanup import (
    find_outliers_IQR,
    clean_up_indicators_dict,
    find_duplicate_indicators,
    merge_duplicate_dfs,
)
from cfa_analysis.analysis import process_single_indicator
from cfa_analysis.data_classes import Indicator
from cfa_analysis.constants import CFA_FRANC_ZONE, WEST_AFRICA, MIDDLE_AFRICA, SKIP_INDICATORS

In [5]:
all_dfs = []
all_countries, countries,  = get_country_mapping()
indicators = clean_up_indicators_dict(get_indicators_data())
duplicate_indicators = find_duplicate_indicators(indicators)
processed_dupes = set() # after processing duplicates, also can use to set indicators in general that I want to skip
for indicator_abbrv in indicators:
    # Wang Jahan ends in 2015 and its an index, requires more research to see if useful
    # ommitting Capital Flows because data appears very incomplete
    if (indicator_abbrv in SKIP_INDICATORS) or (indicator_abbrv in processed_dupes)  or (indicators[indicator_abbrv]['source'] in ['Wang-Jahan Index', 'Capital Flows in Developing Economies']): 
        continue
    indicator_info = indicators.get(indicator_abbrv, {})
    indicator_label = indicator_info.get("label", "")
    indicator_label = indicator_label.strip("\n") if indicator_label is not None else None
    indicator_unit = indicator_info.get("unit", "")
    indicator_unit = indicator_unit.strip("\n") if indicator_unit is not None else None
    indicator = Indicator(indicator_abbrv, indicator_info.get("description", ""), indicator_label, indicator_unit)
    try:
        if (indicator.label, indicator.unit) in duplicate_indicators:
            all_data_df = merge_duplicate_dfs(get_all_duplicate_dfs(duplicate_indicators, indicator.label, indicator.unit, processed_dupes, countries, all_countries), indicator.label)
        else:
            all_data_df = get_imf_data_df(
                get_cfa_and_noncfa_data(
                    indicator.abbrv, 
                    countries, 
                    all_countries
                ), 
                indicator.label
            )    
        all_dfs.append(all_data_df)
    except Exception as e:
        logging.debug(
            f"issue with indicator {indicator.label}, abbrv: {indicator.abbrv}, exception: {e}"
        )

In [34]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [38]:

indicator_label = 'GDP, Current Prices'

In [1]:

all_dfs[0].group_by(["Year"], maintain_order=True).agg(
    pl.col(indicator_label)
    .where(pl.col("Country").is_in(CFA_FRANC_ZONE))
    .median()
).join(
    all_dfs[0].group_by(["Year"], maintain_order=True).agg(
        pl.col(indicator_label)
        .where(
            pl.col("Country").is_in(WEST_AFRICA)
            | pl.col("Country").is_in(MIDDLE_AFRICA)
        )
        .median()
    ),
    on="Year",
)

    # ).drop_nulls().rename({indicator_label: 'cfa_median', f"{indicator_label}_right": 'noncfa_median'}).with_columns(pl.col("cfa_median").abs().alias('abs_cfa_median'),
    #             pl.col("noncfa_median").abs().alias('abs_noncfa_median')
    #             )

NameError: name 'all_dfs' is not defined

In [2]:
sample_imf_data = {
    "Benin": {
        "1980": 9.3,
        "1981": 1.9,
        "1982": 1.7,
        "1983": -2,
    },
    "Burkina Faso": {
        "1980": 4,
        "1981": 2.7,
        "1982": 1.4,
        "1983": -1.2,
    },
    "Angola": {
        "1980": 2.4,
        "1981": -4.4,
        "1982": 0,
        "1983": 4.2,
    },
    "Congo, Dem. Rep. of the": {
        "1980": 2.4,
        "1981": 0.9,
        "1982": -0.5,
        "1983": 1.4,
    },
    "Cabo Verde": {
        "1980": 5.3,
        "1981": 8.5,
        "1982": 2.8,
        "1983": 9.5,
    },
    "Ghana": {
        "1980": 0.5,
        "1981": -3.8,
        "1982": -8.3,
        "1983": -6.2,
    },
}
indicator_abbrv = "NGDP_RPCH"

In [6]:
df = get_imf_data_df(sample_imf_data, indicator_abbrv)

In [12]:
from polars.testing import assert_frame_equal

In [14]:
df.to_dict(as_series=False)

{'Country': ['Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana'],
 'Year': [1980,
  1980,
  1980,
  1980,
  1980,
  1980,
  1981,
  1981,
  1981,
  1981,
  1981,
  1981,
  1982,
  1982,
  1982,
  1982,
  1982,
  1982,
  1983,
  1983,
  1983,
  1983,
  1983,
  1983],
 'NGDP_RPCH': [9.300000190734863,
  4.0,
  2.4000000953674316,
  2.4000000953674316,
  5.300000190734863,
  0.5,
  1.899999976158142,
  2.700000047683716,
  -4.400000095367432,
  0.8999999761581421,
  8.5,
  -3.799999952316284,
  1.7000000476837158,
  1.399999976158142,
  0.0,
  -0.5,
  2.799999952316284,
  -8.300000190734863,
  -2.0,
  -1.2000000476837158,
  4.199999809265137,
  1.39999997615814

In [23]:
indicators = clean_up_indicators_dict(get_indicators_data())
duplicate_indicators = find_duplicate_indicators(indicators)

In [24]:
duplicate_indicators

{('Real GDP Growth Rate', '% change'): ['NGDP_RPCH', 'NGDP_R_PCH'],
 ('Government Revenue', '% of GDP'): ['rev', 'GGR_G01_GDP_PT'],
 ('Government Expenditure', '% of GDP'): ['exp', 'GGX_GDP']}

In [22]:
assert_frame_equal(df, pl.from_dict(df.to_dict(as_series=False), schema = {
    "Country": pl.Utf8,
    "Year": pl.Int32,
    "NGDP_RPCH" : pl.Float32
}))

In [20]:
df.to_dict(as_series=False)

{'Country': ['Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana',
  'Benin',
  'Burkina Faso',
  'Angola',
  'Congo, Dem. Rep. of the',
  'Cabo Verde',
  'Ghana'],
 'Year': [1980,
  1980,
  1980,
  1980,
  1980,
  1980,
  1981,
  1981,
  1981,
  1981,
  1981,
  1981,
  1982,
  1982,
  1982,
  1982,
  1982,
  1982,
  1983,
  1983,
  1983,
  1983,
  1983,
  1983],
 'NGDP_RPCH': [9.300000190734863,
  4.0,
  2.4000000953674316,
  2.4000000953674316,
  5.300000190734863,
  0.5,
  1.899999976158142,
  2.700000047683716,
  -4.400000095367432,
  0.8999999761581421,
  8.5,
  -3.799999952316284,
  1.7000000476837158,
  1.399999976158142,
  0.0,
  -0.5,
  2.799999952316284,
  -8.300000190734863,
  -2.0,
  -1.2000000476837158,
  4.199999809265137,
  1.39999997615814

In [21]:
df == pl.from_dict(df.to_dict(as_series=False), schema = {
    "Country": pl.Utf8,
    "Year": pl.Int32,
    "NGDP_RPCH" : pl.Float32
})

Country,Year,NGDP_RPCH
bool,bool,bool
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True


In [31]:
new_all_dfs = []
for df in all_dfs:
    new_all_dfs.append(df.with_columns(
        pl.lit(df.columns[2]).alias(indicator_label)
        pl.col('Country')
            .where(pl.col("Country").is_in(CFA_FRANC_ZONE))
            .median()
    ))

In [33]:
new_all_dfs[0]

Country,Year,"GDP, Current Prices",Fiscal Council Indicator
str,i32,f32,str
"""Benin""",1980,2.302,"""GDP, Current P…"
"""Burkina Faso""",1980,2.386,"""GDP, Current P…"
"""Central Africa…",1980,0.714,"""GDP, Current P…"
"""Côte d'Ivoire""",1980,13.877,"""GDP, Current P…"
"""Cameroon""",1980,8.85,"""GDP, Current P…"
"""Congo, Republi…",1980,2.165,"""GDP, Current P…"
"""Gabon""",1980,4.559,"""GDP, Current P…"
"""Guinea-Bissau""",1980,0.289,"""GDP, Current P…"
"""Equatorial Gui…",1980,0.032,"""GDP, Current P…"
"""Mali""",1980,2.031,"""GDP, Current P…"


In [26]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [20]:
all_dfs[0]

Country,Year,"GDP, Current Prices"
str,i32,f32
"""Benin""",1980,2.302
"""Burkina Faso""",1980,2.386
"""Central Africa…",1980,0.714
"""Côte d'Ivoire""",1980,13.877
"""Cameroon""",1980,8.85
"""Congo, Republi…",1980,2.165
"""Gabon""",1980,4.559
"""Guinea-Bissau""",1980,0.289
"""Equatorial Gui…",1980,0.032
"""Mali""",1980,2.031


In [None]:
# i'll likely wanna add column economic zone, indicator, and when they joined cfa zone

In [None]:
DataFrame.to_json(lines=True, 

In [25]:
import requests

In [48]:
response = requests.get('https://www.imf.org/external/datamapper/api/v1/GGXCNL_NGDP/Peee')

In [57]:
indicators = requests.get("https://www.imf.org/external/datamapper/api/v1/indicators").json()['indicators']

In [59]:
indicators = clean_up_indicators_dict(indicators)

In [61]:
indicators

{'NGDP_RPCH': {'label': 'Real GDP Growth Rate',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at constant prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (October 2023)',
  'unit': '% change',
  'dataset': 'WEO'},
 'NGDPD': {'label': 'GDP, Current Prices',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at current prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (October 2023)',
  'unit': 'Billions of U.S. dollars',
  'dataset': 'WEO'},
 'NGDPDPC': {'label': 'GDP Per Capita, Current Prices',
  'description': "Gross domestic product is the most commonly used single measure of a c

In [60]:
find_duplicate_indicators(indicators)

{('Real GDP Growth Rate', '% change'): ['NGDP_RPCH', 'NGDP_R_PCH'],
 ('Government Revenue', '% of GDP'): ['rev', 'GGR_G01_GDP_PT'],
 ('Government Expenditure', '% of GDP'): ['exp', 'GGX_GDP']}

In [47]:
response.json()

{'values': {'GGXCNL_NGDP': {'USA': {'2001': -0.5,
    '2002': -3.8,
    '2003': -4.8,
    '2004': -4.2,
    '2005': -3.1,
    '2006': -2,
    '2007': -2.9,
    '2008': -6.6,
    '2009': -13.2,
    '2010': -11,
    '2011': -9.7,
    '2012': -8.1,
    '2013': -4.5,
    '2014': -4,
    '2015': -3.5,
    '2016': -4.4,
    '2017': -4.8,
    '2018': -5.3,
    '2019': -5.7,
    '2020': -14,
    '2021': -11.6,
    '2022': -3.7,
    '2023': -8.2,
    '2024': -7.4,
    '2025': -7.4,
    '2026': -7,
    '2027': -6.7,
    '2028': -7}}},
 'api': {'version': '1', 'output-method': 'json'}}