In [22]:
import pandas as pd
import polars as pl
from IPython.display import display
from bokeh.plotting import show
from bokeh.io import output_notebook
from cfa_analysis.data_retrieval import (
    get_data_from_imf,
    rename_from_abbr_to_full_name,
    get_all_metric_data,
    get_cfa_and_noncfa_data,
    get_country_mapping,
    get_indicators_data,
    get_imf_data_df,
)
from cfa_analysis.constants import CFA_FRANC_ZONE, WEST_AFRICA, MIDDLE_AFRICA
from cfa_analysis.data_cleanup import remove_outliers, find_outliers_IQR
from cfa_analysis.presentation import generate_graph, chat_gpt_analyze_results
from cfa_analysis.analysis import analyze_medians
import panel as pn

%load_ext jupyter_ai_magics
output_notebook()
pn.extension()

The jupyter_ai_magics extension is already loaded. To reload it, use:
  %reload_ext jupyter_ai_magics


In [None]:
""" 
TODO:

PRIORITY:
- correct for when CFA countries joined economic union, requires correct analyze functions
- indicator on graph, telling user about negative numbers formatting
- fix issue where some chatgpt formatting doesn't respect the two sections
- see if we can flip the chart for negative numbers
- try chat gpt model

- make graph bigger?  
- how did I determine the african regions? 
- real long term government bond yield the chatgpt explanation is broken thinks both
- Direct Investment Abroad - chatgpt reading it as none
- to add zeros to the graph can use patch https://github.com/bokeh/bokeh/issues/6536 


SECONDARY:
- consider percentiles, could do a box plot
- pre 2000 how much data do we actually have? 

ELSE: 
- graph the confidence intervals - MAKES NO SENSE - but break that out into another notebook and save it
- break out the cfa into the two economic unions?
- make question of the analysis more explicit
- save the results in structure, in case I want to make changes. 
- set up drone
- consider making the graph wider 
- review prompt from gpt4 about analyzing results - might want to graph confidence intervals https://stackoverflow.com/questions/71916767/how-to-plot-confidence-interval-of-a-time-series-data-in-python 
- https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwjSz5mNp5qCAxW8FFkFHRYyC9AQFnoECBgQAQ&url=https%3A%2F%2Fwww.elibrary.imf.org%2Fdownloadpdf%2Fbook%2F9781589066755%2Fch005.pdf&usg=AOvVaw2_0dmxOSG60i6sJfHjELfI&opi=89978449 check out for ideas
"""

In [None]:
"""
PROMPT:
- maybe take all of chat gpts conclusion and try to create an overall summary ?, also the unit should be provided in the explaination of the metric 
"""

In [None]:
"""
NOTES:
- will not remove outliers for cfa franc zone because each period only has 14 data points 
- some contention with what chatgpt considers positive negative indicators
- skipping analysis if data isn't recent 
- changed analysis to be only middle and western africa
- no log scale value for zero
- Comoros is in the cfa zone, but from eastern africa zone so not in this analysis
"""

In [None]:
# from gtp4
"""To understand the economic standing of different countries across various economic zones, statistical indicators such as median and standard deviation are commonly used. However, it can be beneficial to consider a broader array of statistical measures that provide additional insight, such as the mean, percentile, skewness, and kurtosis.
Median

Definition:

The median is the middle point of a number set, in which half the numbers are above the median and half are below.

Advantages:

    Robust to outliers: The median is not heavily influenced by outliers, providing a more reliable measure for highly skewed distributions.
    Typical values representation: The median provides a representative value for 'typical' countries within an economic zone.

Standard Deviation (std)

Definition:

The standard deviation is a measure of the amount of variation or dispersion in a data set.

Advantages:

    Measure of variability: It provides insight into the spread or distribution of the data. A low standard deviation indicates that data points tend to be closer to the mean, whereas a high standard deviation shows greater variability.
    Insights into volatility: A high standard deviation could indicate potential instability within an economic zone.

Mean (Average)

Definition:

The mean is the sum of all data divided by the number of data points. It serves as a measure of central tendency.

Advantages:

    Summarizes data with a single number: It gives a general idea of the data distribution.
    Basis for other statistical techniques: Many other statistics and parameters are based on the mean.

Percentile

Definition:

The percentile is a statistical measure that indicates the value below which a given percentage of observations falls.

Advantages:

    Fixed proportion representation: You can compare across economic zones using a fixed proportion of countries, e.g., top 10% to identify the best-performing or bottom 10% for the worst-performing.
    Breakdown of data spread: It provides insight into the spread of data beyond central tendencies.

Skewness and Kurtosis

Definition:

Skewness measures the asymmetry of a distribution, while kurtosis measures the "tailedness" of a distribution.

Advantages:

    Insights into data distribution: They provide additional information about the shape of the data distribution, which can highlight outliers or concentration of data.

In context, the median can provide a robust baseline per economic zone but to assess the variability or volatility within the zone, using the standard deviation would be important. Mean, percentile, skewness, and kurtosis could uncover additional valuable information about the performance of the countries within and across zones. Remember, these should be used in combination with appropriate economic measures, such as Purchasing Power Parity (PPP) adjusted GDP, to account for cost of living differences among countries.
r capita, inflation rate, unemployment rate etc., to foster a more comprehensive analysis of the economic zones.
"""

France's colonies gained nominal independence, most in about 1960, but as Charles DeGaulle phrased it, "foreign policy, defence, the currency, economic and financial policy, raw materials, the control of justice, higher education and distant communications will constitute a common domain" 

The CFA franc zone consists of 14 countries in sub-Saharan Africa, each affiliated with one of two monetary unions. Benin, Burkina Faso, Côte D’Ivoire, Guinea-Bissau, Mali, Niger, Senegal, and Togo comprise the West African Economic and Monetary Union, or WAEMU, founded in 1994 to build on the foundation of the West African Monetary Union, founded in 1973. The remaining six countries — Cameroon, Central African Republic, Chad, Republic of Congo, Equatorial Guinea, and Gabon — comprise the Central African Economic and Monetary Union, or CAEMC. 

In [6]:
%env OPENAI_API_KEY=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

env: OPENAI_API_KEY=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


In [7]:
%ai list openai-chat

| Provider | Environment variable | Set? | Models |
|----------|----------------------|------|--------|
| `openai-chat` | `OPENAI_API_KEY` | <abbr title="You have set this environment variable, so you can use this provider's models.">✅</abbr> | `openai-chat:gpt-3.5-turbo`, `openai-chat:gpt-3.5-turbo-16k`, `openai-chat:gpt-3.5-turbo-0301`, `openai-chat:gpt-3.5-turbo-0613`, `openai-chat:gpt-3.5-turbo-16k-0613`, `openai-chat:gpt-4`, `openai-chat:gpt-4-0314`, `openai-chat:gpt-4-0613`, `openai-chat:gpt-4-32k`, `openai-chat:gpt-4-32k-0314`, `openai-chat:gpt-4-32k-0613` |


In [16]:
%%ai openai-chat:gpt-4 -r 
reset the chat history

In [None]:
%%ai openai-chat:gpt-4 -f markdown

In [10]:
all_countries, countries = get_country_mapping()

In [8]:
indicator_abbrv = "GGXCNL_NGDP"

In [13]:
indicators = get_indicators_data()

In [74]:
indicators

{'NGDP_RPCH': {'label': 'Real GDP growth',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at constant prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (October 2023)',
  'unit': 'Annual percent change',
  'dataset': 'WEO'},
 'NGDPD': {'label': 'GDP, current prices',
  'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at current prices of final goods and services produced within a country during a specified time period, such as one year.",
  'source': 'World Economic Outlook (October 2023)',
  'unit': 'Billions of U.S. dollars',
  'dataset': 'WEO'},
 'NGDPDPC': {'label': 'GDP per capita, current prices\n',
  'description': "Gross domestic product is the most commonly used single meas

In [None]:
'': {'label': None,
  'description': None,
  'source': None,
  'unit': None,
  'dataset': None},

In [71]:
test = indicators[""].get("label", "").strip("\n")

In [72]:
test

In [14]:
label = indicators[indicator_abbrv]

In [16]:
label = label["label"].strip("\n")

In [11]:
all_data_dict = get_cfa_and_noncfa_data(indicator_abbrv, countries, all_countries)

In [19]:
df = get_imf_data_df(all_data_dict, label)

In [23]:
pl.Config.set_tbl_rows(-1)

polars.config.Config

In [66]:
df.select(pl.col("GDP per capita, current prices").is_null())

"GDP per capita, current prices"
bool
False
False
False
False
False
False
False
False
False
False


In [49]:
df.select(pl.col("GDP per capita, current prices").is_null())  # nan != null

"GDP per capita, current prices"
bool
False
False
False
False
False
False
False
False
False
False


In [41]:
df.with_columns(pl.all().is_null().count())

Country,Year,"GDP per capita, current prices"
u32,u32,u32
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100
1100,1100,1100


In [30]:
df.select(pl.all().len())

Country,Year,"GDP per capita, current prices"
u32,u32,u32
1100,1100,1100


In [23]:
df = pd.DataFrame(
    {
        "2026": {
            "Benin": 15.957120583638,
            "Burkina Faso": 21.751984021882,
            "Cameroon": 15.276436727498,
            "Central African Republic": 17.813262271024,
            "Chad": 18.122023046887,
            "Congo, Republic of ": 24.767128503001,
            "Côte d'Ivoire": 17.964340458605,
            "Equatorial Guinea": 16.816999590326,
            "Gabon": 17.833734415904,
            "Guinea-Bissau": 16.611017092856,
            "Mali": 22.483689583185,
            "Niger": 19.70280766226,
            "Senegal": 23.281517749846,
            "Togo": 18.326945016511,
        },
        "2027": {
            "Benin": 16.457120583638,
            "Burkina Faso": 22.232385243724,
            "Cameroon": 15.281385158321,
            "Central African Republic": 17.730243113895,
            "Chad": 18.738886780455,
            "Congo, Republic of ": 24.549433383639,
            "Côte d'Ivoire": 17.937017341522,
            "Equatorial Guinea": 16.25586719744,
            "Gabon": 17.732059808203,
            "Guinea-Bissau": 16.928003342113,
            "Mali": 22.707552862806,
            "Niger": 19.752482712501,
            "Senegal": 23.532366053973,
            "Togo": 18.826570848518,
        },
        "2028": {
            "Benin": 16.857120583638,
            "Burkina Faso": 22.596274439084,
            "Cameroon": 15.346160903338,
            "Central African Republic": 17.681195940774,
            "Chad": 17.729688314162,
            "Congo, Republic of ": 24.052521335298,
            "Côte d'Ivoire": 17.956271531288,
            "Equatorial Guinea": 15.636264329902,
            "Gabon": 17.623441793848,
            "Guinea-Bissau": 17.045901243232,
            "Mali": 22.964548671718,
            "Niger": 19.757142068433,
            "Senegal": 23.343598955458,
            "Togo": 19.283118666619,
        },
    }
)

In [100]:
%%ai openai-chat:gpt-4 -f markdown

if i use the median to calculate confidence intervals should I remove outliers from the data? or is it fine to leave the outliers?

# Handling Outliers in Confidence Interval Calculation

When it comes to whether or not outliers should be removed before calculating the confidence intervals, it depends on the specific circumstances and objectives of the analysis. 

## Median and Outliers

One advantage of median over mean is that it's not influenced by outliers. In other words, the median represents the center of your data without considering extreme values.

However, outliers can still influence the calculation of confidence intervals. This is because confidence intervals are based on distribution assumptions, and outliers can violate these assumptions, particularly if they are the result of errors or anomalies.

## Should Outliers Be Removed?

Removing outliers before calculating confidence intervals can sometimes be the right choice, but you should carefully consider why the outliers are present in the first place.

**1. Outliers are Errors:**
If the outliers are clearly erroneous (e.g. negative GDP when that's not possible, or GDP values that are orders of magnitude higher than expected), it may be appropriate to remove them before calculating confidence intervals.

**2. Outliers are Real, but Anomalous:**
If outliers are correct but represent anomalous or non-representative events (e.g. a recession or boom year), it may be appropriate to remove them depending on whether you are interested in the general trend or unusual variations. If analyzing the general trend, such outliers may be removed. 

**3. Outliers are Highly Uncertain:**
If outliers are highly uncertain but might be indicative of a new or infrequent event (e.g. massive economic growth in a previously low-GDP country), it might be important to leave them in. Removing these outliers could lead to an underestimation of the variability or changes in the data.

## Bottom Line

Before deciding to remove outliers, you need to understand why they exist. Removing outliers without good reason can distort the real picture that data provides. At the end of the day, this decision should be based on a balance between the statistical rationale and comprehensive understanding of the nature of the data and the domain specifics.

In [85]:
indicators_imf_dict

{'indicators': {'NGDP_RPCH': {'label': 'Real GDP growth',
   'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at constant prices of final goods and services produced within a country during a specified time period, such as one year.",
   'source': 'World Economic Outlook (October 2023)',
   'unit': 'Annual percent change',
   'dataset': 'WEO'},
  'NGDPD': {'label': 'GDP, current prices',
   'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at current prices of final goods and services produced within a country during a specified time period, such as one year.",
   'source': 'World Economic Outlook (October 2023)',
   'unit': 'Billions of U.S. dollars',
   'dataset': 'WEO'},
  'NGDPDPC': {'label': 'GDP per capita, current prices\n',
   'description': "Gross domestic product is the most

In [101]:
indicator_abbrv = "NGDPDPC"

In [102]:
all_countries = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/countries"
)
countries = {v["label"]: k for k, v in all_countries["countries"].items()}
indicators_imf_dict = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/indicators"
)

indicator_label = indicators_imf_dict["indicators"][indicator_abbrv]["label"]
unit = indicators_imf_dict["indicators"][indicator_abbrv]["unit"]
description = indicators_imf_dict["indicators"][indicator_abbrv]["description"]

chunk_1_data_non_cfa = rename_from_abbr_to_full_name(
    get_all_metric_data(NON_CFA_CHUNK_1, indicator_abbrv, countries),
    all_countries,
)  # believe theres a limit to api payload
chunk_2_data_non_cfa = rename_from_abbr_to_full_name(
    get_all_metric_data(NON_CFA_CHUNK_2, indicator_abbrv, countries),
    all_countries,
)
chunk_1_data_non_cfa.update(chunk_2_data_non_cfa)
non_cfa_df = pd.DataFrame.from_dict(chunk_1_data_non_cfa).T
non_cfa_df = remove_outliers(non_cfa_df, find_outliers_IQR(non_cfa_df))
cfa_data = rename_from_abbr_to_full_name(
    get_all_metric_data(CFA_FRANC_ZONE, indicator_abbrv, countries),
    all_countries,
)
cfa_df = pd.DataFrame.from_dict(cfa_data).T

In [103]:
cfa_df = remove_outliers(cfa_df, find_outliers_IQR(cfa_df))
non_cfa_df = remove_outliers(non_cfa_df, find_outliers_IQR(non_cfa_df))

In [66]:
import numpy as np
import scipy.stats as stats
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

In [186]:
medians = cfa_df.T.median(axis=1)
confidence_intervals = cfa_df.T.apply(
    lambda x: stats.t.interval(
        confidence=0.95,
        df=len(x) - 1,
        loc=np.nanmedian(x),
        scale=stats.sem(x, nan_policy="omit"),
    ),
    axis=1,
)
df_plot = pd.DataFrame(
    {
        "year": medians.index,
        "median": medians.values,
        "lower": [ci[0] for ci in confidence_intervals.values],
        "upper": [ci[1] for ci in confidence_intervals.values],
    }
)
df_plot["year"] = pd.to_datetime(df_plot["year"], format="%Y")

AttributeError: 'DataFrame' object has no attribute 'T'

In [105]:
median_non_cfa = non_cfa_df.T.median(axis=1)
confidence_intervals_non_cfa = non_cfa_df.T.apply(
    lambda x: stats.t.interval(
        confidence=0.95,
        df=len(x) - 1,
        loc=np.nanmedian(x),
        scale=stats.sem(x, nan_policy="omit"),
    ),
    axis=1,
)
df_plot_non_cfa = pd.DataFrame(
    {
        "year": median_non_cfa.index,
        "median": median_non_cfa.values,
        "lower": [ci[0] for ci in confidence_intervals_non_cfa.values],
        "upper": [ci[1] for ci in confidence_intervals_non_cfa.values],
    }
)
df_plot_non_cfa["year"] = pd.to_datetime(df_plot_non_cfa["year"], format="%Y")

In [106]:
# This graph is nonsense
# Convert DataFrame to ColumnDataSource for Bokeh
source = ColumnDataSource(df_plot)
source_non_cfa = ColumnDataSource(df_plot_non_cfa)
# Create a new figure
p = figure(width=800, height=250, x_axis_type="datetime")

# Add confidence interval to the figure as an area
p.line("year", "median", source=source, color="#D55E00", line_width=2)
p.varea(
    x="year", y1="lower", y2="upper", source=source, alpha=0.3, fill_color="#D55E00"
)
# Add median to the figure as a line


p.line("year", "median", source=source_non_cfa, color="#0072B2", line_width=2)
p.varea(
    x="year",
    y1="lower",
    y2="upper",
    source=source_non_cfa,
    alpha=0.3,
    fill_color="#0072B2",
)
# Add median to the figure as a line

show(p)

In [99]:
# Convert DataFrame to ColumnDataSource for Bokeh
source = ColumnDataSource(df_plot)
source_non_cfa = ColumnDataSource(df_plot_non_cfa)
# Create a new figure
p = figure(width=800, height=250, x_axis_type="datetime")

# Add confidence interval to the figure as an area
p.line("year", "median", source=source, color="#D55E00", line_width=2)
p.varea(
    x="year", y1="lower", y2="upper", source=source, alpha=0.3, fill_color="#D55E00"
)
# Add median to the figure as a line


p.line("year", "median", source=source_non_cfa, color="#0072B2", line_width=2)
p.varea(
    x="year",
    y1="lower",
    y2="upper",
    source=source_non_cfa,
    alpha=0.3,
    fill_color="#0072B2",
)
# Add median to the figure as a line

show(p)

In [None]:
medians = df.groupby("year")["gdp"].median()
confidence_intervals = df.groupby("year")["gdp"].apply(
    lambda x: stats.t.interval(
        alpha=0.95, df=len(x) - 1, loc=np.median(x), scale=stats.sem(x)
    )
)

In [3]:
median_cfa_df = remove_future_years(generate_median_df(cfa_df))
median_non_cfa_df = remove_future_years(generate_median_df(non_cfa_df))
merged_df = pd.merge(median_cfa_df, median_non_cfa_df, how="inner", on="year")
merged_df.rename(
    columns={"median_x": "median_cfa", "median_y": "median_non_cfa"}, inplace=True
)
p = generate_graph(merged_df, indicator_label, unit)
show(p)
# intervals_where_median_is_higher, years = analyze_medians(
#     merged_df
# )

In [None]:
requests.get("https://www.imf.org/external/datamapper/api/v1/countries").status_code

In [None]:
indicators_imf_dict

In [None]:
bokeh_pane

In [None]:
bokeh_pane = pn.Column(
    pn.pane.Markdown(
        f"""# Since the 1980s, {indicator_label} comparison between CFA African Franc Zone Countries and Non CFA African Franc Zone Countries""",
        styles={"color": "white"},
    ),
    pn.pane.Bokeh(p),
)

In [None]:
from IPython.display import display

In [None]:
display(bokeh_pane)
test = "a"

In [None]:
rev = merged_df.copy()

In [None]:
ggr = merged_df.copy()

In [None]:
pd.merge(rev, ggr, on="year", how="outer")

In [None]:
ggr

In [None]:
indicators_imf_dict = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/indicators"
)

In [None]:
indicators_imf_dict["indicators"]["rev"]

In [None]:
indicators_imf_dict["indicators"]["GGR_G01_GDP_PT"]

In [None]:
indicator_label = "Real GDP growth"
unit = "Annual percent change"

In [None]:
cfa_dict = {
    "year": {
        0: 1979,
        1: 1981,
        2: 1982,
        3: 1983,
        4: 1984,
        5: 1985,
        6: 1986,
        7: 1987,
        8: 1988,
        9: 1989,
        10: 1990,
        11: 1991,
        12: 1992,
        13: 1993,
        14: 1994,
        15: 1995,
        16: 1996,
        17: 1997,
        18: 1998,
        19: 1999,
        20: 2000,
        21: 2001,
        22: 2002,
        23: 2003,
        24: 2004,
        25: 2005,
        26: 2006,
        27: 2007,
        28: 2008,
        29: 2009,
        30: 2010,
        31: 2011,
        32: 2012,
        33: 2013,
        34: 2014,
        35: 2015,
        36: 2016,
        37: 2017,
        38: 2018,
        39: 2019,
        40: 2020,
        41: 2021,
        42: 2022,
        43: 2023,
    },
    "median": {
        0: 4.0,
        1: 2.25,
        2: 2.2,
        3: -1.6,
        4: 3.05,
        5: 4.3,
        6: 4.05,
        7: 0.05,
        8: 3.05,
        9: 2.95,
        10: 1.95,
        11: 3.0,
        12: 0.7,
        13: 1.3,
        14: 2.5,
        15: 4.5,
        16: 3.6500000000000004,
        17: 5.5,
        18: 4.9,
        19: 3.5,
        20: 0.5,
        21: 4.55,
        22: 3.75,
        23: 4.2,
        24: 1.4,
        25: 5.4,
        26: 3.7,
        27: 3.4,
        28: 4.25,
        29: 2.8,
        30: 5.5,
        31: 3.35,
        32: 5.8,
        33: 5.15,
        34: 6.0,
        35: 4.35,
        36: 5.0,
        37: 4.65,
        38: 4.35,
        39: 4.55,
        40: 1.15,
        41: 3.35,
        42: 3.75,
        43: 4.1,
    },
}


non_cf_dict = {
    "year": {
        0: 1980,
        1: 1981,
        2: 1982,
        3: 1983,
        4: 1984,
        5: 1985,
        6: 1986,
        7: 1987,
        8: 1988,
        9: 1989,
        10: 1990,
        11: 1991,
        12: 1992,
        13: 1993,
        14: 1994,
        15: 1995,
        16: 1996,
        17: 1997,
        18: 1998,
        19: 1999,
        20: 2000,
        21: 2001,
        22: 2002,
        23: 2003,
        24: 2004,
        25: 2005,
        26: 2006,
        27: 2007,
        28: 2008,
        29: 2009,
        30: 2010,
        31: 2011,
        32: 2012,
        33: 2013,
        34: 2014,
        35: 2015,
        36: 2016,
        37: 2017,
        38: 2018,
        39: 2019,
        40: 2020,
        41: 2021,
        42: 2022,
        43: 2023,
    },
    "median": {
        0: 2.4,
        1: 2.5,
        2: 1.8,
        3: 1.6,
        4: 3.8,
        5: 4.1,
        6: 2.9,
        7: 4.0,
        8: 5.1,
        9: 4.0,
        10: 3.6,
        11: 2.1,
        12: 1.8,
        13: 2.65,
        14: 2.75,
        15: 3.9499999999999997,
        16: 4.65,
        17: 3.75,
        18: 3.15,
        19: 3.2,
        20: 3.8,
        21: 3.75,
        22: 3.5,
        23: 5.25,
        24: 4.85,
        25: 5.7,
        26: 5.699999999999999,
        27: 6.0,
        28: 5.5,
        29: 3.25,
        30: 5.199999999999999,
        31: 5.1,
        32: 4.45,
        33: 4.6,
        34: 4.25,
        35: 3.1500000000000004,
        36: 3.25,
        37: 3.9,
        38: 4.0,
        39: 2.95,
        40: -2.6,
        41: 4.6,
        42: 4.0,
        43: 3.8,
    },
}

In [None]:
med_cfa = pd.DataFrame.from_dict(cfa_dict)
med_non_cfa = pd.DataFrame.from_dict(non_cf_dict)

In [None]:
merge = pd.merge(med_cfa, med_non_cfa, how="inner", on="year")

In [None]:
merge.rename(
    columns={"median_x": "median_cfa", "median_y": "median_non_cfa"}, inplace=True
)

In [None]:
p = generate_graph(merge, indicator_label, unit)
show(p)

In [None]:
p = generate_graph(merge, indicator_label, unit)
show(p)

In [None]:
indicators_imf_dict = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/indicators"
)

In [None]:
indicators_imf_dict

In [116]:
cfa_data = rename_from_abbr_to_full_name(
    get_all_metric_data(CFA_FRANC_ZONE, indicator_abbrv, countries),
    all_countries,
)
cfa_df = pd.DataFrame.from_dict(cfa_data)

In [121]:
cfa_df.index.name = "year"

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdf6b6d59a0>

In [133]:
cfa_df.columns.name = "test_count"

In [136]:
# find the quartiles and IQR for each category
groups = cfa_df.groupby("year")
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.5)
q3 = groups.quantile(q=0.75)
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

# find the outliers for each category
# def outliers(group):
#     cat = group.name
#     return group[(group.score > upper.loc[cat]['score']) | (group.score < lower.loc[cat]['score'])]['score']
# out = groups.apply(outliers).dropna()

# # prepare outlier data for plotting, we need coordinates for every outlier.
# if not out.empty:
#     outx = list(out.index.get_level_values(0))
#     outy = list(out.values)

p = figure(
    tools="",
    background_fill_color="#efefef",
    x_range=test_count,
    toolbar_location=None,
    source=ColumnDataSource(cfa_df),
)

# if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)
upper.score = [min([x, y]) for (x, y) in zip(list(qmax.loc[:, "score"]), upper.score)]
lower.score = [max([x, y]) for (x, y) in zip(list(qmin.loc[:, "score"]), lower.score)]

# stems
p.segment(cats, upper.score, cats, q3.score, line_color="black")
p.segment(cats, lower.score, cats, q1.score, line_color="black")

# boxes
p.vbar(cats, 0.7, q2.score, q3.score, fill_color="#E08E79", line_color="black")
p.vbar(cats, 0.7, q1.score, q2.score, fill_color="#3B8686", line_color="black")

# whiskers (almost-0 height rects simpler than segments)
p.rect(cats, lower.score, 0.2, 0.01, line_color="black")
p.rect(cats, upper.score, 0.2, 0.01, line_color="black")

# outliers
if not out.empty:
    p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.grid.grid_line_width = 2
p.xaxis.major_label_text_font_size = "16px"

show(p)

ValueError: failed to validate FigureOptions(...).x_range: expected an element of either Instance(Range), Either(Tuple(Float, Float), Tuple(Datetime, Datetime), Tuple(TimeDelta, TimeDelta)), Seq(String), Object(Series) or Object(GroupBy), got 'test_count'

In [7]:
all_countries = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/countries"
)
countries = {v["label"]: k for k, v in all_countries["countries"].items()}

In [4]:
indicators_imf_dict = get_data_from_imf(
    "https://www.imf.org/external/datamapper/api/v1/indicators"
)

In [8]:
indicators_imf_dict

{'indicators': {'NGDP_RPCH': {'label': 'Real GDP growth',
   'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at constant prices of final goods and services produced within a country during a specified time period, such as one year.",
   'source': 'World Economic Outlook (October 2023)',
   'unit': 'Annual percent change',
   'dataset': 'WEO'},
  'NGDPD': {'label': 'GDP, current prices',
   'description': "Gross domestic product is the most commonly used single measure of a country's overall economic activity. It represents the total value at current prices of final goods and services produced within a country during a specified time period, such as one year.",
   'source': 'World Economic Outlook (October 2023)',
   'unit': 'Billions of U.S. dollars',
   'dataset': 'WEO'},
  'NGDPDPC': {'label': 'GDP per capita, current prices\n',
   'description': "Gross domestic product is the most

In [40]:
import polars as pl
import polars.selectors as cs

In [11]:
chunk_1_data_non_cfa = rename_from_abbr_to_full_name(
    get_all_metric_data(NON_CFA_CHUNK_1, "NGDP_RPCH", countries), all_countries
)

In [12]:
chunk_2_data_non_cfa = rename_from_abbr_to_full_name(
    get_all_metric_data(NON_CFA_CHUNK_2, "NGDP_RPCH", countries),
    all_countries,
)

In [13]:
chunk_1_data_non_cfa.update(chunk_2_data_non_cfa)

In [129]:
chunk_1_data_non_cfa

{'Burundi': {'1980': -6.8,
  '1981': 12.2,
  '1982': -1.1,
  '1983': 3.7,
  '1984': 0.2,
  '1985': 11.8,
  '1986': 3.2,
  '1987': 5.5,
  '1988': 5,
  '1989': 1.3,
  '1990': 3.5,
  '1991': 5.8,
  '1992': 1,
  '1993': -6.2,
  '1994': -3.8,
  '1995': -7.9,
  '1996': -8,
  '1997': 0.4,
  '1998': 4.8,
  '1999': 1.2,
  '2000': 1.8,
  '2001': 1.7,
  '2002': 2.4,
  '2003': 2.5,
  '2004': 3.8,
  '2005': 4.4,
  '2006': 5.4,
  '2007': 3.5,
  '2008': 4.9,
  '2009': 3.8,
  '2010': 5.1,
  '2011': 4,
  '2012': 4.4,
  '2013': 4.9,
  '2014': 4.2,
  '2015': -3.9,
  '2016': -0.6,
  '2017': 0.5,
  '2018': 1.6,
  '2019': 1.8,
  '2020': 0.3,
  '2021': 3.1,
  '2022': 1.8,
  '2023': 3.3,
  '2024': 6,
  '2025': 5.9,
  '2026': 5.7,
  '2027': 5.9,
  '2028': 5.5},
 'Botswana': {'1980': 12,
  '1981': 8.2,
  '1982': 15.9,
  '1983': 10.8,
  '1984': 6.5,
  '1985': 7.7,
  '1986': 8.6,
  '1987': 14.9,
  '1988': 23.4,
  '1989': 4.7,
  '1990': 8.8,
  '1991': 6.2,
  '1992': -0.2,
  '1993': 4,
  '1994': -0.8,
  '1995': 8.5

In [171]:
noncfa_df = pl.from_dicts(
    data=[
        {"Country": country, **chunk_1_data_non_cfa[country]}
        for country in chunk_1_data_non_cfa
    ],
    schema=[
        "Country",
        "1980",
        "1981",
        "1982",
        "1983",
        "1984",
        "1985",
        "1986",
        "1987",
        "1988",
        "1989",
        "1990",
        "1991",
        "1992",
        "1993",
        "1994",
        "1995",
        "1996",
        "1997",
        "1998",
        "1999",
        "2000",
        "2001",
        "2002",
        "2003",
        "2004",
        "2005",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010",
        "2011",
        "2012",
        "2013",
        "2014",
        "2015",
        "2016",
        "2017",
        "2018",
        "2019",
        "2020",
        "2021",
        "2022",
        "2023",
    ],
).melt(
    id_vars="Country",
    value_vars=cs.numeric(),
    variable_name="Year",
    value_name="GDP",
)

In [175]:
import math

In [182]:
df = pl.DataFrame(
    {
        "a": [1, 8, 3, math.nan],
        "b": [4, 5, 2, math.nan],
        "c": ["foo", "bar", "foo", "d"],
    }
)
df.select(pl.col("a").drop_nans().median())

a
f64
3.0


In [183]:
noncfa_median = (
    noncfa_df.group_by(by="Year", maintain_order=True)
    .drop_nans()
    .median()
    .drop("Country")
)

In [107]:
noncfa_median

<polars.dataframe.group_by.GroupBy at 0x7fdd5bee0f40>

In [108]:
cfa_data = rename_from_abbr_to_full_name(
    get_all_metric_data(CFA_FRANC_ZONE, "NGDP_RPCH", countries),
    all_countries,
)
cfa_df = pl.from_dicts(
    data=[{"Country": country, **cfa_data[country]} for country in cfa_data],
    schema=[
        "Country",
        "1980",
        "1981",
        "1982",
        "1983",
        "1984",
        "1985",
        "1986",
        "1987",
        "1988",
        "1989",
        "1990",
        "1991",
        "1992",
        "1993",
        "1994",
        "1995",
        "1996",
        "1997",
        "1998",
        "1999",
        "2000",
        "2001",
        "2002",
        "2003",
        "2004",
        "2005",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010",
        "2011",
        "2012",
        "2013",
        "2014",
        "2015",
        "2016",
        "2017",
        "2018",
        "2019",
        "2020",
        "2021",
        "2022",
        "2023",
    ],
).melt(
    id_vars="Country",
    value_vars=cs.numeric(),
    variable_name="Year",
    value_name="GDP",
)

In [109]:
cfa_median = cfa_df.group_by(by="Year", maintain_order=True).median().drop("Country")

In [130]:
cfa_median["Year"]

Year
str
"""1980"""
"""1981"""
"""1982"""
"""1983"""
"""1984"""
"""1985"""
"""1986"""
"""1987"""
"""1988"""
"""1989"""


In [119]:
merge_df = noncfa_median.join(cfa_median, on="Year")

In [120]:
merge_df.columns = ["Year", "noncfa_median", "cfa_median"]

In [121]:
merge_df

Year,noncfa_median,cfa_median
str,f64,f64
"""1980""",2.4,4.0
"""1981""",2.5,2.25
"""1982""",1.8,2.2
"""1983""",1.6,-1.6
"""1984""",3.8,3.05
"""1985""",4.1,4.3
"""1986""",2.9,4.05
"""1987""",4.0,0.05
"""1988""",5.1,3.05
"""1989""",4.0,2.95


In [159]:
len(merge_df)

44

In [163]:
test = merge_df.select(
    (pl.col("noncfa_median") > pl.col("cfa_median"))
    .sum()
    .alias("number_of_times_non_cfa_greater"),
    (pl.col("cfa_median") > pl.col("noncfa_median"))
    .sum()
    .alias("number_of_times_cfa_greater"),
)

In [170]:
test["number_of_times_non_cfa_greater"][0]

22

In [122]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, curdoc
from bokeh.palettes import HighContrast
from bokeh.models import (
    Legend,
    HoverTool,
    PrintfTickFormatter,
    ColumnDataSource,
    Range1d,
    Title,
    BasicTickFormatter,
)

In [139]:
p = figure(
    x_axis_label="Year",
    y_axis_label="GDP",
    width=800,
    height=400,
    toolbar_location=None,
)
p.line(
    x="Year",
    y="noncfa_median",
    color="#D55E00",
    line_width=2,
    legend_label="Non-CFA",
    source=ColumnDataSource(merge_df.to_dict(as_series=False)),
    line_alpha=0.7,
)
p.line(
    x="Year",
    y="cfa_median",
    color="#0072B2",
    line_width=2,
    legend_label="CFA",
    source=ColumnDataSource(merge_df.to_dict(as_series=False)),
    line_alpha=0.7,
)
for legend in p.legend:
    p.add_layout(legend, "right")

hover = HoverTool(
    tooltips=[
        ("Year", "@Year"),
        (f"Median (Non-CFA)", "@noncfa_median{0.00}"),
        (f"Median (CFA)", "@cfa_median{0.00}"),
    ]
)
p.add_tools(hover)

p.add_layout(
    Title(
        text="CFA African Countries vs. Non-CFA African Countries\n\n",
        text_font_size="12pt",
        text_align="center",
        align="center",
        text_font_style="normal",
    ),
    "above",
)
p.add_layout(
    Title(
        text=f"Median ",
        text_font_size="18pt",
        text_align="center",
        align="center",
    ),
    "above",
)
p.title.offset = 200
p.title.align = "center"

# p.xgrid.grid_line_color = "#DDDDDD"
# p.ygrid.grid_line_color = "#DDDDDD"

p.axis.minor_tick_line_color = None  # turn off x-axis minor ticks
# p.axis.major_tick_line_alpha =  0.1
p.axis.major_tick_line_color = "#AAAAAA"
p.axis.major_tick_line_dash = "dashed"

p.axis.axis_label_text_font_size = "12pt"
p.axis.axis_label_text_font_style = "bold"
p.axis.major_label_text_font_size = "12px"
p.axis.axis_label_standoff = 20
p.xaxis.major_label_orientation = 1.0

p.legend.border_line_color = None
p.legend.border_line_alpha = 0
p.legend.click_policy = "hide"

p.min_border = 100

In [140]:
show(p)