In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from dataprep.clean import clean_country
import dask.dataframe as dd

In [202]:
# Location of the IHME GBD risk export (machine-specific absolute path).
csv_file_path = "/home/amir/big_storage/datasets/IHME-GDB-Risk-all.csv"

# The whole CSV is read eagerly with pandas; dataprep then adds a
# `location_clean` column holding ISO alpha-3 codes. Locations it cannot
# parse (e.g. aggregates such as "Global") end up as NaN in that column.
risk_df = clean_country(
    pd.read_csv(csv_file_path),
    "location",
    output_format="alpha-3",
)

  0%|          | 0/91 [00:00<?, ?it/s]

Country Cleaning Report:
	2741220 values cleaned (99.37%)
	17460 values unable to be parsed (0.63%), set to NaN
Result contains 2741220 (99.37%) values in the correct format and 17460 null values (0.63%)


In [203]:
# Cancer causes kept together so the selection below reads by category.
_cancer_causes = [
    "Tracheal, bronchus, and lung cancer",
    "Stomach cancer",
    "Pancreatic cancer",
    "Leukemia",
    "Esophageal cancer",
    "Larynx cancer",
    "Colon and rectum cancer",
    "Liver cancer",
]

# Causes of death kept for the analysis (matched against the `cause` column).
cause_list = [
    "Depressive disorders",
    "Cardiovascular diseases",
    "Chronic respiratory diseases",
    "Neurological disorders",
    *_cancer_causes,
    "Other non-communicable diseases",
]

# Alternative selection: the dataset's aggregate row only.
# cause_list = [
#     "All causes"
# ]


In [204]:

# Keep only death *rates* for both sexes combined, restricted to the causes
# in `cause_list`, ordered chronologically with a fresh 0..n-1 index.
cause_df = (
    risk_df.loc[
        lambda frame: frame["cause"].isin(cause_list)
        & frame["metric"].eq("Rate")
        & frame["measure"].eq("Deaths")
        & frame["sex"].eq("Both")
        # & frame["year"].ge(2000)
    ]
    .sort_values("year")
    .reset_index(drop=True)
)

# World-wide aggregate rows; matched on the raw `location` column.
global_df = cause_df.loc[cause_df["location"] == "Global"]
global_df.head()

Unnamed: 0,measure,location,sex,age,cause,rei,metric,year,val,upper,lower,location_clean
1389,Deaths,Global,Both,All Ages,Neurological disorders,All risk factors,Rate,1990,3.153837,9.772785,0.630572,
1530,Deaths,Global,Both,All Ages,"Tracheal, bronchus, and lung cancer",Particulate matter pollution,Rate,1990,4.111372,5.208312,3.079991,
1531,Deaths,Global,Both,All Ages,"Tracheal, bronchus, and lung cancer",All risk factors,Rate,1990,16.758972,17.720711,15.945836,
1532,Deaths,Global,Both,All Ages,Pancreatic cancer,All risk factors,Rate,1990,1.291501,1.548592,1.081802,
1534,Deaths,Global,Both,All Ages,Liver cancer,All risk factors,Rate,1990,2.883647,3.474875,2.350907,


In [205]:
def get_all_vs_air(df, cause=None, risk_factors=("All risk factors", "Air pollution")):
    """Sum `val` per year for each requested risk factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns `year`, `cause`, `rei` and `val`.
    cause : str, optional
        When truthy, only rows whose `cause` equals this value are used.
        When falsy (the default `None`), no cause filter is applied and the
        values of every cause present in `df` are summed together.
    risk_factors : iterable of str, optional
        `rei` labels to aggregate, in output order. Defaults to
        ("All risk factors", "Air pollution").

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry in `risk_factors` (same order), each with the
        columns `year` and `val`, where `val` is the per-year sum.
    """
    # The cause filter does not depend on the risk factor, so apply it once.
    if cause:
        df = df[df.cause == cause]

    return [
        df[df.rei == risk_factor]
        .groupby("year")
        .agg(val=("val", "sum"))
        .reset_index()
        for risk_factor in risk_factors
    ]


In [206]:
# One entry per subplot, top to bottom. `None` means "no cause filter":
# every cause present in the frame is aggregated together.
specific_cases = [None, "Chronic respiratory diseases", "Cardiovascular diseases"]

In [212]:
# fig = px.line(air_global_df, x="year", y="val")
# Create traces, one subplot row per entry in specific_cases
from plotly.subplots import make_subplots
def draw_plot(df, where):
    """Plot yearly death rates for all risk factors vs. air pollution.

    Draws one subplot row per entry in the module-level `specific_cases`
    list. Each row shows two lines built from `get_all_vs_air(df, cause)`:
    the "All risk factors" series and the "Air pollution" series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot; passed unchanged to `get_all_vs_air`.
    where : str
        Label for the plotted location, appended to the figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure (not shown explicitly; a notebook renders it
        when it is the last expression of a cell).
    """
    # (legend label, colour) in the order get_all_vs_air returns its frames.
    # The colours are the first two entries of plotly's default colourway, so
    # the first row looks exactly as it did with auto-assigned colours.
    trace_styles = [
        ("All Risk Factor", "#636EFA"),
        ("Air Pollution", "#EF553B"),
    ]

    fig = make_subplots(
        # Derive the row count from the cases so adding a case cannot
        # overflow the subplot grid.
        rows=len(specific_cases),
        cols=1,
        shared_xaxes=True,
        x_title="year",
        y_title="Rate (per 100,000)",
        # A falsy case (None) applies no cause filter, hence "All Causes".
        subplot_titles=[case if case else "All Causes" for case in specific_cases],
    )

    for row, cause in enumerate(specific_cases, start=1):
        yearly_frames = get_all_vs_air(df, cause)

        for (label, color), yearly in zip(trace_styles, yearly_frames):
            fig.add_trace(
                go.Scatter(
                    x=yearly.year,
                    y=yearly.val,
                    mode="lines+markers",
                    name=label,
                    # Group the same risk factor across rows and list it in
                    # the legend only once instead of once per subplot.
                    legendgroup=label,
                    showlegend=(row == 1),
                    line=dict(color=color),
                    marker=dict(color=color),
                ),
                row=row,
                col=1,
            )

    fig.update_layout(
        title=f"Compare Death rate for <b>All Risk Factor</b> vs <b>Air Pollution</b> over the years 1990 to 2019<br>{where}",
        legend_title="Risk Factor",
        hovermode="x",
    )
    return fig


In [213]:
# World-wide aggregate; the returned figure is rendered as the cell output.
draw_plot(global_df, where="Global")

In [214]:
# Country subsets are selected through the ISO alpha-3 code in `location_clean`.
china_df = cause_df.loc[cause_df["location_clean"] == "CHN"]
draw_plot(china_df, "China")

In [216]:

# Iran subset, selected through the ISO alpha-3 code in `location_clean`.
iran_df = cause_df.loc[cause_df["location_clean"] == "IRN"]
draw_plot(iran_df, where="Iran")