In [1]:
import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly_express as px
import statsmodels.api as sm
import statsmodels.stats as stats

from patsy import dmatrix
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats.distributions import t

pd.set_option("display.max_rows", None)

In [2]:
df = pd.read_csv(
    "./data/covid_01_22_2020_to_05_27_2020.csv",
    encoding="utf-8",
)

In [3]:
df["date"] = pd.to_datetime(df["date"], dayfirst=True)

In [4]:
df.columns

Index(['region', 'subregion', 'country', 'area', 'country_area', 'date',
       'confirmed_cases', 'deaths', 'recovered', 'population', 'lat', 'long',
       'lat_long_flag', 'pop_flag', 'active', 'confirmed_cases_rate',
       'deaths_rate', 'recovered_rate', 'active_rate', 'C_xs_T', 'devt_time'],
      dtype='object')

In [5]:
dff = df.loc[
    (df["devt_time"] >= 0),
    [
        "region",
        "subregion",
        "country",
        "country_area",
        "confirmed_cases",
        "population",
        "date",
        "devt_time",
    ],
].copy(deep=True)

dff.sort_values(["country_area", "devt_time"], inplace=True)

dff["data"] = "Observed"

In [6]:
df.loc[df["devt_time"] == -99].groupby("country_area")["confirmed_cases"].max()

country_area
American Samoa           0
Angola                  71
Australia - Other        0
Burma                  206
Burundi                 42
Canada - Other           0
China - Guizhou        147
China - Hebei          328
China - Jilin           65
China - Liaoning       149
China - Qinghai         18
China - Tibet            1
China - Xinjiang        76
China - Yunnan         185
Laos                    19
Lesotho                  2
Nepal                    1
Papua New Guinea         8
US - Other           11750
Viet Nam               327
Name: confirmed_cases, dtype: int64

In [7]:
grpby_dff = dff.groupby(["country_area"]).max()

final_date = np.datetime64("2020-07-01")

for index, row in grpby_dff.iterrows():
    rpt = (final_date - row["date"]) / np.timedelta64(1, "D")

    cty = np.repeat(index, rpt)

    region = np.repeat(row["region"], rpt)

    subregion = np.repeat(row["subregion"], rpt)

    country = np.repeat(row["country"], rpt)

    pop = np.repeat(row["population"], rpt)

    dte = np.arange(
        row["date"] + np.timedelta64(1, "D"),
        final_date + +np.timedelta64(1, "D"),
        dtype="datetime64[D]",
    )

    dev = np.arange(row["devt_time"] + 1, row["devt_time"] + rpt + 1)

    data = np.repeat("Estimate", rpt)

    d = pd.DataFrame(
        {
            "country_area": cty,
            "population": pop,
            "date": dte,
            "devt_time": dev,
            "data": data,
            "subregion": subregion,
            "region": region,
            "country": country,
        }
    )

    dff = pd.concat([dff, d])

In [8]:
cut_off = 30

dff["time_1"] = dff["devt_time"].clip(upper=cut_off)
dff["time_2"] = dff["devt_time"].apply(lambda x: np.maximum(0, x - cut_off))
dff["weight"] = dff.groupby(["country_area"])["confirmed_cases"].shift(
    periods=1, fill_value=0
)
dff["link_ratio"] = dff.apply(
    lambda row: row["confirmed_cases"] / row["weight"] if row["weight"] > 0 else None,
    axis=1,
)

In [9]:
country_list = dff["country_area"].unique().tolist()

plot_data_EDA = dff[dff["link_ratio"] > 1].copy(deep=True)

plot_data_EDA["log of factor"] = np.log(plot_data_EDA["link_ratio"] - 1)

dd_country = widgets.Dropdown(options=country_list)

output_country = widgets.Output()


def dd_country_eventhandler(change):
    output_country.clear_output()

    data = plot_data_EDA[plot_data_EDA["country_area"] == change.new]

    fig = px.scatter(
        data,
        x="devt_time",
        y="log of factor",
        trendline="ols",
        title=change.new,
        template="plotly_white",
        opacity=0.5,
        color="country_area",
        color_discrete_sequence=px.colors.qualitative.D3,
    )

    fig.update_layout(showlegend=False)

    fig.data[1].line = {"dash": "dot", "color": "grey"}

    with output_country:
        display(fig)


dd_country.observe(dd_country_eventhandler, names="value")

display(dd_country)

Dropdown(options=('Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Anguilla', 'Antigua and Barbuda', 'Argentin…

In [10]:
grpby_dff["avg_cases"] = grpby_dff["confirmed_cases"] / (grpby_dff["devt_time"] + 1)

grpby_dff["country_grp"] = grpby_dff.index.where(
    (grpby_dff["avg_cases"] > 50) & (grpby_dff["devt_time"] > 10), "Others"
)

grpby_dff.loc[
    (grpby_dff["country_grp"] == "Others") & (grpby_dff["country"] == "United States"),
    "country_grp",
] = "RoUS"

grpby_dff.loc[
    (grpby_dff["country_grp"] == "Others") & (grpby_dff["country"] == "China"),
    "country_grp",
] = "RoChina"

grpby_dff.loc[
    (grpby_dff["country_grp"] == "Others") & (grpby_dff["region"] == "Europe"),
    "country_grp",
] = "RoEurope"

grpby_dff.loc[(grpby_dff["country"] == "Canada"), "country_grp"] = "Canada"

In [11]:
grpby_dff.sort_values(["avg_cases"], ascending=False).head(20)

Unnamed: 0_level_0,region,subregion,country,confirmed_cases,population,date,devt_time,data,avg_cases,country_grp
country_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Brazil,Latin America And The Caribbean,South America,Brazil,411821,212.559409,2020-05-27,66,Observed,6146.58209,Brazil
Russia,Europe,Eastern Europe,Russia,370680,145.93446,2020-05-27,62,Observed,5883.809524,Russia
US - New York,Northern America,Northern America,United States,364965,19.613675,2020-05-27,80,Observed,4505.740741,US - New York
Spain,Europe,Southern Europe,Spain,284259,46.754783,2020-05-27,83,Observed,3384.035714,Spain
UK,Europe,Northern Europe,United Kingdom,267240,67.886004,2020-05-27,78,Observed,3382.78481,UK
India,Central And Southern Asia,Southern Asia,India,158086,1380.004385,2020-05-27,47,Observed,3293.458333,India
Italy,Europe,Southern Europe,Italy,231818,60.496575,2020-05-27,92,Observed,2492.666667,Italy
Turkey,Northern Africa And Western Asia,Western Asia,Turkey,159797,84.339067,2020-05-27,67,Observed,2349.955882,Turkey
Germany,Europe,Western Europe,Germany,181524,83.783945,2020-05-27,83,Observed,2161.0,Germany
US - New Jersey,Northern America,Northern America,United States,156628,8.955296,2020-05-27,74,Observed,2088.373333,US - New Jersey


In [12]:
mapping = grpby_dff["country_grp"]

dff["country_grp"] = dff["country_area"].map(mapping)

In [13]:
grpby_dff["tail_grp"] = grpby_dff["country"]

grpby_dff.loc[
    grpby_dff["country"].isin(["China", "South Korea"]), "tail_grp"
] = "tail grp 01"

grpby_dff.loc[
    ~grpby_dff["country"].isin(["China", "South Korea"]), "tail_grp"
] = "tail grp 02"

grpby_dff.loc[grpby_dff["country"].isin(["China", "South Korea"])]

Unnamed: 0_level_0,region,subregion,country,confirmed_cases,population,date,devt_time,data,avg_cases,country_grp,tail_grp
country_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
China - Anhui,Eastern And South-Eastern Asia,Eastern Asia,China,991,64.872245,2020-05-27,115,Observed,8.543103,RoChina,tail grp 01
China - Beijing,Eastern And South-Eastern Asia,Eastern Asia,China,593,22.51601,2020-05-27,118,Observed,4.983193,RoChina,tail grp 01
China - Chongqing,Eastern And South-Eastern Asia,Eastern Asia,China,579,31.891631,2020-05-27,118,Observed,4.865546,RoChina,tail grp 01
China - Fujian,Eastern And South-Eastern Asia,Eastern Asia,China,358,40.562007,2020-05-27,112,Observed,3.168142,RoChina,tail grp 01
China - Gansu,Eastern And South-Eastern Asia,Eastern Asia,China,139,27.234935,2020-05-27,59,Observed,2.316667,RoChina,tail grp 01
China - Guangdong,Eastern And South-Eastern Asia,Eastern Asia,China,1592,115.836628,2020-05-27,115,Observed,13.724138,RoChina,tail grp 01
China - Guangxi,Eastern And South-Eastern Asia,Eastern Asia,China,254,50.663616,2020-05-27,68,Observed,3.681159,RoChina,tail grp 01
China - Hainan,Eastern And South-Eastern Asia,Eastern Asia,China,169,9.510447,2020-05-27,117,Observed,1.432203,RoChina,tail grp 01
China - Heilongjiang,Eastern And South-Eastern Asia,Eastern Asia,China,945,39.296713,2020-05-27,111,Observed,8.4375,RoChina,tail grp 01
China - Henan,Eastern And South-Eastern Asia,Eastern Asia,China,1276,99.138896,2020-05-27,114,Observed,11.095652,RoChina,tail grp 01


In [14]:
mapping = grpby_dff["tail_grp"]
dff["tail_grp"] = dff["country_area"].map(mapping)

In [15]:
dff.columns

Index(['region', 'subregion', 'country', 'country_area', 'confirmed_cases',
       'population', 'date', 'devt_time', 'data', 'time_1', 'time_2', 'weight',
       'link_ratio', 'country_grp', 'tail_grp'],
      dtype='object')

In [16]:
dm = dmatrix(
    "-1+country_grp + country_grp:standardize(time_1) + tail_grp:standardize(time_2)",
    dff,
    return_type="dataframe",
)

In [17]:
dm.columns

Index(['country_grp[Afghanistan]', 'country_grp[Algeria]',
       'country_grp[Argentina]', 'country_grp[Armenia]',
       'country_grp[Austria]', 'country_grp[Azerbaijan]',
       'country_grp[Bahrain]', 'country_grp[Bangladesh]',
       'country_grp[Belarus]', 'country_grp[Belgium]',
       ...
       'country_grp[US - Texas]:standardize(time_1)',
       'country_grp[US - Utah]:standardize(time_1)',
       'country_grp[US - Virginia]:standardize(time_1)',
       'country_grp[US - Washington]:standardize(time_1)',
       'country_grp[US - Wisconsin]:standardize(time_1)',
       'country_grp[Ukraine]:standardize(time_1)',
       'country_grp[United Arab Emirates]:standardize(time_1)',
       'country_grp[Uzbekistan]:standardize(time_1)',
       'tail_grp[tail grp 01]:standardize(time_2)',
       'tail_grp[tail grp 02]:standardize(time_2)'],
      dtype='object', length=246)

In [18]:
dm.shape

(31712, 246)

In [19]:
di = dm.design_info

dm_di = pd.DataFrame({"col_name": di.column_names})

factors = [None] * len(di.column_names)

for elt in di.term_names:
    slice_ = di.term_name_slices[elt]
    factors[slice_] = [elt] * len(factors[slice_])

dm_di["factors"] = factors

dm_di["levels"] = dm_di["col_name"].str.findall(r"\((.*)\)|\[(.*)\]")

dm_di["levels"] = dm_di["levels"].apply(lambda x: str(x))

pattern = "|".join(["\[", "\]", "\(", "\)", "'", "T\.", ","])

dm_di["levels"] = dm_di["levels"].str.replace(pattern, "").str.strip()

In [20]:
dm_di.head()

Unnamed: 0,col_name,factors,levels
0,country_grp[Afghanistan],country_grp,"[('', 'Afghanistan')]"
1,country_grp[Algeria],country_grp,"[('', 'Algeria')]"
2,country_grp[Argentina],country_grp,"[('', 'Argentina')]"
3,country_grp[Armenia],country_grp,"[('', 'Armenia')]"
4,country_grp[Austria],country_grp,"[('', 'Austria')]"


In [21]:
dm_obs = dm.loc[
    (dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1)
]

dm_obs_ind = dm_obs.where(dm == 0, other=1)

wgt_obs = dff[
    (dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1)
]["weight"]

wgt_obs = wgt_obs / wgt_obs.sum()

dm_weight = dm_obs_ind.T @ wgt_obs

dm_weight = dm_weight.rename("factor_weight")

dm_di = dm_di.merge(dm_weight, how="left", left_on="col_name", right_index=True)

In [22]:
dm_di.head()

Unnamed: 0,col_name,factors,levels,factor_weight
0,country_grp[Afghanistan],country_grp,"[('', 'Afghanistan')]",0.00102
1,country_grp[Algeria],country_grp,"[('', 'Algeria')]",0.001325
2,country_grp[Argentina],country_grp,"[('', 'Argentina')]",0.001533
3,country_grp[Armenia],country_grp,"[('', 'Armenia')]",0.000827
4,country_grp[Austria],country_grp,"[('', 'Austria')]",0.005094


In [23]:
X = dm.loc[
    (dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1)
]

y = (
    dff[(dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1)][
        "link_ratio"
    ]
    - 1
)

wgt = dff[
    (dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1)
]["weight"]

tt = TransformedTargetRegressor(
    regressor=LinearRegression(fit_intercept=False), func=np.log, inverse_func=np.exp
)

tt.fit(X, y)

In [24]:
dm_di["coef"] = tt.regressor_.coef_

dm_di.head()

Unnamed: 0,col_name,factors,levels,factor_weight,coef
0,country_grp[Afghanistan],country_grp,"[('', 'Afghanistan')]",0.00102,-3.460029
1,country_grp[Algeria],country_grp,"[('', 'Algeria')]",0.001325,-3.845329
2,country_grp[Argentina],country_grp,"[('', 'Argentina')]",0.001533,-3.62692
3,country_grp[Armenia],country_grp,"[('', 'Armenia')]",0.000827,-3.416067
4,country_grp[Austria],country_grp,"[('', 'Austria')]",0.005094,-4.745699


In [25]:
pred = dff.loc[
    (dff["data"] == "Observed") & (dff["devt_time"] > 0) & (dff["link_ratio"] > 1),
    ["country_area", "country_grp", "devt_time", "weight", "link_ratio"],
]

pred["y"] = np.log(pred["link_ratio"] - 1)

pred["y_hat"] = np.log(tt.predict(X))

pred["residuals"] = pred["y"] - pred["y_hat"]

In [26]:
residuals = pred["residuals"].to_numpy()

W = np.diag(pred["weight"].to_numpy())

residuals = residuals[np.newaxis]

w_rss = residuals @ W @ residuals.T

w_sigma_squared_hat = w_rss[0, 0] / (X.shape[0] - X.shape[1])

sq_m = X.to_numpy().T @ W @ X.to_numpy()

w_var_beta_hat = np.linalg.inv(sq_m) * w_sigma_squared_hat

w_stderr = np.sqrt(np.diag(w_var_beta_hat))

In [27]:
alpha = 0.05

tval = t.ppf(1.0 - alpha / 2.0, X.shape[0] - X.shape[1])

In [28]:
dm_di["w_stderr"] = w_stderr

dm_di["ci_95_upper"] = tt.regressor_.coef_ + w_stderr * tval

dm_di["ci_95_lower"] = tt.regressor_.coef_ - w_stderr * tval

dm_di.head()

Unnamed: 0,col_name,factors,levels,factor_weight,coef,w_stderr,ci_95_upper,ci_95_lower
0,country_grp[Afghanistan],country_grp,"[('', 'Afghanistan')]",0.00102,-3.460029,0.191044,-3.085557,-3.834502
1,country_grp[Algeria],country_grp,"[('', 'Algeria')]",0.001325,-3.845329,0.160297,-3.531127,-4.159532
2,country_grp[Argentina],country_grp,"[('', 'Argentina')]",0.001533,-3.62692,0.161345,-3.310663,-3.943177
3,country_grp[Armenia],country_grp,"[('', 'Armenia')]",0.000827,-3.416067,0.266842,-2.893022,-3.939112
4,country_grp[Austria],country_grp,"[('', 'Austria')]",0.005094,-4.745699,0.108484,-4.533057,-4.958342


In [29]:
len(dm_di.loc[dm_di["factors"] == "country_grp", "levels"].unique())

122

In [30]:
factors_list_2 = dm_di["factors"].unique().tolist()

dd_factors_2 = widgets.Dropdown(options=factors_list_2)

output_factors_2 = widgets.Output()


def dd_factors_2_eventhandler(change):
    output_factors_2.clear_output()

    plot_data = dm_di.loc[(dm_di["factors"] == change.new)]

    fig = px.line(
        plot_data,
        x="levels",
        y="coef",
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.D3,
        title=change.new,
    )
    fig.add_bar(
        x=plot_data["levels"],
        y=plot_data["factor_weight"],
        name="weight",
        yaxis="y2",
        opacity=0.75,
        marker_color="lightgrey",
    )

    fig.add_scatter(
        x=plot_data["levels"],
        y=plot_data["ci_95_upper"],
        line_dash="dash",
        name="CI Upper",
        marker_color=px.colors.qualitative.D3[1],
    )

    fig.add_scatter(
        x=plot_data["levels"],
        y=plot_data["ci_95_lower"],
        line_dash="dash",
        name="CI Lower",
        marker_color=px.colors.qualitative.D3[1],
    )

    fig.update_layout(
        yaxis2=dict(
            title="weight", anchor="x", overlaying="y", side="right", showgrid=False
        )
    )

    with output_factors_2:
        display(fig)


dd_factors_2.observe(dd_factors_2_eventhandler, names="value")

display(dd_factors_2)

Dropdown(options=('country_grp', 'country_grp:standardize(time_1)', 'tail_grp:standardize(time_2)'), value='co…

In [31]:
beta_hat = (
    np.linalg.inv(X.to_numpy().T @ np.diag(wgt) @ X.to_numpy())
    @ X.to_numpy().T
    @ np.diag(wgt)
    @ np.log(y.values)
)

y_arr = np.log(y.to_numpy())

y_arr_hat = X.to_numpy() @ beta_hat

y_m = y_arr.T @ wgt.to_numpy() / wgt.sum()

ss_tot = (y_arr - y_m) @ np.diag(wgt) @ (y_arr - y_m).T

res = y_arr - y_arr_hat

ss_res = res @ np.diag(wgt) @ res.T

r2 = 1 - ss_res / ss_tot

r2_adj = 1 - (ss_res / ss_tot) * ((X.shape[0] - 1) / (X.shape[0] - X.shape[1] - 1))

sigma2_hat = (res @ np.diag(wgt) @ res.T) / (X.shape[0] - X.shape[1])

var_hat = np.linalg.inv(X.to_numpy().T @ np.diag(wgt) @ X.to_numpy()) * sigma2_hat

stderr_hat = np.sqrt(np.diag(var_hat))

In [32]:
mod_wls = sm.WLS(np.log(y), X, weights=wgt)

res_wls = mod_wls.fit()

print(res_wls.summary())

                            WLS Regression Results                            
Dep. Variable:             link_ratio   R-squared:                       0.813
Model:                            WLS   Adj. R-squared:                  0.810
Method:                 Least Squares   F-statistic:                     250.9
Date:                Mon, 25 Sep 2023   Prob (F-statistic):               0.00
Time:                        18:19:00   Log-Likelihood:                -26960.
No. Observations:               14372   AIC:                         5.441e+04
Df Residuals:                   14126   BIC:                         5.627e+04
Df Model:                         245                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [33]:
dff["link_ratio_predicted"] = 1 + tt.predict(dm)

In [34]:
country_list_4 = dff["country_area"].unique().tolist()

dd_country_4 = widgets.Dropdown(options=country_list_4)

output_country_4 = widgets.Output()


def dd_country_4_eventhandler(change):
    output_country_4.clear_output()
    plot_data = dff.loc[(dff["country_area"] == change.new) & (dff["devt_time"] > 0)]
    fig = px.line(
        plot_data,
        x="devt_time",
        y="link_ratio",
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.D3,
        title=change.new,
    )

    fig.add_scatter(
        x=plot_data["devt_time"],
        y=plot_data["link_ratio_predicted"],
        line_dash="dash",
        marker_color=px.colors.qualitative.D3[1],
    )

    fig.update_layout(
        yaxis_title="Link Ratio / Daily Development Factor ", showlegend=False
    )

    with output_country_4:
        display(fig)


dd_country_4.observe(dd_country_4_eventhandler, names="value")

display(dd_country_4)

Dropdown(options=('Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Anguilla', 'Antigua and Barbuda', 'Argentin…

In [35]:
dff["lr_cumul"] = (
    dff.loc[dff["data"] == "Estimate"]
    .groupby(["country_area"])["link_ratio_predicted"]
    .cumprod()
    .reset_index(drop=True)
)

dff["pred_cases"] = dff["lr_cumul"] * dff.groupby(["country_area"])[
    "confirmed_cases"
].transform("max")

dff["all_cases"] = dff["confirmed_cases"].fillna(0) + dff["pred_cases"].fillna(0)

dff["all_cases_inc"] = dff.groupby(["country_area"])["all_cases"].diff()

dff.loc[dff["all_cases_inc"].isnull(), "all_cases_inc"] = dff.loc[
    dff["all_cases_inc"].isnull(), "all_cases"
]

In [36]:
country_list_5 = dff["country"].unique().tolist()

dd_country_5 = widgets.Dropdown(options=country_list_5)

output_country_5 = widgets.Output()


def dd_country_5_eventhandler(change):
    output_country_5.clear_output()

    plot_data = (
        dff.loc[dff["country"] == change.new]
        .groupby(["date", "data"])
        .sum()
        .reset_index()
    )

    fig = px.bar(
        plot_data,
        x="date",
        y="all_cases_inc",
        color="data",
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.D3,
        title=change.new,
    )

    fig.update_layout(yaxis_title="Daily New Confirmed Cases")

    fig2 = px.bar(
        plot_data,
        x="date",
        y="all_cases",
        color="data",
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.D3,
        title=change.new,
    )

    fig2.update_layout(yaxis_title="Cumulative Confirmed Cases")

    with output_country_5:
        display(fig)
        display(fig2)


dd_country_5.observe(dd_country_5_eventhandler, names="value")

display(dd_country_5)

Dropdown(options=('Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Anguilla', 'Antigua and Barbuda', 'Argentin…

In [37]:
china_southkorea_vect = dm_di["levels"].str.contains("China|South Korea", regex=True)

In [38]:
dm_di["country_grp_2"] = None

dm_di.loc[china_southkorea_vect, "country_grp_2"] = "grp 01"

s = stats.weightstats.DescrStatsW(
    dm_di.loc[(dm_di["factors"] == "country_grp") & (~china_southkorea_vect), "coef"],
    dm_di.loc[
        (dm_di["factors"] == "country_grp") & (~china_southkorea_vect), "factor_weight"
    ],
)

bins = s.quantile(np.linspace(0, 1, 11, True)).to_list()

fbins = [
    bins[i] - 0.000000001 if bins[i] == bins[i + 1] else bins[i]
    for i in range(len(bins) - 1)
]

fbins[-1] = bins[-1]

In [39]:
dm_di.loc[
    (dm_di["factors"] == "country_grp") & (~china_southkorea_vect), "country_grp_2"
] = pd.cut(
    dm_di.loc[(dm_di["factors"] == "country_grp") & (~china_southkorea_vect), "coef"],
    fbins,
    include_lowest=True,
    labels=["grp " + str(i + 2).zfill(2) for i in range(len(fbins) - 1)],
)

In [40]:
dta = dm_di.loc[(dm_di["factors"] == "country_grp")].sort_values("country_grp_2")

fig = px.box(
    dta,
    x="country_grp_2",
    y="coef",
    points="all",
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.D3,
    title="W. Quantile Grouping",
)

fig.show()

In [41]:
dm_di.loc[(dm_di["factors"] == "country_grp")].groupby("country_grp_2")[
    "factor_weight"
].sum()

country_grp_2
grp 01    0.027997
grp 02    0.178123
grp 03    0.204928
grp 04    0.173812
grp 05    0.156993
grp 06    0.161601
grp 07    0.184727
grp 08    0.237005
grp 09    0.097129
grp 10    0.345302
Name: factor_weight, dtype: float64