In [528]:
import pandas as pd
import statsmodels.api as sm

In [529]:
df = pd.read_csv(r'./owid-2022-clean.csv')

In [530]:
# Drop irrelevant fields
print(df.columns.shape)
df.drop(["location", "date", "total_deaths_per_million", "total_cases_per_million", "population", "tests_units"], axis=1, inplace=True)

print(df.columns.shape)

(31,)
(25,)


In [531]:
# applied_cols = list(df.columns)
# df_group_iso = df.groupby(["iso_code"])
# for col_name, col_data in df.iteritems():
#     if col_name == "iso_code" or col_name=="continent":
#         continue
#     df[col_name] = df_group_iso[col_name].apply(lambda x: x.fillna(x.mean()))


# df_group_cont = df.groupby(["continent"])
# for col_name, col_data in df.iteritems():
#     if col_name == "iso_code" or col_name=="continent":
#         continue
#     df[col_name] = df_group_cont[col_name].apply(lambda x: x.fillna(x.mean()))
#     print(col_data, (col_data == 0).all())

In [532]:
applied_cols = list(df.columns)
df_group_iso = df.groupby(["iso_code"])
df_group_cont = df.groupby(["continent"])




for col_name, col_data in df.iteritems():
    if col_name == "iso_code" or col_name=="continent":
        continue
    # replace nan with country median
    df[col_name] = df_group_iso[col_name].apply(lambda x: x.fillna(x.median()))
    # replace nan with continent median
    df[col_name] = df_group_cont[col_name].apply(lambda x: x.fillna(x.median()))

# Only replace zero values when the entire column of the country is 0. This is because countries like Vietnam have days where no deaths are recorded, but other days still record a positive number. This is different to countries where no death numbers are recorded.
df_group_cont_med = df_group_cont.median()
for col_name, col_data in df_group_iso:    
    for col in col_data:
        if (col_data[col] == 0).all():
            target = df.loc[df["iso_code"] == col_name]
            df.loc[df["iso_code"] == col_name, col] = target[col].replace(0, df_group_cont_med.at[target["continent"].iloc[0], col])



In [533]:
indicator_cols = pd.get_dummies(df[["iso_code", "continent"]])
df = pd.concat([df, indicator_cols], axis=1)
df.drop(["iso_code", "continent"], axis=1, inplace=True)

In [534]:
df.to_csv("./data.csv")

In [535]:
def linear_regression(X: pd.DataFrame, y: pd.DataFrame, ft_in_use):
    X = X[ft_in_use]
   

    model = sm.OLS(y, sm.add_constant(X))
    results = model.fit()
    print(results.summary())

    y_pred = results.predict(sm.add_constant(X))
    print(f'MSE: {sum((y - y_pred) ** 2) / len(y)}')
    return results

In [536]:
# process data here
df = pd.read_csv('data.csv')

ft_predict = ['new_deaths_per_million', 'new_cases_per_million']
ft_ignore = ['iso_code', 'location', 'date', 'total_deaths', 'total_cases_per_million', 'total_deaths_per_million', 'population', 'tests_units'] # add ignore column names here

features = list(set(df.columns).difference(ft_predict + ft_ignore))

y = df[ft_predict]
X = df[features]

In [537]:
def new_deaths_per_million(X: pd.DataFrame, y: pd.DataFrame, ft_in_use):
    return linear_regression(X, y['new_deaths_per_million'], ft_in_use)

In [538]:
def new_cases_per_million(X: pd.DataFrame, y: pd.DataFrame, ft_in_use):
    return linear_regression(X, y['new_cases_per_million'], ft_in_use)

In [539]:
# ordinary linear regression

dth = new_deaths_per_million(X, y, features)
cas = new_cases_per_million(X, y, features)

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


                              OLS Regression Results                              
Dep. Variable:     new_deaths_per_million   R-squared:                       0.443
Model:                                OLS   Adj. R-squared:                  0.346
Method:                     Least Squares   F-statistic:                     4.569
Date:                    Sat, 04 Jun 2022   Prob (F-statistic):           1.05e-59
Time:                            20:02:03   Log-Likelihood:                -3517.9
No. Observations:                    1323   AIC:                             7430.
Df Residuals:                        1126   BIC:                             8452.
Df Model:                             196                                         
Covariance Type:                nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [540]:
# reorder columns according to p-values
# save to `features` variable

deaths_ft = sorted(features, key=lambda col: dth.pvalues[col])
cases_ft = sorted(features, key=lambda col: cas.pvalues[col])

alpha = 0.05

In [541]:
# stepwise forward regression

def forward_reg(title, features, func):
    if (title is not None):
        print(title)
    
    ft_in_use = []
    for col in features:
        ft_in_use.append(col)

        res = func(X, y, ft_in_use)

        print(f'Added ft:  {col}')
        print(f'Ft in use: {ft_in_use}')

        if (max(res.pvalues) > alpha):
            print(f'Break due to stopping rule: max pvalue = {max(res.pvalues)} > {alpha}')
            break

        print('*' * 20)

In [542]:
# stepwise forward regression
forward_reg('New deaths per million model:', deaths_ft, new_deaths_per_million)
forward_reg('New cases per million model:', cases_ft, new_cases_per_million)

New deaths per million model:
                              OLS Regression Results                              
Dep. Variable:     new_deaths_per_million   R-squared:                       0.073
Model:                                OLS   Adj. R-squared:                  0.073
Method:                     Least Squares   F-statistic:                     104.6
Date:                    Sat, 04 Jun 2022   Prob (F-statistic):           1.10e-23
Time:                            20:02:04   Log-Likelihood:                -3854.5
No. Observations:                    1323   AIC:                             7713.
Df Residuals:                        1321   BIC:                             7723.
Df Model:                               1                                         
Covariance Type:                nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


AttributeError: 'OLSResults' object has no attribute 'pvalue'

In [None]:
# stepwise backward regression

def backward_reg(title, features, func):
    if (title is not None):
        print(title)
    
    ft_in_use = features
    for col in features:
        ft_in_use.remove(col)

        res = func(X, y, ft_in_use)

        print(f'Removed ft:  {col}')
        print(f'Ft in use: {ft_in_use}')

        if (max(res.pvalues) < alpha):
            print(f'Break due to stopping rule: max pvalue = {max(res.pvalues)} < {alpha}')
            break

        print('*' * 20)


In [None]:
# stepwise backward regression
backward_reg('New deaths per million model:', deaths_ft, new_deaths_per_million)
backward_reg('New cases per million model:', cases_ft, new_cases_per_million)