In [637]:
import pandas as pd
import statsmodels.api as sm

In [638]:
df = pd.read_csv(r'./owid-2022-clean.csv')

In [639]:
# Drop irrelevant fields
print(df.columns.shape)
df.drop(["location", "date", "total_deaths_per_million", "total_cases_per_million", "population", "tests_units"], axis=1, inplace=True)

print(df.columns.shape)

(31,)
(25,)


In [640]:
# applied_cols = list(df.columns)
# df_group_iso = df.groupby(["iso_code"])
# for col_name, col_data in df.iteritems():
#     if col_name == "iso_code" or col_name=="continent":
#         continue
#     df[col_name] = df_group_iso[col_name].apply(lambda x: x.fillna(x.mean()))


# df_group_cont = df.groupby(["continent"])
# for col_name, col_data in df.iteritems():
#     if col_name == "iso_code" or col_name=="continent":
#         continue
#     df[col_name] = df_group_cont[col_name].apply(lambda x: x.fillna(x.mean()))
#     print(col_data, (col_data == 0).all())

In [641]:
applied_cols = list(df.columns)
df_group_iso = df.groupby(["iso_code"])
df_group_cont = df.groupby(["continent"])




for col_name, col_data in df.iteritems():
    if col_name == "iso_code" or col_name=="continent":
        continue
    # replace nan with country median
    df[col_name] = df_group_iso[col_name].apply(lambda x: x.fillna(x.median()))
    # replace nan with continent median
    df[col_name] = df_group_cont[col_name].apply(lambda x: x.fillna(x.median()))

# Only replace zero values when the entire column of the country is 0. This is because countries like Vietnam have days where no deaths are recorded, but other days still record a positive number. This is different to countries where no death numbers are recorded.
df_group_cont_med = df_group_cont.median()
for col_name, col_data in df_group_iso:    
    for col in col_data:
        if (col_data[col] == 0).all():
            target = df.loc[df["iso_code"] == col_name]
            df.loc[df["iso_code"] == col_name, col] = target[col].replace(0, df_group_cont_med.at[target["continent"].iloc[0], col])



In [642]:
indicator_cols = pd.get_dummies(df[ "continent"])
df = pd.concat([df, indicator_cols], axis=1)
df.drop(["iso_code", "continent"], axis=1, inplace=True)

In [643]:
df.to_csv("./data.csv")

In [644]:
def linear_regression(X: pd.DataFrame, y: pd.DataFrame, ft_in_use, log=False):
    X = X[ft_in_use]

    model = sm.OLS(y, sm.add_constant(X))
    results = model.fit()

    if (log):
        print(results.summary())

    y_pred = results.predict(sm.add_constant(X))
    print(f'MSE: {sum((y - y_pred) ** 2) / len(y)}')
    return results

In [645]:
df = pd.read_csv('data.csv')

ft_predict = ['new_deaths_per_million', 'new_cases_per_million']
ft_ignore = ['iso_code', 'location', 'date', 'total_deaths', 'total_cases_per_million', 'total_deaths_per_million', 'population', 'tests_units'] # add ignore column names here

features = list(set(df.columns).difference(ft_predict + ft_ignore))

y = df[ft_predict]
X = df[features]

In [646]:
def new_deaths_per_million(X: pd.DataFrame, y: pd.DataFrame, ft_in_use, log = True):
    return linear_regression(X, y['new_deaths_per_million'], ft_in_use)

SyntaxError: invalid syntax (3819791432.py, line 1)

In [None]:
def new_cases_per_million(X: pd.DataFrame, y: pd.DataFrame, ft_in_use, log = True):
    return linear_regression(X, y['new_cases_per_million'], ft_in_use)

In [None]:
# ordinary linear regression

dth = new_deaths_per_million(X, y, features)
cas = new_cases_per_million(X, y, features)

MSE: 16.225021880629296
MSE: 2299560.054719626


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [None]:
# reorder columns according to p-values
# save to `features` variable

deaths_ft = sorted(features, key=lambda col: dth.pvalues[col], reverse=True)
cases_ft = sorted(features, key=lambda col: cas.pvalues[col], reverse=True)

alpha = 0.05

In [None]:
# stepwise forward regression

def forward_reg(title, features, func):
    if (title is not None):
        print(title)
    
    ft_in_use = []
    for col in features:
        ft_in_use.append(col)

        res = func(X, y, ft_in_use)

        print(f'Added ft:  {col}')
        print(f'Ft in use: {ft_in_use}')

        if (max(res.pvalues) > alpha):
            print(f'Break due to stopping rule: max pvalue = {max(res.pvalues)} > {alpha}')
            break

        print('*' * 20)

In [None]:
# stepwise forward regression
forward_reg('New deaths per million model:', deaths_ft, new_deaths_per_million)

New deaths per million model:
MSE: 21.268216022993105
Added ft:  new_tests_smoothed_per_thousand
Ft in use: ['new_tests_smoothed_per_thousand']
********************
MSE: 19.740071632813375
Added ft:  life_expectancy
Ft in use: ['new_tests_smoothed_per_thousand', 'life_expectancy']
Break due to stopping rule: max pvalue = 0.722110036284709 > 0.05


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [None]:
forward_reg('New cases per million model:', cases_ft, new_cases_per_million)

New cases per million model:
MSE: 3025649.4770217272
Added ft:  gdp_per_capita
Ft in use: ['gdp_per_capita']
********************
MSE: 3022393.2969941846
Added ft:  stringency_index
Ft in use: ['gdp_per_capita', 'stringency_index']
Break due to stopping rule: max pvalue = 0.23327172677883487 > 0.05


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [None]:
# stepwise backward regression

def backward_reg(title, features, func):
    if (title is not None):
        print(title)
    
    ft_in_use = features.copy()
    for col in features:
        ft_in_use.remove(col)

        res = func(X, y, ft_in_use)

        print(f'Removed ft:  {col}')
        print(f'Ft in use: {ft_in_use}')

        if (max(res.pvalues) < alpha):
            print(f'Break due to stopping rule: max pvalue = {max(res.pvalues)} < {alpha}')
            break

        print('*' * 20)

In [None]:
# stepwise backward regression
backward_reg('New deaths per million model:', deaths_ft, new_deaths_per_million)

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


New deaths per million model:


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 16.225022057349094
Removed ft:  new_tests_smoothed_per_thousand
Ft in use: ['life_expectancy', 'aged_65_older', 'male_smokers', 'North America', 'aged_70_older', 'tests_per_case', 'extreme_poverty', 'stringency_index', 'positive_rate', 'people_vaccinated_per_hundred', 'population_density', 'handwashing_facilities', 'Europe', 'South America', 'cardiovasc_death_rate', 'people_fully_vaccinated_per_hundred', 'diabetes_prevalence', 'Unnamed: 0', 'human_development_index', 'hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.22561318090926
Removed ft:  life_expectancy
Ft in use: ['aged_65_older', 'male_smokers', 'North America', 'aged_70_older', 'tests_per_case', 'extreme_poverty', 'stringency_index', 'positive_rate', 'people_vaccinated_per_hundred', 'population_density', 'handwashing_facilities', 'Europe', 'South America', 'cardiovasc_death_rate', 'people_fully_vaccinated_

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 16.230248734484757
Removed ft:  extreme_poverty
Ft in use: ['stringency_index', 'positive_rate', 'people_vaccinated_per_hundred', 'population_density', 'handwashing_facilities', 'Europe', 'South America', 'cardiovasc_death_rate', 'people_fully_vaccinated_per_hundred', 'diabetes_prevalence', 'Unnamed: 0', 'human_development_index', 'hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.232275050888603
Removed ft:  stringency_index
Ft in use: ['positive_rate', 'people_vaccinated_per_hundred', 'population_density', 'handwashing_facilities', 'Europe', 'South America', 'cardiovasc_death_rate', 'people_fully_vaccinated_per_hundred', 'diabetes_prevalence', 'Unnamed: 0', 'human_development_index', 'hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.2380250917116
R

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


********************
MSE: 16.478123304850975
Removed ft:  diabetes_prevalence
Ft in use: ['Unnamed: 0', 'human_development_index', 'hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 16.518462483610378
Removed ft:  Unnamed: 0
Ft in use: ['human_development_index', 'hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.561663521092512
Removed ft:  human_development_index
Ft in use: ['hospital_beds_per_thousand', 'Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.590356933467188
Removed ft:  hospital_beds_per_thousand
Ft in use: ['Africa', 'total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
********************
MSE: 16.983516501432003
Removed ft:  Africa
Ft in use: ['total_boosters_per_hundred', 'gdp_per_capita', 'median_age', 'Asia', 'Oceania', 'female_smokers']
Break due to stopping rule: max pvalue = 0.025519317089049687 < 0.05


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


In [None]:
backward_reg('New cases per million model:', cases_ft, new_cases_per_million)

New cases per million model:


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 2299560.511172696
Removed ft:  gdp_per_capita
Ft in use: ['stringency_index', 'people_fully_vaccinated_per_hundred', 'people_vaccinated_per_hundred', 'extreme_poverty', 'handwashing_facilities', 'cardiovasc_death_rate', 'female_smokers', 'aged_65_older', 'North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


********************
MSE: 2299572.82211377
Removed ft:  stringency_index
Ft in use: ['people_fully_vaccinated_per_hundred', 'people_vaccinated_per_hundred', 'extreme_poverty', 'handwashing_facilities', 'cardiovasc_death_rate', 'female_smokers', 'aged_65_older', 'North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']
********************
MSE: 2299633.770301968
Removed ft:  people_fully_vaccinated_per_hundred
Ft in use: ['people_vaccinated_per_hundred', 'extreme_poverty', 'handwashing_facilities', 'cardiovasc_death_rate', 'female_smokers', 'aged_65_older', 'North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'Sout

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 2301779.422048805
Removed ft:  cardiovasc_death_rate
Ft in use: ['female_smokers', 'aged_65_older', 'North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']
********************
MSE: 2302872.2821137696
Removed ft:  female_smokers
Ft in use: ['aged_65_older', 'North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']
********************


  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


MSE: 2303746.64818874
Removed ft:  aged_65_older
Ft in use: ['North America', 'male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']
********************
MSE: 2303746.6481887316
Removed ft:  North America
Ft in use: ['male_smokers', 'Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'population_density', 'Oceania', 'South America', 'median_age', 'Africa', 'aged_70_older', 'hospital_beds_per_thousand', 'diabetes_prevalence', 'human_development_index', 'total_boosters_per_hundred', 'new_tests_smoothed_per_thousand', 'positive_rate']
********************
MSE: 2305253.4934028126
Removed ft:  male_smokers
Ft in use: ['Asia', 'Unnamed: 0', 'Europe', 'life_expectancy', 'tests_per_case', 'populatio

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
