In [None]:
import pandas as pd
import statsmodels.api as sm

In [None]:
def linear_regression(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, ft_in_use):
    """Fit an OLS model on the selected features and report its test error.

    Prints the statsmodels summary of the fitted model, followed by the mean
    squared error of its predictions on the test split.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature frames; both must contain every column named in `ft_in_use`.
    y_train, y_test : pd.DataFrame
        Regression target for each split. The callers in this notebook pass a
        single column (a Series) here.
    ft_in_use : iterable of str
        Names of the feature columns to regress on. Any iterable is accepted
        (list, set, Index, ...); it is not modified.
    """
    # pandas does not accept a set as a column indexer, so normalise to a list.
    # The same list is used for both splits, keeping their column order equal.
    columns = list(ft_in_use)
    X_train = X_train[columns]
    X_test = X_test[columns]

    # has_constant='add' always adds the intercept column. With the default
    # ('skip') a feature that happens to be constant in one split suppresses
    # it there, leaving the train and test design matrices with different columns.
    model = sm.OLS(y_train, sm.add_constant(X_train, has_constant='add'))
    results = model.fit()
    print(results.summary())

    # Predict from the fitted results: the unfitted model's predict() takes the
    # coefficient vector as its first argument, not the design matrix.
    y_pred = results.predict(sm.add_constant(X_test, has_constant='add'))
    squared_errors = (y_test - y_pred) ** 2
    print(f'MSE: {sum(squared_errors) / len(y_test)}')

In [None]:
# process data here
# Load both splits, then separate the prediction targets from the candidate
# feature columns.
X_train = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')

ft_predict = ['new_deaths_per_million', 'new_cases_per_million']
ft_ignore = [] # add ignore column names here

# `columns` is an attribute, not a method. Keep the features as a list in the
# file's column order: a set has no stable order and pandas does not accept
# one as a column indexer.
ft_excluded = set(ft_predict + ft_ignore)
features = [col for col in X_train.columns if col not in ft_excluded]

y_train = X_train[ft_predict]
X_train = X_train[features]

y_test = X_test[ft_predict]
X_test = X_test[features]

In [None]:
def new_deaths_per_million(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, ft_in_use):
    """Run `linear_regression` for the `new_deaths_per_million` target.

    Picks that column out of `y_train` and `y_test` and forwards it, together
    with the feature frames and `ft_in_use`, unchanged.
    """
    target = 'new_deaths_per_million'
    train_target = y_train[target]
    test_target = y_test[target]
    linear_regression(X_train, train_target, X_test, test_target, ft_in_use)

In [None]:
def new_cases_per_million(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame, ft_in_use):
    """Run `linear_regression` for the `new_cases_per_million` target.

    Picks that column out of `y_train` and `y_test` and forwards it, together
    with the feature frames and `ft_in_use`, unchanged.
    """
    target = 'new_cases_per_million'
    train_target = y_train[target]
    test_target = y_test[target]
    linear_regression(X_train, train_target, X_test, test_target, ft_in_use)

In [None]:
# ordinary linear regression
# Fit each target once, using every candidate feature.

for fit_target in (new_deaths_per_million, new_cases_per_million):
    fit_target(X_train, y_train, X_test, y_test, features)

In [None]:
# reorder columns according to p-values
# save to `features` variable
#
# Manual step: replace the empty list below with the feature column names in
# the desired order. The stepwise cells that follow walk `features` in list
# order, so while it is left empty their loops have nothing to iterate over.
# NOTE(review): presumably the p-values are read off the summaries printed by
# the ordinary regression cell -- confirm.
# This rebinds the `features` name used by the ordinary regression cell, so
# re-running that cell afterwards uses this list instead.

features = []

In [None]:
# stepwise forward regression
# Grow the feature set one column at a time, in the order given by `features`,
# and refit both targets after every addition.

ft_in_use = []
for col in features:
    ft_in_use = [*ft_in_use, col]

    print(f'Added ft:  {col}', f'Ft in use: {ft_in_use}', sep='\n')

    for fit_target in (new_deaths_per_million, new_cases_per_million):
        fit_target(X_train, y_train, X_test, y_test, ft_in_use)

    print('*' * 20)

In [None]:
# stepwise backward regression
# Shrink the feature set one column at a time, in the order given by
# `features`, and refit both targets after every removal.
# NOTE(review): columns are dropped first-to-last; if `features` is sorted with
# the most significant column first, the reverse order may be intended -- confirm.

# Work on a copy. Aliasing `features` would remove items from the very list
# being iterated (skipping every other column) and would leave `features`
# emptied, so the cell could not be re-run.
ft_in_use = list(features)
for col in features:
    ft_in_use.remove(col)

    print(f'Removed ft: {col}')
    print(f'Ft in use:  {ft_in_use}')

    # On the final pass `ft_in_use` is empty, so the fit is made with no
    # feature columns selected.
    new_deaths_per_million(X_train, y_train, X_test, y_test, ft_in_use)
    new_cases_per_million(X_train, y_train, X_test, y_test, ft_in_use)

    print('*' * 20)