In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold

## Data Processing

In [2]:
dat = pd.read_csv('features_lookback_10.csv')

# standardize columns
dat_std = dat[dat.loc[:, 'FP_1':'FP_10'].mean(axis='columns')>10].drop(columns=['url_suffix', 'Date']).values
scaler = StandardScaler().fit(dat_std)
dat_std = scaler.transform(dat_std, copy=True)

X = dat_std[:, 1:]
y = dat_std[:, 0]
names = dat.drop(columns=['url_suffix', 'Date', 'FP']).columns

## Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression

kf = KFold(n_splits=10, shuffle=True, random_state=434)
reg = LinearRegression()

cv_scores = cross_val_score(reg, X, y, cv=kf)

np.quantile(cv_scores, [.05, .95])

array([0.50037858, 0.54415229])

## Ridge Regression

In [4]:
from sklearn.linear_model import Ridge

alphas = [.1, 1, 10, 100, 1000]
ridge_scores = []

for alpha in alphas:
    ridge = Ridge(alpha=alpha)

    cv_scores = cross_val_score(ridge, X, y, cv=kf)
    print(f'alpha = {alpha} \tmean = {round(np.mean(cv_scores),3)} \tquantile = {np.quantile(cv_scores, [.5, .95])}')

    ridge_scores.append(cv_scores)    

alpha = 0.1 	mean = 0.519 	quantile = [0.51351022 0.54415139]
alpha = 1 	mean = 0.519 	quantile = [0.51352882 0.54414323]
alpha = 10 	mean = 0.519 	quantile = [0.51369462 0.54409792]
alpha = 100 	mean = 0.519 	quantile = [0.51444996 0.54450372]
alpha = 1000 	mean = 0.518 	quantile = [0.5144447  0.54364007]


## Lasso Regression

In [5]:
from sklearn.linear_model import Lasso

alphas = [.1, .01, .001]
lasso_scores = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha)

    cv_scores = cross_val_score(lasso, X, y, cv=kf)
    print(f'alpha = {alpha} \tmean = {round(np.mean(cv_scores),3)} \tquantile = {np.quantile(cv_scores, [.5, .95])}')

    lasso_scores.append(cv_scores)    

alpha = 0.1 	mean = 0.505 	quantile = [0.49973596 0.52657662]
alpha = 0.01 	mean = 0.519 	quantile = [0.51371688 0.54250593]
alpha = 0.001 	mean = 0.52 	quantile = [0.5151052  0.54469004]


## XGBoost

In [6]:
from xgboost import XGBRegressor

xgb_scores = []
n_param = [4, 7, 10, 13, 16, 19]

for n in n_param:
    xgb = XGBRegressor(n_estimators=n)
    
    cv_scores = cross_val_score(xgb, X, y, cv=kf)
    print(f'n_estimators = {n} \tmean = {round(np.mean(cv_scores), 3)} \tquantile = {np.quantile(cv_scores, [.05, .95])}')

    xgb_scores.append(cv_scores)

n_estimators = 4 	mean = 0.48 	quantile = [0.46617839 0.50352803]
n_estimators = 7 	mean = 0.504 	quantile = [0.48503241 0.52823661]
n_estimators = 10 	mean = 0.504 	quantile = [0.48319867 0.52778644]
n_estimators = 13 	mean = 0.502 	quantile = [0.47956526 0.52602138]
n_estimators = 16 	mean = 0.5 	quantile = [0.47662013 0.52471887]
n_estimators = 19 	mean = 0.499 	quantile = [0.47730978 0.52405774]
