# Cross Validation and Model Selection

In [1]:
import SP500models as models
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the closing-price table, then divide every column by its own L2 norm
# so each column of closing_prices_l2 has unit Euclidean length.
closing_prices = pd.read_pickle("./data/stocks/closing_prices.pkl")
closing_prices_l2 = closing_prices.div(
    closing_prices.apply(np.linalg.norm),
    axis="columns",
)

In [3]:
# Build the three baseline models from the L2-normalized closing prices.
# (What each constructor returns is defined in SP500models, not shown here.)
base1 = models.one_cluster(closing_prices_l2)
base2 = models.different_clusters(closing_prices_l2)
base3 = models.industry(closing_prices_l2)
# random_model is a random assignment of clusters - it's not serious
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a test_model.pkl that was produced by this project.
with open('./models/test_model.pkl', 'rb') as f:
    random_model = pickle.load(f)

In [4]:
# Hold out the last 7 rows of the raw (un-normalized) closing prices as the
# final test set; everything before them is available for training / CV.
prices_train = closing_prices.head(-7)
prices_test = closing_prices.tail(7)

In [5]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

from SP500metrics import WCSS # home-built WCSS function, because I couldn't find one online

In [6]:
n_splits = 5
cv = TimeSeriesSplit(n_splits = n_splits, test_size = 3)

# One row per model, one column per fold. Rows 0-2 receive the holdout WCSS of
# the three baselines; row 3 is never written below and keeps its initial 0.0.
# NOTE(review): row 3 looks reserved for a fourth model (random_model?) —
# confirm, and remember its 0.0 mean is a placeholder, not a real score.
scores = np.zeros((4, n_splits))

# Baseline constructors, listed in the row order used by `scores`.
baseline_builders = (models.one_cluster, models.different_clusters, models.industry)

# NOTE(review): prices_train is sliced from the raw closing_prices, so both the
# models and WCSS see un-normalized prices here — confirm that is intended.
for fold, (train_index, test_index) in enumerate(cv.split(X = prices_train)):
    ## Get the training and holdout sets for this fold
    prices_tt = prices_train.iloc[train_index]
    prices_ho = prices_train.iloc[test_index]

    ## Cross validation happens here: every baseline is fit on the training
    ## fold only (never on data that contains the holdout rows) and scored on
    ## the holdout. Fold-local names are used so the baseline models built
    ## earlier in the notebook are not overwritten.
    for row, build_model in enumerate(baseline_builders):
        fold_model = build_model(prices_tt)
        scores[row, fold] = WCSS(prices_ho, fold_model)

In [None]:
# Mean score per model (rows of `scores`), averaged across the CV folds.
np.mean(scores, axis=1)