[Reference](https://tomerkatzav.medium.com/split-time-series-dataset-826b7dc39cd9)

In [6]:
from sklearn.datasets import load_iris

In [7]:
iris = load_iris()

In [8]:
X, y = iris.data, iris.target
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


In [10]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# predefine the variables
n_splits = 5  # Number of splits
model = RandomForestRegressor()  # I used random forest as my model
# Add the parameter to grid serach on - this is just an example
grid_params = {'n_estimators': [int(x) for x in np.linspace(200, 1000, 3)],
              'max_depth': [int(x) for x in np.linspace(5, 55, 11)],
              'max_features': ['auto', 'sqrt', 'log2'],
              'random_state': [42]
              }
refit = True  # Refit an estimator using the best found parameters on the whole dataset
scoring = 'neg_mean_squared_error'  # Strategy to evaluate the performance of the cross-validated model on the test set
n_jobs = -1  # Number of jobs to run in parallel
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(estimator=model, param_grid=grid_params, refit=refit,
                           scoring=scoring, cv=tscv, n_jobs=n_jobs).fit(X, y)
print(f'Model: {model} best params are: {grid_search.best_params_}')

Model: RandomForestRegressor() best params are: {'max_depth': 5, 'max_features': 'auto', 'n_estimators': 1000, 'random_state': 42}


In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit

model = RandomForestRegressor()
grid_params = {'n_estimators': [int(x) for x in np.linspace(200, 1000, 3)],
               'max_depth': [int(x) for x in np.linspace(5, 55, 11)],
               'max_features': ['auto', 'sqrt', 'log2'],
               'random_state': [42]
               }
refit = True
scoring = 'neg_mean_squared_error'
n_jobs = -1
validation_size = 24
X.reset_index(inplace=True)
X.sort_values('date', inplace=True)
train_dates = pd.to_datetime(X['date'].unique()).sort_values()

val_dates = train_dates[-validation_size:]

n_test_obs = X['date'].isin(train_dates).sum()
n_valid_obs = X['date'].isin(val_dates).sum()

test_fold_encoding = list(np.concatenate([np.ones(n_test_obs - n_valid_obs), np.zeros(n_valid_obs)]))

cv = [[c for c in PredefinedSplit(test_fold=test_fold_encoding).split()][0]]

grid_search = GridSearchCV(estimator=model, param_grid=grid_params, refit=refit,
                           scoring=scoring, cv=cv, n_jobs=n_jobs).fit(X, y)
print(f'Model: {model} best params are: {grid_search.best_params_}')
