[Reference](https://medium.com/analytics-vidhya/a-complete-guide-to-choose-the-correct-cross-validation-technique-d70810a02f27)

In [1]:
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
logs=[]
kf=KFold(n_splits=10, shuffle=True, random_state=None)
for train_index, test_index in kf.split(X):
    lr=LinearRegression()
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr.fit(X_train, y_train)
    y_pred=lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
print(logs)

[28.528701240448356, 29.49061510044379, 25.325225980481633, 17.582784667936913, 29.641742674891233, 14.4683397981778, 27.67230236385361, 21.790811561513443, 12.252709130920584, 24.15273082412564]


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import numpy as np
logs=[]
skf=StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
bins=np.histogram_bin_edges(y,bins='sturges')
y_binned=np.digitize(y, bins)
for train_index, test_index in skf.split(X,y_binned):
    lr=LinearRegression()
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr.fit(X_train, y_train)
    y_pred=lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
print(logs)

[24.922460221564222, 39.14312903225251, 28.757711803873544, 20.380029737083692, 22.89808528503142, 16.765315405987746, 18.11485083650754, 19.970368490411655, 30.06831831410914, 15.733572755979043]




In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error
logs=[]
leaveCV=LeaveOneOut()
for train_index, test_index in leaveCV.split(X):
    lr=LinearRegression()
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr.fit(X_train, y_train)
    y_pred=lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
print(logs)

[37.297971875343414, 11.999608255449107, 17.48782798315517, 23.71101332951509, 70.41997995859823, 12.218524712134368, 0.010660782622617685, 60.83232825911713, 27.606196018152, 0.00043549252925535444, 17.189112931261572, 7.572428095909202, 0.6522578051226872, 0.7377725756911169, 1.2089096788857383, 0.37339619594118306, 6.923948755372523, 0.354752743882845, 16.970137529053165, 0.0435547120534005, 1.201640440541533, 3.8217033945233525, 0.41353651864910074, 0.4993997774839619, 0.006331467725338001, 0.2721471613364922, 1.331116296599162, 0.00865299885868986, 1.3623182494350095, 0.015747259861483686, 1.6115192820340154, 13.134533293759368, 20.67570132353782, 1.4439756870515947, 0.04475714474155397, 24.636017719717454, 5.592485219391444, 4.552686473500261, 3.286604666944488, 0.3312408480711908, 0.5014142358133377, 2.1309795226518378, 0.009679261247387582, 0.008527548522858577, 3.100071074767189, 8.032073243842643, 0.1844214701056847, 2.15724846827906, 31.085736581054984, 4.97324456403012, 2.5

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import numpy as np
logs=[]
bins=np.histogram_bin_edges(y,bins='sturges')
groups=np.digitize(y, bins)
gkf=GroupKFold(n_splits=len(bins))
for train_index, test_index in gkf.split(X,y, groups):
    lr=LinearRegression()
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr.fit(X_train, y_train)
    y_pred=lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
print(logs)

[13.435741940036843, 18.162687795613927, 19.79279994372801, 24.338136450019856, 17.039324025352933, 14.834757207726508, 28.769711865897833, 341.4175797451019, 66.20612479691246, 16.39574338296615, 90.59361087900052]


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import numpy as np
import itertools
logs=[]
skf=StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
bins=np.histogram_bin_edges(y,bins='sturges')
y_binned=np.digitize(y, bins)
train_indices=[]
test_indices=[]
for train_index, test_index in skf.split(X,y_binned):
    train_indices.append(train_index.tolist())
    test_indices.append(test_index.tolist())

test_index=test_indices[0]
X_test=X[test_index]
y_test=y[test_index]
indices=test_indices[1:]+train_indices
indices=list(itertools.chain(*indices))
lr=LinearRegression()
lr.fit(X[indices], y[indices])
y_pred=lr.predict(X_test)
logs.append(mean_squared_error(y_test, y_pred))
print(logs)

[20.317725093353832]


