In [1]:
# the impact of leaking information in cross-validation varies depending on the nature of the preprocessing step
# let's consider a synthetic regression task with 100 samples and 10,000 features, as follows


In [4]:
import numpy as np
# Synthetic data: the features and the target are drawn independently from a
# standard normal distribution, so there is no real relationship to learn.
rnd = np.random.RandomState(0)
n_samples, n_features = 100, 10000
X = rnd.normal(size=(n_samples, n_features))  # drawn first, so the stream order is unchanged
y = rnd.normal(size=n_samples)

In [9]:
# task here
# select the most informative 5% of the 10,000 features using SelectPercentile feature selection
# and then evaluate a Ridge regressor using cross validation

In [11]:
from sklearn.feature_selection import SelectPercentile, f_regression

# Keep the top 5% of features ranked by f_regression.
# NOTE: the selector is fitted on ALL samples of X and y here, i.e. before any
# cross-validation split is made.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print("X_selected.shape: {}".format(X_selected.shape))

X_selected.shape: (100, 500)


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
# 5-fold cross-validation of Ridge on the features that were selected using the
# full dataset. With no `scoring` argument the estimator's own score is used,
# which for a regressor is R^2.
leaky_scores = cross_val_score(Ridge(), X_selected, y, cv=5)
print("Cross Validation accuracy: {:.2f}".format(np.mean(leaky_scores)))

Cross Validation accuracy: 0.91


In [13]:
# a score of 0.91 looks very good, but X and y are independent random numbers, so it cannot be justified
# what happened here is that the feature selection picked out, among the 10,000 random features, some that are correlated with the target purely by chance

# because I fit the feature selection outside the cross-validation,
# the information we leaked from the test folds was very informative, leading to highly unrealistic results



In [15]:
# the correct way 
# proper cross-validation using pipeline
from sklearn.pipeline import Pipeline
# Chain the selector and the regressor into one estimator, so that
# cross_val_score fits the feature selection inside each cross-validation split.
pipe = Pipeline(
    steps=[
        ("select", SelectPercentile(score_func=f_regression, percentile=5)),
        ("ridge", Ridge()),
    ]
)

pipeline_scores = cross_val_score(pipe, X, y, cv=5)
print("Cross- validation accuracy (pipeline): {}".format(np.mean(pipeline_scores)))

Cross- validation accuracy (pipeline): -0.24655422384952802


In [16]:
# this score indicates a very poor model, as expected for random data
# the feature selection is now inside the cross-validation loop,
# so features can only be selected using the training folds, not the test folds

# a data leakage issue in feature selection makes the difference between concluding that a
# model works very well and concluding that a model does not work at all