In [1]:
# Stratified k-fold cross-validation for a regression (continuous) target
import numpy as np 
import pandas as pd 
 
from sklearn import datasets 
from sklearn import model_selection 

In [2]:
def create_folds(data, n_splits=5, random_state=None):
    """Assign stratified k-fold labels to a dataset with a continuous target.

    The ``target`` column is discretised into equal-width bins (the bin
    count comes from Sturges' rule, ``floor(1 + log2(n_rows))``) and those
    bin labels, rather than the raw targets, are used as the stratification
    classes for ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows; must contain a ``target`` column. The frame passed in
        is not modified.
    n_splits : int, default 5
        Number of folds handed to ``StratifiedKFold``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the row shuffle. ``None``
        keeps the shuffle unseeded, so results differ between runs.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh 0..n-1 index and a
        ``kfold`` column holding the validation fold assigned to each row.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("cannot create folds from an empty DataFrame")

    # randomize the rows; sample() returns a new frame, so every assignment
    # below touches the copy and never the caller's DataFrame
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # placeholder fold id; each row is overwritten in the loop below
    shuffled["kfold"] = -1

    # number of bins by Sturges' rule; np.floor returns a float, while
    # pd.cut needs an integer bin count, hence the explicit int()
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # bin the targets; kept as a local Series so that no temporary column
    # is added to (or clobbers an existing column of) the returned frame
    bin_labels = pd.cut(shuffled["target"], bins=num_bins, labels=False)

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # fill the kfold column
    # note that, instead of targets, we stratify on the bins!
    for fold, (_, valid_idx) in enumerate(
        kf.split(X=shuffled, y=bin_labels.values)
    ):
        # the index was reset above, so positional indices are valid labels
        shuffled.loc[valid_idx, "kfold"] = fold

    # return dataframe with folds
    return shuffled

In [3]:
# Execute regression.py in this notebook's interactive namespace (-i), so the
# script can see names defined in earlier cells and anything it defines stays
# available afterwards.
# NOTE(review): presumably this script builds the `df` shown in the next cell
# (and calls create_folds) - confirm against regression.py.
%run -i regression.py



In [4]:
# Display `df` as this cell's output to inspect the feature columns, the
# target and the assigned kfold column.
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold
0,-1.126490,-1.553284,0.494187,1.260483,-1.587917,0.510944,-1.002228,1.136515,-0.467737,-1.370911,...,0.649367,0.284980,-0.925219,-0.233989,1.541032,0.402129,0.062009,0.620330,-293.838323,0
1,0.622306,-0.687992,-0.057536,-0.758271,-0.650311,0.136625,-0.754726,-0.823460,0.192457,-0.142087,...,1.323013,-0.549377,0.143526,-1.217437,0.223333,-0.638429,0.907270,-0.814469,77.276623,0
2,0.172326,-0.853100,0.512154,-0.264266,0.222716,-1.061753,-0.123970,-1.115556,0.551501,0.124424,...,-0.700430,-0.273657,1.084582,1.332597,-0.807198,-0.858656,-0.274623,-0.990113,-148.111258,0
3,0.430669,0.770305,-0.310961,1.675132,0.582808,1.600438,0.706230,-0.473649,-0.383242,1.498925,...,1.118541,2.454054,0.464205,-0.725900,0.271818,-0.506499,-0.171014,-0.053778,152.025171,0
4,-1.242559,0.779461,0.240793,1.011974,1.191450,0.485227,0.468215,-1.000605,-0.414683,-0.530416,...,-0.004917,0.252385,-0.359980,-0.850760,0.150173,-1.655075,-0.709439,1.143662,-80.671547,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-0.333046,0.092292,-0.387273,-1.048538,1.258245,0.040564,1.491801,0.630814,2.671951,-0.160986,...,-0.184628,-1.291075,0.535017,0.808838,-0.056193,-0.397109,-1.350346,-0.105462,82.404151,4
14996,-0.750058,-1.519995,-1.737603,-1.754605,-1.609086,1.003759,1.635527,0.713253,-0.605579,0.291193,...,-1.420559,2.294498,-0.914117,-1.583541,1.587745,-1.948063,-0.174135,-0.550416,-137.414615,4
14997,0.411610,-0.969299,-0.743430,-0.399807,-0.852568,0.593126,1.037459,1.524766,0.518828,-0.857586,...,0.522943,-0.058369,-0.686155,0.587832,0.301744,0.148453,1.010941,-0.506611,-118.447947,4
14998,-0.418944,0.364969,1.658422,0.409682,0.166632,-0.072314,0.769636,2.341911,0.825999,1.154498,...,0.087584,1.167850,-0.669355,1.448787,0.911578,0.246843,0.959968,0.837977,161.568039,4
