In [2]:
# stratified-kfold for regression 
import numpy as np 
import pandas as pd 
 
from sklearn import datasets 
from sklearn import model_selection 

In [3]:
def create_folds(data: pd.DataFrame, n_splits: int = 5, random_state=None) -> pd.DataFrame:
    """Assign a stratified k-fold number to every row of a regression dataset.

    The continuous ``target`` column is discretised into equal-width bins
    (bin count chosen by Sturges' rule) and those bin ids are used as the
    stratification labels for ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a numeric ``target`` column. The caller's frame
        is not modified.
    n_splits : int, default 5
        Number of folds to create.
    random_state : int or None, default None
        Seed forwarded to the row shuffle. ``None`` keeps the shuffle
        non-deterministic.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an extra
        integer ``kfold`` column holding, for each row, the number of the
        fold in which that row is part of the validation split.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        # log2(0) is -inf, which cannot be turned into a bin count
        raise ValueError("data must contain at least one row")

    # randomize the rows first; sample() returns a new frame, so every
    # change below happens on a copy and never on the caller's frame
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1

    # calculate the number of bins by Sturges' rule
    # I take the floor of the value, you can also just round it.
    # pd.cut needs an integer bin count, hence the int() cast
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # bin targets; kept in a local variable so that no temporary column
    # has to be added to (and later dropped from) the frame
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.to_numpy())):
        # valid_idx holds positions, which equal labels after reset_index
        data.loc[valid_idx, "kfold"] = fold

    # return dataframe with folds
    return data

In [4]:
# we create a sample dataset with 15000 samples
# and 100 features and 1 target
# a fixed random_state makes the synthetic data identical on every
# Restart & Run All (the row shuffle inside create_folds is a separate
# source of randomness)
X, y = datasets.make_regression(
    n_samples=15000, n_features=100, n_targets=1, random_state=42
)

# create a dataframe out of our numpy arrays
df = pd.DataFrame(
    X,
    columns=[f"f_{i}" for i in range(X.shape[1])]
)
df["target"] = y

# create folds
df = create_folds(df)



In [5]:
# show the frame returned by create_folds (features, target and kfold);
# the rendered output elides the middle rows and columns with "..."
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,target,kfold
0,0.157956,1.272244,0.383481,-1.084358,2.584864,-1.395658,-2.020349,-2.334263,1.637756,-1.251275,...,-0.328475,-0.942127,-0.050029,0.620578,1.045371,0.784495,0.580707,0.710813,-93.538339,0
1,0.425656,0.880141,-0.617394,0.333615,0.259472,0.088326,-0.819337,-0.454579,0.356153,-0.557472,...,-1.274451,-0.617302,-1.006089,-0.111509,-0.049461,-0.378002,-0.272504,-1.100389,223.955088,0
2,0.732790,-1.225890,0.803999,0.630151,-0.155294,1.731972,-0.228019,-0.428697,-1.212843,-1.495409,...,-0.850111,-0.419474,0.704455,-2.476381,0.915677,-0.381198,-0.859559,1.133733,68.270615,0
3,-2.148413,-0.959946,1.167638,-0.381444,-0.441138,-0.491070,2.123925,0.036907,-1.254765,1.531039,...,-0.733710,-1.412697,1.254380,-2.375135,1.454008,-0.249748,-0.784837,0.275770,-46.191135,0
4,0.180439,-0.618900,0.690793,-0.730159,-0.031118,0.666095,0.388600,-0.803261,3.165570,0.341353,...,-0.503898,1.015217,-0.549825,1.177171,-0.006047,0.946480,0.917034,1.278427,-146.355557,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,-0.551606,0.014293,0.719035,-1.042429,0.435986,1.452370,-0.770432,0.049307,-0.986318,-0.685189,...,1.445832,-1.158087,-2.140672,-0.608979,-1.160033,-0.339236,-1.381136,-1.366617,122.573397,4
14996,0.427104,-0.304353,-1.215594,0.923588,-0.491588,0.670341,-0.872101,-0.369247,0.533079,2.198419,...,-0.237433,-1.653856,0.649376,-0.042022,0.908271,-1.071233,1.419309,0.774232,-157.745041,4
14997,-1.369883,-1.746918,-1.118974,2.181243,-1.352230,-0.166475,-0.518489,-2.451960,2.509184,-0.195788,...,-0.399592,0.508355,-2.081973,1.196690,-1.004753,-1.263057,-1.368328,1.683615,-313.757479,4
14998,1.941241,-1.123166,0.351293,1.082255,-0.913079,1.867291,-1.049594,-1.530009,1.973280,0.581435,...,0.541784,-1.333135,-0.225057,-0.866801,1.646691,-0.074402,1.475987,1.106762,495.615152,4


In [6]:
# the index is only the 0..n-1 row counter left by reset_index in
# create_folds, so it is not written; otherwise it would come back as an
# extra "Unnamed: 0" column when the file is loaded with pd.read_csv
df.to_csv('create.csv', index=False)