# Simple kfold 

In [None]:
# import pandas and model selection module of scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Training data is in a CSV file called train.csv
    df = pd.read_csv("train.csv")

    # Create the fold-assignment column; -1 marks "not assigned yet"
    df["kfold"] = -1

    # Shuffle the rows. KFold below is created without shuffle=True, so this
    # is the only step that randomises which rows end up in which fold.
    # reset_index makes the index 0..n-1 again, which lets the positional
    # indices returned by split() be used directly as .loc labels.
    df = df.sample(frac=1).reset_index(drop=True)

    # Initiate the kfold class from model_selection
    kf = model_selection.KFold(n_splits=5)

    # Label every row with the number of the fold in which it is held out.
    # The column written here must be the same "kfold" column created above;
    # writing to a differently named column would leave "kfold" at -1.
    for fold, (_, val_idx) in enumerate(kf.split(X=df)):
        df.loc[val_idx, "kfold"] = fold

    # Save the new csv with the kfold column
    df.to_csv("train_folds.csv", index=False)

# Stratified kfold

In [None]:
# import pandas and model selection module of scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Training data is in a CSV file called train.csv
    df = pd.read_csv("train.csv")

    # Create the fold-assignment column; -1 marks "not assigned yet"
    df["kfold"] = -1

    # Shuffle the rows and reset the index to 0..n-1 so that the positional
    # indices returned by split() can be used directly as .loc labels.
    df = df.sample(frac=1).reset_index(drop=True)

    # Class labels used for stratification.
    # NOTE(review): assumes the label column in train.csv is named "target";
    # adjust the column name to match the actual file.
    y = df["target"].values

    # Initiate the stratified kfold class from model_selection
    kf = model_selection.StratifiedKFold(n_splits=5)

    # Label every row with the number of the fold in which it is held out;
    # stratification is driven by the labels in y.
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold

    # Save the new csv with the kfold column
    df.to_csv("train_folds.csv", index=False)

# Stratified kfold for regression data

In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection

def create_folds(data, n_splits=5, random_state=None):
    """Return a shuffled copy of *data* with a stratified ``kfold`` column.

    The ``target`` column is cut into equal-width bins (bin count chosen by
    Sturges' rule, rounded down) and the bin ids are passed to
    ``StratifiedKFold`` as the stratification labels, so a continuous target
    can be stratified like a categorical one.

    Args:
        data: DataFrame containing a ``target`` column. It is not modified.
        n_splits: Number of folds to create. Defaults to 5.
        random_state: Seed forwarded to ``DataFrame.sample`` for a
            reproducible shuffle. ``None`` (the default) shuffles
            differently on every call.

    Returns:
        A new DataFrame with the rows shuffled, the index reset to
        ``0..n-1`` and a ``kfold`` column holding the fold number in which
        each row is held out.

    Raises:
        ValueError: If *data* has no rows.
    """
    if len(data) == 0:
        # Sturges' rule needs log2(len(data)); fail early with a clear error.
        raise ValueError("cannot create folds for an empty DataFrame")

    # Shuffle first: sample() returns a new frame, so every assignment below
    # lands on the copy and the caller's DataFrame is left untouched.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Fold-assignment column; -1 marks "not assigned yet"
    data["kfold"] = -1

    # Number of bins by Sturges' rule (floor of the value; rounding also works)
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin the targets. The bin ids are kept in a local Series rather than a
    # temporary "bins" column, so an existing column of that name survives.
    bins = pd.cut(data["target"], bins=num_bins, labels=False)

    # Initiate the kfold class from the model_selection module
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # Fill the kfold column; note that the bins, not the raw targets, are
    # used as the stratification labels.
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[val_idx, "kfold"] = fold

    # Return the dataframe with folds
    return data


if __name__ == "__main__":
    # Synthetic regression problem: 15000 rows, 100 features, a single target
    X, y = datasets.make_regression(
        n_samples=15000,
        n_features=100,
        n_targets=1,
    )

    # Wrap the feature matrix in a DataFrame, one "f_<i>" column per feature,
    # and attach the target as an extra column
    feature_names = [f"f_{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    df["target"] = y

    # Add the stratified fold assignments
    df = create_folds(df)

