In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import model_selection
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../data/winequality-red.csv")
df = df.sample(frac=1).reset_index(drop=True) #shuffling

pd.set_option('display.max_rows', 1000000) 

<font size=5> Random K Fold </font>

In [3]:
#########################################################
######                RANDOM K FOLD                 #####
#########################################################
kfold = model_selection.KFold(n_splits=5)

for fold, (train_idx, val_idx) in enumerate(kfold.split(X=df)):
    df.loc[val_idx, 'fold'] = fold

if not os.path.exists("./output"):
    os.makedirs("./output")
    
df.to_csv("./output/train_random_folds.csv")

<font size=5> Stratified K Fold </font>

In [4]:
#########################################################
######              Stratified K Fold               #####
#########################################################

kfold = model_selection.StratifiedKFold(n_splits=5)

for fold, (train_idx, val_idx) in enumerate(kfold.split(X=df, y=df['quality'])):
    df.loc[val_idx, 'fold'] = fold

if not os.path.exists("./output"):
    os.makedirs("./output")
df.to_csv("./output/train_stratified_folds_.csv")

<font size=5> Stratified K Fold for Regression Problems </font>

In [5]:
#########################################################
######       Stratified K Fold Regression           #####
#########################################################

kfold = model_selection.StratifiedKFold(n_splits=5)

df['target'] = np.random.normal(50000, 23000, df.shape[0])

num_bins = 1 + np.log2(df.shape[0]) ##Sturge’s rule:

df['bins'] = pd.cut(df['target'], bins=int(num_bins), labels=False)
print(f'df bin values : {df.bins.value_counts()}')


for fold, (train_idx, val_idx) in enumerate(kfold.split(X=df, y=df.bins.values)):
    df.loc[val_idx, 'fold'] = fold

if not os.path.exists("./output"):
    os.makedirs("./output")
df.to_csv("./output/train_stratified_regression_folds.csv")

df bin values : 5     380
6     370
4     277
7     222
3     139
8     109
2      57
9      27
10      9
1       7
0       2
Name: bins, dtype: int64




<font size=5> Stratified Nested Cross Validation </font>

In [6]:
df_dict = {}
outter_kfold = model_selection.StratifiedKFold(n_splits=3)

inner_kfold = model_selection.StratifiedKFold(n_splits=5)

for fold, (train_idx, val_idx) in enumerate(outter_kfold.split(X=df, y=df.bins.values)):
    df.loc[val_idx, 'outter_fold'] = fold
    
for outter_fold in df['outter_fold'].unique():
    df_temp_train = df[df['outter_fold'] != outter_fold]
    df_temp_test = df[df['outter_fold'] == outter_fold]
    
    df_temp_train.reset_index(inplace=True)
    del df_temp_train['index']
    
    df_temp_test.reset_index(inplace=True)
    del df_temp_test['index']
    
    df_temp_test['inner_fold'] = 'TEST'
    
    print(df_temp_train.index)
    
    for fold, (train_idx, val_idx) in enumerate(inner_kfold.split(X=df_temp_train, y=df_temp_train.bins.values)):
        df_temp_train.loc[val_idx, 'inner_fold'] = fold
    
    df_dict['outter_fold_' + str(outter_fold) + '_train'] = df_temp_train
    df_dict['outter_fold_' + str(outter_fold) + '_test'] = df_temp_test
        

RangeIndex(start=0, stop=1066, step=1)
RangeIndex(start=0, stop=1066, step=1)
RangeIndex(start=0, stop=1066, step=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp_test['inner_fold'] = 'TEST'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [7]:
for key, df in df_dict.items():
    print(f'outter_fold: {key}')
    print(df[['inner_fold', 'bins']].groupby('inner_fold')['bins'].value_counts())

outter_fold: outter_fold_0.0_train
inner_fold  bins
0.0         5       51
            6       50
            4       37
            7       29
            3       18
            8       14
            2        8
            9        4
            0        1
            1        1
            10       1
1.0         5       51
            6       50
            4       36
            7       29
            3       19
            8       14
            2        8
            9        3
            10       2
            0        1
2.0         5       51
            6       49
            4       37
            7       30
            3       19
            8       15
            2        7
            9        3
            1        1
            10       1
3.0         5       50
            6       49
            4       37
            7       30
            3       19
            8       15
            2        7
            9        4
            1        1
            10       1
4.0  

In [8]:
df_dict.keys()

dict_keys(['outter_fold_0.0_train', 'outter_fold_0.0_test', 'outter_fold_1.0_train', 'outter_fold_1.0_test', 'outter_fold_2.0_train', 'outter_fold_2.0_test'])