# 02: Cross-validation

## p23 implementation of cross-validation

In [6]:
# import pandas and model_selection module of scikit-learn 
import pandas as pd 
from sklearn import model_selection 

In [7]:
# Training data is in a CSV file called train.csv 
df = pd.read_csv("./data/train.csv") 

In [8]:
df

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,6.1,0.560,0.00,2.2,0.079,6.0,9.0,0.99480,3.59,0.54,11.5,3
1,1,7.9,0.430,0.21,1.6,0.106,10.0,37.0,0.99660,3.17,0.91,9.5,2
2,2,10.5,0.390,0.46,2.2,0.075,14.0,27.0,0.99598,3.06,0.84,11.4,3
3,3,7.2,0.340,0.24,2.0,0.071,30.0,52.0,0.99576,3.44,0.58,10.1,2
4,4,6.5,0.460,0.14,2.4,0.114,9.0,37.0,0.99732,3.66,0.65,9.8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,7.1,0.590,0.01,2.5,0.077,20.0,85.0,0.99746,3.55,0.59,9.8,2
996,996,5.6,0.850,0.05,1.4,0.045,12.0,88.0,0.99240,3.56,0.82,12.9,5
997,997,6.7,0.670,0.02,1.9,0.061,26.0,42.0,0.99489,3.39,0.82,10.9,3
998,998,9.9,0.440,0.46,2.2,0.091,10.0,41.0,0.99638,3.18,0.69,11.9,3


In [9]:
# we create a new column called kfold and fill it with -1 
df["kfold"] = -1 

In [10]:
df

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold
0,0,6.1,0.560,0.00,2.2,0.079,6.0,9.0,0.99480,3.59,0.54,11.5,3,-1
1,1,7.9,0.430,0.21,1.6,0.106,10.0,37.0,0.99660,3.17,0.91,9.5,2,-1
2,2,10.5,0.390,0.46,2.2,0.075,14.0,27.0,0.99598,3.06,0.84,11.4,3,-1
3,3,7.2,0.340,0.24,2.0,0.071,30.0,52.0,0.99576,3.44,0.58,10.1,2,-1
4,4,6.5,0.460,0.14,2.4,0.114,9.0,37.0,0.99732,3.66,0.65,9.8,2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,7.1,0.590,0.01,2.5,0.077,20.0,85.0,0.99746,3.55,0.59,9.8,2,-1
996,996,5.6,0.850,0.05,1.4,0.045,12.0,88.0,0.99240,3.56,0.82,12.9,5,-1
997,997,6.7,0.670,0.02,1.9,0.061,26.0,42.0,0.99489,3.39,0.82,10.9,3,-1
998,998,9.9,0.440,0.46,2.2,0.091,10.0,41.0,0.99638,3.18,0.69,11.9,3,-1


In [11]:
# the next step is to randomize the rows of the data 
df = df.sample(frac=1).reset_index(drop=True) 

In [12]:
df

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold
0,903,7.0,0.620,0.10,1.4,0.071,27.0,63.0,0.99600,3.28,0.61,9.2,2,-1
1,668,8.2,0.700,0.23,2.0,0.099,14.0,81.0,0.99730,3.19,0.70,9.4,2,-1
2,64,8.3,0.625,0.20,1.5,0.080,27.0,119.0,0.99720,3.16,1.12,9.1,1,-1
3,349,11.0,0.200,0.48,2.0,0.343,6.0,18.0,0.99790,3.30,0.71,10.5,2,-1
4,15,6.2,0.450,0.20,1.6,0.069,3.0,15.0,0.99580,3.41,0.56,9.2,2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,873,7.4,0.640,0.07,1.8,0.100,8.0,23.0,0.99610,3.30,0.58,9.6,2,-1
996,660,6.8,0.500,0.11,1.5,0.075,16.0,49.0,0.99545,3.36,0.79,9.5,2,-1
997,930,7.1,0.750,0.01,2.2,0.059,11.0,18.0,0.99242,3.39,0.40,12.8,3,-1
998,329,6.2,0.460,0.17,1.6,0.073,7.0,11.0,0.99425,3.61,0.54,11.4,2,-1


In [13]:
# initiate the kfold class from model_selection module 
kf = model_selection.KFold(n_splits=5) 

In [14]:
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [15]:
# fill the new kfold column 
for fold, (trn_, val_) in enumerate(kf.split(X = df)): 
    df.loc[val_, 'kfold'] = fold 

In [16]:
df

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold
0,903,7.0,0.620,0.10,1.4,0.071,27.0,63.0,0.99600,3.28,0.61,9.2,2,0
1,668,8.2,0.700,0.23,2.0,0.099,14.0,81.0,0.99730,3.19,0.70,9.4,2,0
2,64,8.3,0.625,0.20,1.5,0.080,27.0,119.0,0.99720,3.16,1.12,9.1,1,0
3,349,11.0,0.200,0.48,2.0,0.343,6.0,18.0,0.99790,3.30,0.71,10.5,2,0
4,15,6.2,0.450,0.20,1.6,0.069,3.0,15.0,0.99580,3.41,0.56,9.2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,873,7.4,0.640,0.07,1.8,0.100,8.0,23.0,0.99610,3.30,0.58,9.6,2,4
996,660,6.8,0.500,0.11,1.5,0.075,16.0,49.0,0.99545,3.36,0.79,9.5,2,4
997,930,7.1,0.750,0.01,2.2,0.059,11.0,18.0,0.99242,3.39,0.40,12.8,3,4
998,329,6.2,0.460,0.17,1.6,0.073,7.0,11.0,0.99425,3.61,0.54,11.4,2,4


In [18]:
# save the new csv with kfold column  
df.to_csv("./data/train_folds.csv", index = False) 

# Execution by a module (the case to use "if_name main")

In [20]:
%run -i k-fold-cross-validation.py

In [21]:
%run -i StratifiedKFold.py