In [32]:
import numpy as np
from sklearn.model_selection import KFold

In [33]:
import pandas as pd

red_wine = pd.read_csv('../data/winequality-red.csv', sep = ';')
white_wine = pd.read_csv('../data/winequality-white.csv', sep = ';')

In [34]:
red_wine['color'] = 1.
white_wine['color'] = 0.

wine = pd.concat([red_wine, white_wine])

In [35]:
wine['taste'] = [1. if grade>5 else 0. for grade in wine['quality']]

X = wine.drop(['taste', 'quality'], axis =1)
y = wine['taste']

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 13)


In [38]:
wine_tree = DecisionTreeClassifier(max_depth = 2, random_state= 13)
wine_tree.fit(X_train, y_train)

In [39]:
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('train acc :' ,accuracy_score(y_train, y_pred_tr))
print('test acc :' ,accuracy_score(y_test, y_pred_test))


train acc : 0.7294593034442948
test acc : 0.7161538461538461


In [40]:
Kfold = KFold(n_splits =5)
wine_tree_cv = DecisionTreeClassifier(max_depth =2, random_state=13)

In [41]:
for train_idx, test_idx in Kfold.split(X):
    print(len(train_idx), len(test_idx))    

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [42]:
cv_accuracy = []

for train_idx, test_idx in Kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(X_train, y_train)
    pred = wine_tree_cv.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
    
cv_accuracy
    
    

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [43]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skfold = StratifiedKFold(n_splits =5)
wine_tree_cv = DecisionTreeClassifier(max_depth =2, random_state=13)

cross_val_score(wine_tree_cv , X, y, scoring = None, cv= skfold)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [44]:
from sklearn.model_selection import cross_validate
cross_validate(wine_tree_cv, X, y, scoring = None, cv=skfold, return_train_score = True)

{'fit_time': array([0.0059979 , 0.00627494, 0.00539804, 0.00521684, 0.00492406]),
 'score_time': array([0.00143027, 0.0014782 , 0.00124884, 0.0011301 , 0.00109839]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}