In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import seaborn as sns
cwd = os.getcwd()


red = pd.read_csv(cwd + "\\winequality-red.csv", delimiter = ";")
red['wine'] = "red"
white = pd.read_csv(cwd + "\\winequality-white.csv", delimiter = ";")
white['wine'] = "white"


df = red.append(white)
df["wine"].replace({"red": "0", "white": "1"}, inplace=True) #replace string values with int values
df


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,1
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


In [10]:
def DecTreeOptimizer(X, y):
    print('**We will optimize the hyper-parameters of a Decision Tree model using Grid Search in Python**\n')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    #Creating the objects and combining them into a pipeline

    std_slc = StandardScaler()
    dec_tree = tree.DecisionTreeClassifier()
    pipe = Pipeline(steps=[('std_slc', std_slc),
                           ('dec_tree', dec_tree)])

    # Creating Parameter Space

    criterion = ['gini', 'entropy']
    max_depth = list(range(1,10))

    parameters = dict(dec_tree__criterion=criterion,
                      dec_tree__max_depth=max_depth)
                
    # Creating a grid search object
    tree_GS = GridSearchCV(pipe, parameters, n_jobs=-1, cv=5, verbose = 1)

    # Fitting the grid search
    tree_GS = tree_GS.fit(X_train, y_train)

    #Prediction and scores
    y_pred = tree_GS.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("Best score: " +  str(tree_GS.best_score_))\
    #print("ROC AUC SCORE:" + str(roc_auc_score(y_test, y_pred)))
    print("F1 Score: {}".format(f1_score(y_test,y_pred, average = 'weighted')))
    export_graphviz(
        tree_GS.best_estimator_[1],
        out_file=("dec_tree.dot"),
        feature_names=X.columns,
        class_names=y.unique().astype(str),
        filled=True)

DecTreeOptimizer(X = df.drop(['quality'], axis=1), y = df.quality)




**We will optimize the hyper-parameters of a Decision Tree model using Grid Search in Python**

Fitting 5 folds for each of 18 candidates, totalling 90 fits




[[  0   0   3   1   0   0   0]
 [  0   6  26  14   1   1   0]
 [  0   4 281 140   8   2   0]
 [  0   8 139 356  74   0   0]
 [  0   0  16 110  74   3   0]
 [  0   0   1  15  12   4   0]
 [  0   0   0   0   1   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.33      0.12      0.18        48
           5       0.60      0.65      0.62       435
           6       0.56      0.62      0.59       577
           7       0.44      0.36      0.40       203
           8       0.40      0.12      0.19        32
           9       0.00      0.00      0.00         1

    accuracy                           0.55      1300
   macro avg       0.33      0.27      0.28      1300
weighted avg       0.54      0.55      0.54      1300

Best score: 0.5451258606648405
F1 Score: 0.5426042577881675


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
y = df.quality
X = df.drop(['quality'], axis=1)
y.unique().astype(str)


array(['5', '6', '7', '4', '8', '3', '9'], dtype='<U21')