### Import libraries

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

### Read data

In [2]:
data = pd.read_csv('winequality-white.csv', sep = ';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


### Check missing values

In [3]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Create labels for data

In [4]:
data['goodQuality'] = [1 if x >= 7 else 0 for x in data['quality']]

Separate X and y

In [5]:
X = data.drop(['quality', 'goodQuality'], axis = 1)
y = data['goodQuality']

Scale data

In [6]:
X = preprocessing.scale(X)

Split training and test data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Fit model and predict

In [8]:
model = DecisionTreeClassifier(max_depth = 10, min_samples_split = 2, min_samples_leaf = 1)
model.fit(X_train, y_train)
pred = model.predict(X_test)

Check performance of model

In [9]:
print('CONFUSION MATRIX')
print(confusion_matrix(y_test, pred))

CONFUSION MATRIX
[[684  66]
 [100 130]]
