In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from imblearn.over_sampling import SMOTE 
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA,TruncatedSVD
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import string
import matplotlib.pyplot as plt
%matplotlib inline 


Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Loading data:</b> We load the dataset necessary for analysis
</div>

In [2]:
path_of_input_file = 'D:\\kaggle_trials\\mlcourse\\winequality-white.csv'
df                 = pd.read_csv(path_of_input_file)
df.head(4)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
print('We can clearly see that every value is numerical and hence only scaling will be needed for preprocessing steps')

We can clearly see that every value is numerical and hence only scaling will be needed for preprocessing steps


<div class="alert alert-block alert-info">
<b>Unbalanced data:</b> We can clearly see that the data in unbalanced
</div>

In [4]:
num_labels = df['quality'].unique()
print('The number of labels are ',len(num_labels))

The number of labels are  7


In [5]:
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['quality']==num_labels[i]]))
print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')

The number of  6  labels are :-  2198
The number of  5  labels are :-  1457
The number of  7  labels are :-  880
The number of  8  labels are :-  175
The number of  4  labels are :-  163
The number of  3  labels are :-  20
The number of  9  labels are :-  5
We dont have a balanced dataset and hence we need to perform imbalanced dataset handling


<div class="alert alert-block alert-info">
<b>Preprocessing steps:</b> We preprocess the data and make the data balanced
</div>

In [46]:
cols_needed       = df.columns
columns_to_scale  = cols_needed[:-1]
scaler            = MinMaxScaler()
scaled_columns    = scaler.fit_transform(df[columns_to_scale]) 
X_processed_data  = scaled_columns
lb                = LabelEncoder()
Y                 = lb.fit_transform(df['quality'].values)

In [47]:
pca       = PCA(n_components=10)
X_reduced = pca.fit_transform(X_processed_data)

In [48]:
sm           = SMOTE(random_state=42,k_neighbors=4)
X_res, Y_res = sm.fit_resample(X_reduced, Y)

In [49]:
for i in range(len(num_labels)):
    print('The number of ', i ,' labels are :- ',
          [np.array_equal(Y_res[j],i) for j in range(len(Y_res))].count(True))

The number of  0  labels are :-  2198
The number of  1  labels are :-  2198
The number of  2  labels are :-  2198
The number of  3  labels are :-  2198
The number of  4  labels are :-  2198
The number of  5  labels are :-  2198
The number of  6  labels are :-  2198


<div class="alert alert-block alert-info">
<b>Train-test split:</b> train test split of data is performed
</div>

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

In [51]:
decision_tree_grid = {'criterion' : hp.choice('criterion',['gini','entropy']),
                      'max_depth' : hp.choice('max_depth',range(1,150)),
                      'min_samples_split' : hp.choice('min_samples_split',range(2,30)),
                      'min_samples_leaf'  : hp.uniform('min_samples_leaf',0.1,0.5),
                      'max_features'      : hp.choice('max_features',range(1,10))
                      
                     }

In [52]:
def hyperopt_train_test(params):
    clf = DecisionTreeClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, decision_tree_grid, algo=tpe.suggest, max_evals=500, trials=trials)
best_parameters = space_eval(decision_tree_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|████████████████████████████████████████████████| 500/500 [00:26<00:00, 19.15it/s, best loss: -0.4014437049772785]
The best parameter tuned on training set is given by :-  {'criterion': 'gini', 'max_depth': 128, 'max_features': 6, 'min_samples_leaf': 0.10042508293788373, 'min_samples_split': 5}


In [53]:
model = DecisionTreeClassifier(**best_parameters)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=128,
                       max_features=6, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.10042508293788373,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=None, splitter='best')

In [54]:
y_pred = model.predict(X_test)

In [55]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.35      0.47      0.40       532
           1       0.60      0.34      0.43      1327
           2       0.20      0.23      0.21       644
           3       0.00      0.00      0.00         0
           4       0.31      0.31      0.31       715
           5       0.35      0.47      0.40       541
           6       0.89      0.47      0.62      1319

    accuracy                           0.38      5078
   macro avg       0.39      0.33      0.34      5078
weighted avg       0.53      0.38      0.43      5078

