# Building Wine Classifier Model with PyCaret

In [20]:
import pandas as pd
import numpy as np

In [21]:
wine_df = pd.read_csv('wine_qual.csv', sep=';')

In [22]:
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,5.0,0.38,0.01,1.6,0.048,26.0,60.0,0.99084,3.7,0.75,14.0,6
1,6.4,0.41,0.01,6.1,0.048,20.0,70.0,0.99362,3.19,0.42,10.0,5
2,6.4,0.41,0.01,6.1,0.048,20.0,70.0,0.99362,3.19,0.42,10.0,5
3,6.7,0.41,0.01,2.8,0.048,39.0,137.0,0.9942,3.24,0.35,9.5,5
4,5.1,0.42,0.01,1.5,0.017,25.0,102.0,0.9894,3.38,0.36,12.3,7


## Transform quality feature into binary (Good or Bad)

In [23]:
wine_df.quality = np.where(wine_df.quality >= 6,'Good', 'Bad')

In [24]:
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,5.0,0.38,0.01,1.6,0.048,26.0,60.0,0.99084,3.7,0.75,14.0,Good
1,6.4,0.41,0.01,6.1,0.048,20.0,70.0,0.99362,3.19,0.42,10.0,Bad
2,6.4,0.41,0.01,6.1,0.048,20.0,70.0,0.99362,3.19,0.42,10.0,Bad
3,6.7,0.41,0.01,2.8,0.048,39.0,137.0,0.9942,3.24,0.35,9.5,Bad
4,5.1,0.42,0.01,1.5,0.017,25.0,102.0,0.9894,3.38,0.36,12.3,Good


In [25]:
wine_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                  object
quality                  object
dtype: object

## Compare Model with Default Setup

In [8]:
from pycaret.classification import *

  import mlflow


In [9]:
exp_clf101 = setup(data = wine_df, target = 'quality', session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(6313, 12)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.816,0.8912,0.8881,0.8304,0.8582,0.5972,0.6002,0.723
et,Extra Trees Classifier,0.8092,0.8892,0.8809,0.8263,0.8527,0.5828,0.5853,0.857
lightgbm,Light Gradient Boosting Machine,0.8038,0.861,0.869,0.8271,0.8474,0.5732,0.575,0.171
gbc,Gradient Boosting Classifier,0.7665,0.8312,0.8635,0.7855,0.8226,0.4832,0.4881,0.809
dt,Decision Tree Classifier,0.7495,0.7324,0.7996,0.8009,0.8,0.4647,0.4653,0.082
ada,Ada Boost Classifier,0.7352,0.7948,0.8314,0.7663,0.7974,0.4171,0.4203,0.338
ridge,Ridge Classifier,0.733,0.0,0.8296,0.7646,0.7955,0.4123,0.4158,0.081
lda,Linear Discriminant Analysis,0.7327,0.796,0.8242,0.767,0.7944,0.4139,0.4167,0.193
lr,Logistic Regression,0.7307,0.7962,0.835,0.7594,0.7953,0.4043,0.4088,2.771
knn,K Neighbors Classifier,0.6454,0.6572,0.7715,0.696,0.7316,0.2129,0.2157,0.225


## Compare Models with Tuned Setup

In [11]:
exp_clf102 = setup(data = wine_df, target = 'quality', session_id=123,
                  normalize = True, 
                  transformation = True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(6313, 12)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [12]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8194,0.8918,0.8964,0.8295,0.8615,0.603,0.6073,0.711
et,Extra Trees Classifier,0.807,0.8907,0.8845,0.8215,0.8517,0.5763,0.5797,0.928
lightgbm,Light Gradient Boosting Machine,0.8011,0.8619,0.8668,0.825,0.8452,0.5673,0.5691,0.16
gbc,Gradient Boosting Classifier,0.7667,0.8306,0.8632,0.7861,0.8227,0.4837,0.4885,0.809
dt,Decision Tree Classifier,0.7502,0.7319,0.804,0.7991,0.8012,0.4648,0.4655,0.069
lr,Logistic Regression,0.7425,0.8012,0.8339,0.7731,0.8023,0.4345,0.4374,0.221
lda,Linear Discriminant Analysis,0.7422,0.8017,0.8195,0.7803,0.7993,0.4397,0.4412,0.189
ridge,Ridge Classifier,0.7407,0.0,0.8253,0.7754,0.7994,0.4335,0.4358,0.04
ada,Ada Boost Classifier,0.7355,0.795,0.8318,0.7664,0.7976,0.4175,0.4208,0.313
knn,K Neighbors Classifier,0.7244,0.7686,0.8137,0.7627,0.7873,0.397,0.3989,0.495


## Build Extra Tree Classifier

In [13]:
et_model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8235,0.8954,0.9206,0.8199,0.8673,0.6064,0.6156
1,0.819,0.8976,0.8917,0.8316,0.8606,0.6035,0.6064
2,0.8235,0.8862,0.9134,0.8241,0.8664,0.6084,0.6155
3,0.8258,0.9002,0.8736,0.8521,0.8627,0.6245,0.6248
4,0.8258,0.9029,0.9025,0.8333,0.8666,0.6169,0.621
5,0.8348,0.9111,0.9134,0.8377,0.8739,0.6359,0.6409
6,0.81,0.8754,0.8953,0.8185,0.8552,0.5805,0.5854
7,0.8167,0.877,0.8736,0.8403,0.8566,0.603,0.6039
8,0.8167,0.8984,0.8989,0.8245,0.8601,0.596,0.6006
9,0.7982,0.874,0.8809,0.8133,0.8458,0.5553,0.559


## Evaluate Model

In [14]:
evaluate_model(et_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Predict Test Set 

In [15]:
predict_model(et_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8273,0.8957,0.8978,0.8496,0.873,0.6041,0.6063


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,alcohol_9.6,alcohol_9.7,alcohol_9.733.333.333,alcohol_9.75,alcohol_9.8,alcohol_9.9,alcohol_9.95,quality,Label,Score
0,-0.071308,0.057563,-0.126697,1.212698,-0.763646,-1.853874,-1.580655,-0.227286,-0.915934,-1.982798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good,Good,0.62
1,-0.647117,-0.548850,-0.361317,-1.055695,-0.527818,0.378900,-0.240729,-0.306506,-0.493224,-2.254758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good,Good,0.94
2,-1.185126,-1.444092,1.176645,-1.310959,-1.018251,-0.162722,-0.223021,-0.222612,0.039912,-0.775293,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Good,Good,0.70
3,0.816339,1.451921,0.523586,-0.053698,-1.018251,-0.435059,0.061862,-0.231362,0.531771,1.055915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good,Good,0.75
4,-0.288311,-0.548850,-0.126697,-0.070876,-0.380822,3.104846,1.097813,-0.225823,-1.133397,0.629255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good,Good,0.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,2.459856,1.635418,1.239182,-1.105445,0.958305,-1.682425,-1.614985,-0.052351,-0.915934,0.687927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bad,Bad,0.61
1890,1.670552,1.385184,-1.195310,-0.426846,1.322553,-0.577732,-1.149815,6.688702,-0.424521,0.744855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bad,Bad,0.78
1891,-0.647117,-0.437340,1.424331,0.654264,-0.682971,1.495593,2.117936,-0.121277,-1.206711,-0.775293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bad,Bad,0.62
1892,-0.177543,-0.437340,0.172061,0.200280,-1.292041,0.323590,0.277520,-0.202228,0.039912,-1.226984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Good,Good,0.90


## Save Model

In [17]:
save_model(et_model, model_name = 'extra_tree_model_1')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='auto',
                                         max_l