# Modeling

In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

In [4]:
df = pd.read_csv('winequality-red-scaled2.csv')

In [5]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,quality,alcohol_low,alcohol_medium,alcohol_high
0,-0.524431,0.932,-1.393258,-0.461157,-0.245623,-0.468554,-0.38405,0.584003,1.291872,-0.578561,-0.395722,0.792177,-0.730657,-0.193403
1,-0.294063,1.9158,-1.393258,0.056665,0.200094,0.872003,0.604073,0.048737,-0.708395,0.124822,-0.395722,0.792177,-0.730657,-0.193403
2,-0.294063,1.259934,-1.188617,-0.165259,0.078535,-0.085537,0.214813,0.15579,-0.321247,-0.051024,-0.395722,0.792177,-0.730657,-0.193403
3,1.664067,-1.363534,1.471711,-0.461157,-0.265883,0.105971,0.394471,0.691057,-0.966495,-0.461331,-0.395722,0.792177,-0.730657,-0.193403
4,-0.524431,0.713378,-1.393258,-0.535132,-0.265883,-0.277045,-0.204391,0.584003,1.291872,-0.578561,-0.395722,0.792177,-0.730657,-0.193403


In [6]:
df['alcohol_high'].value_counts()

-0.193403    1310
 5.170560      49
Name: alcohol_high, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1359 non-null   float64
 1   volatile acidity      1359 non-null   float64
 2   citric acid           1359 non-null   float64
 3   residual sugar        1359 non-null   float64
 4   chlorides             1359 non-null   float64
 5   free sulfur dioxide   1359 non-null   float64
 6   total sulfur dioxide  1359 non-null   float64
 7   density               1359 non-null   float64
 8   pH                    1359 non-null   float64
 9   sulphates             1359 non-null   float64
 10  quality               1359 non-null   float64
 11  alcohol_low           1359 non-null   float64
 12  alcohol_medium        1359 non-null   float64
 13  alcohol_high          1359 non-null   float64
dtypes: float64(14)
memory usage: 148.8 KB


In [8]:
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
df['quality'] = labelencoder_y.fit_transform(df['quality'])

In [11]:
from collections import Counter
print(Counter(y))

Counter({0: 1175, 1: 1175})


# Train Test Split

In [9]:
X = df.drop('quality', axis = 1).values
y = df['quality'].values.reshape(-1,1)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)

Shape of X_train:  (1880, 13)
Shape of X_test:  (470, 13)
Shape of y_train:  (1880,)
Shape of y_test (470,)


# SMOTE for Balancing Data

In [10]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# 1. Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression(C=1, fit_intercept=True, max_iter=1000, penalty = 'l2',solver='liblinear')
classifier_lr.fit(X_train, y_train.ravel())

LogisticRegression(C=1, max_iter=1000, solver='liblinear')

In [14]:
# Predicting Cross Validation Score

cv_lr = cross_val_score(estimator = classifier_lr, X = X_train, y = y_train.ravel(), cv = 10)
print("CV: ", cv_lr.mean())

y_pred_lr_train = classifier_lr.predict(X_train)
accuracy_lr_train = accuracy_score(y_train, y_pred_lr_train)
print("Training set: ", accuracy_lr_train)

y_pred_lr_test = classifier_lr.predict(X_test)
accuracy_lr_test = accuracy_score(y_test, y_pred_lr_test)
print("Test set: ", accuracy_lr_test)

CV:  0.8069148936170214
Training set:  0.8095744680851064
Test set:  0.7808510638297872


In [15]:
confusion_matrix(y_test, y_pred_lr_test)

array([[172,  60],
       [ 43, 195]], dtype=int64)

In [16]:
tp_lr = confusion_matrix(y_test, y_pred_lr_test)[0,0]
fp_lr = confusion_matrix(y_test, y_pred_lr_test)[0,1]
tn_lr = confusion_matrix(y_test, y_pred_lr_test)[1,1]
fn_lr = confusion_matrix(y_test, y_pred_lr_test)[1,0]

# 2. Random Forest Classification

In [17]:
# Fitting Random Forest Classification to the Training se
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(criterion = 'entropy', max_features = 4, n_estimators = 800, random_state=42)
classifier_rf.fit(X_train, y_train.ravel())

RandomForestClassifier(criterion='entropy', max_features=4, n_estimators=800,
                       random_state=42)

In [18]:
cv_rf = cross_val_score(estimator = classifier_rf, X = X_train, y = y_train.ravel(), cv = 10)
print("CV: ", cv_rf.mean())

y_pred_rf_train = classifier_rf.predict(X_train)
accuracy_rf_train = accuracy_score(y_train, y_pred_rf_train)
print("Training set: ", accuracy_rf_train)

y_pred_rf_test = classifier_rf.predict(X_test)
accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test)
print("Test set: ", accuracy_rf_test)

CV:  0.9154255319148936
Training set:  1.0
Test set:  0.9340425531914893


In [19]:
confusion_matrix(y_test, y_pred_rf_test)

array([[207,  25],
       [  6, 232]], dtype=int64)

In [20]:
tp_rf = confusion_matrix(y_test, y_pred_rf_test)[0,0]
fp_rf = confusion_matrix(y_test, y_pred_rf_test)[0,1]
tn_rf = confusion_matrix(y_test, y_pred_rf_test)[1,1]
fn_rf = confusion_matrix(y_test, y_pred_rf_test)[1,0]

# 3. Xgboost Classifier

In [21]:
# ! pip3 install xgboost

In [22]:
import xgboost as xgb

In [23]:
wine_dmatrix = xgb.DMatrix(data=X_train,label=y_train.ravel())

In [24]:
import decimal

def float_range(start, stop, step):
  while start <= stop:
    yield float(start)
    start += decimal.Decimal(step)

In [32]:
gbm_param_grid = {'learning_rate': [0.05,0.1,0.15,0.20,0.25],
'max_depth': [3,4,5,6,8,10,12,15],
'min_child_weight':[1,3,5,7],
'gamma':[0.0,0.1,0.2,0.3,0.4],
'colsample_bytree':[0.3,0.4,0.5,0.7],
'n_estimators':[100,200,300]}

In [33]:
from sklearn.model_selection import RandomizedSearchCV
gbm = xgb.XGBClassifier(use_label_encoder=False)

In [34]:
grid_mse = RandomizedSearchCV(gbm,param_distributions=gbm_param_grid,n_iter=5,scoring='roc_auc',n_jobs=-1, cv=5, verbose=3)
grid_mse.fit(X, y)
print("Best parameters found: ",grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    2.7s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    2.9s finished


Best parameters found:  {'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 12, 'learning_rate': 0.25, 'gamma': 0.2, 'colsample_bytree': 0.7}
Lowest RMSE found:  0.9883852422274247


In [35]:
gbm_best = xgb.XGBClassifier(**grid_mse.best_params_)

In [36]:
gbm_best.fit(X_train,y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
y_pred_xgb = gbm_best.predict(X_test)

In [38]:
accuracy = accuracy_score(y_test, y_pred_xgb)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 93.40%
