# Modeling

In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score ,roc_curve,auc,f1_score, precision_recall_curve

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm 

In [4]:
# Load the wine-quality table; the path is relative to the notebook's working directory.
df = pd.read_csv('winequality-red-scaled2.csv')

In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,quality,alcohol_low,alcohol_medium,alcohol_high
0,-0.526823,0.967437,-1.396766,-0.468591,-0.286544,-0.472221,-0.389808,0.602517,1.336203,-0.58636,-0.395722,1.06463,-0.881176,-0.32246
1,-0.295502,1.9801,-1.396766,-0.08505,0.023186,0.885977,0.615217,0.052556,-0.727179,0.108276,-0.395722,1.06463,-0.881176,-0.32246
2,-0.295502,1.304992,-1.191446,-0.249425,-0.061286,-0.084165,0.219298,0.162548,-0.327815,-0.065383,-0.395722,1.06463,-0.881176,-0.32246
3,1.670732,-1.395443,1.477714,-0.468591,-0.300623,0.109864,0.40203,0.712509,-0.993422,-0.470587,-0.395722,1.06463,-0.881176,-0.32246
4,-0.526823,0.742401,-1.396766,-0.523383,-0.300623,-0.278193,-0.207076,0.602517,1.336203,-0.58636,-0.395722,1.06463,-0.881176,-0.32246


In [6]:
# Count how often each distinct value occurs in the `alcohol_high` column.
df['alcohol_high'].value_counts()

-0.322460    1231
 3.101159     128
Name: alcohol_high, dtype: int64

In [7]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1359 non-null   float64
 1   volatile acidity      1359 non-null   float64
 2   citric acid           1359 non-null   float64
 3   residual sugar        1359 non-null   float64
 4   chlorides             1359 non-null   float64
 5   free sulfur dioxide   1359 non-null   float64
 6   total sulfur dioxide  1359 non-null   float64
 7   density               1359 non-null   float64
 8   pH                    1359 non-null   float64
 9   sulphates             1359 non-null   float64
 10  quality               1359 non-null   float64
 11  alcohol_low           1359 non-null   float64
 12  alcohol_medium        1359 non-null   float64
 13  alcohol_high          1359 non-null   float64
dtypes: float64(14)
memory usage: 148.8 KB


In [8]:
# Preview the first five rows (same view as the earlier df.head() cell; df has not been modified in between).
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,quality,alcohol_low,alcohol_medium,alcohol_high
0,-0.526823,0.967437,-1.396766,-0.468591,-0.286544,-0.472221,-0.389808,0.602517,1.336203,-0.58636,-0.395722,1.06463,-0.881176,-0.32246
1,-0.295502,1.9801,-1.396766,-0.08505,0.023186,0.885977,0.615217,0.052556,-0.727179,0.108276,-0.395722,1.06463,-0.881176,-0.32246
2,-0.295502,1.304992,-1.191446,-0.249425,-0.061286,-0.084165,0.219298,0.162548,-0.327815,-0.065383,-0.395722,1.06463,-0.881176,-0.32246
3,1.670732,-1.395443,1.477714,-0.468591,-0.300623,0.109864,0.40203,0.712509,-0.993422,-0.470587,-0.395722,1.06463,-0.881176,-0.32246
4,-0.526823,0.742401,-1.396766,-0.523383,-0.300623,-0.278193,-0.207076,0.602517,1.336203,-0.58636,-0.395722,1.06463,-0.881176,-0.32246


In [9]:
from sklearn.preprocessing import LabelEncoder
# Replace the `quality` column with consecutive integer class labels (0, 1, ...).
# NOTE(review): the preview above shows `quality` as floats, so each distinct float
# becomes one class; the later training-label counts show exactly two classes (0 and 1).
# This overwrites df['quality'] in place.
labelencoder_y = LabelEncoder()
df['quality'] = labelencoder_y.fit_transform(df['quality'])

# Train Test Split

In [10]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns='quality').values
# Target as a single-column 2-D array; later cells flatten it with .ravel() where a 1-D vector is required.
y = df[['quality']].values

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each of the four resulting arrays.
for caption, array in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test", y_test),
):
    print(caption, array.shape)

Shape of X_train:  (1087, 13)
Shape of X_test:  (272, 13)
Shape of y_train:  (1087, 1)
Shape of y_test (272, 1)


In [12]:
# Class counts of the training labels.
pd.DataFrame(y_train).value_counts()

0    937
1    150
dtype: int64

# SMOTE for Balancing Data

In [13]:
# Oversample the minority class with SMOTE until it reaches 30% of the majority-class count.
# Only the training split is resampled; the test split is left untouched.
# The fixed seed makes the synthetic rows (and every score computed downstream) repeatable.
# NOTE: this overwrites X_train / y_train, so re-run the split cell before re-running this one.
oversample = SMOTE(sampling_strategy=0.3, random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [14]:
from collections import Counter
# Class counts of the training labels after resampling.
print(Counter(y_train))

Counter({0: 937, 1: 281})


# 1. Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression (C=1, with intercept) solved with liblinear.
classifier_lr = LogisticRegression(
    penalty='l2',
    C=1,
    fit_intercept=True,
    solver='liblinear',
    max_iter=1000,
)
classifier_lr.fit(X_train, y_train.ravel())

LogisticRegression(C=1, max_iter=1000, solver='liblinear')

In [16]:
# Logistic regression: mean 10-fold cross-validated score, then accuracy on the train and test splits.
# NOTE(review): X_train was oversampled before this cell, so the CV folds include synthetic SMOTE rows.

# Score and predict first ...
cv_lr = cross_val_score(classifier_lr, X_train, y_train.ravel(), cv=10)
y_pred_lr_train = classifier_lr.predict(X_train)
y_pred_lr_test = classifier_lr.predict(X_test)
accuracy_lr_train = accuracy_score(y_train, y_pred_lr_train)
accuracy_lr_test = accuracy_score(y_test, y_pred_lr_test)

# ... then report the three numbers together.
print("CV: ", cv_lr.mean())
print("Training set: ", accuracy_lr_train)
print("Test set: ", accuracy_lr_test)

CV:  0.8193740685543964
Training set:  0.8226600985221675
Test set:  0.8786764705882353


In [17]:
# Test-set confusion matrix for logistic regression (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_lr_test)

array([[219,  19],
       [ 14,  20]], dtype=int64)

# 2. Random Forest Classification

In [18]:
# Fit a random forest classifier to the training set.
from sklearn.ensemble import RandomForestClassifier

# 800 entropy-split trees, 4 candidate features per split, seeded for repeatability.
classifier_rf = RandomForestClassifier(
    n_estimators=800,
    criterion='entropy',
    max_features=4,
    random_state=42,
)
classifier_rf.fit(X_train, y_train.ravel())

RandomForestClassifier(criterion='entropy', max_features=4, n_estimators=800,
                       random_state=42)

In [None]:
# Random forest: mean 10-fold cross-validated score, then accuracy on the train and test splits.
# NOTE(review): X_train was oversampled before this cell, so the CV folds include synthetic SMOTE rows.
cv_rf = cross_val_score(classifier_rf, X_train, y_train.ravel(), cv=10)
y_pred_rf_train = classifier_rf.predict(X_train)
y_pred_rf_test = classifier_rf.predict(X_test)
accuracy_rf_train = accuracy_score(y_train, y_pred_rf_train)
accuracy_rf_test = accuracy_score(y_test, y_pred_rf_test)

print("CV: ", cv_rf.mean())
print("Training set: ", accuracy_rf_train)
print("Test set: ", accuracy_rf_test)

In [None]:
# Test-set confusion matrix for the random forest (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_rf_test)

In [None]:
# Test-set report for the random forest.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them reversed transposes the
# confusion matrix and swaps precision with recall in the classification report.
# ROC AUC is computed from the predicted probability of the positive class, not from hard 0/1 labels.
y_proba_rf_test = classifier_rf.predict_proba(X_test)[:, 1]
print("Test set accuracy : {}\nConfusion matrix :\n {}\nFull Report :\n{}\nroc_auc_score : {}".format(
    accuracy_score(y_test, y_pred_rf_test),
    confusion_matrix(y_test, y_pred_rf_test),
    classification_report(y_test, y_pred_rf_test),
    roc_auc_score(y_test, y_proba_rf_test),
))

# 3. XGBoost Classifier

In [None]:
# ! pip3 install xgboost

In [None]:
import xgboost as xgb

In [None]:
# Wrap the training features and flattened labels in an xgb.DMatrix.
# NOTE(review): `wine_dmatrix` is not referenced by any later cell shown here — confirm it is still needed.
wine_dmatrix = xgb.DMatrix(data=X_train,label=y_train.ravel())

In [None]:
import decimal

def float_range(start, stop, step):
    """Yield floats from ``start`` up to and including ``stop`` in increments of ``step``.

    All three arguments are converted to ``decimal.Decimal`` through their string
    form before any arithmetic. This keeps the running value exact in decimal
    (so ``float_range(0, 1, 0.1)`` ends on ``1.0``) and lets ``start``, ``stop``
    and ``step`` each be an int, float, numeric string or ``Decimal``.

    Args:
        start: First value to yield.
        stop: Inclusive upper bound.
        step: Positive increment between consecutive values.

    Yields:
        float: ``start``, ``start + step``, ... while the value is ``<= stop``.
        Nothing is yielded when ``start > stop``.

    Raises:
        ValueError: If ``step`` is not strictly positive (the loop would never
            terminate). Because this is a generator, the error is raised when
            iteration begins, not when the function is called.
    """
    step_dec = decimal.Decimal(str(step))
    if step_dec <= 0:
        raise ValueError("step must be positive")
    current = decimal.Decimal(str(start))
    limit = decimal.Decimal(str(stop))
    while current <= limit:
        yield float(current)
        current += step_dec

In [None]:
# Candidate hyper-parameter values for the XGBoost randomized search.
gbm_param_grid = dict(
    learning_rate=[0.05, 0.1, 0.15, 0.20, 0.25],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
    n_estimators=[100, 200, 300],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Base XGBoost classifier used as the estimator for the hyper-parameter search.
# NOTE(review): `use_label_encoder` appears to be deprecated in newer xgboost releases — confirm against the installed version.
gbm = xgb.XGBClassifier(use_label_encoder=False)

In [None]:
# Randomized hyper-parameter search: 5 sampled configurations, 5-fold CV, scored by ROC AUC.
# The search is fitted on the training split only, so the held-out test rows never influence
# which hyper-parameters are chosen; the seed makes the sampled configurations repeatable.
grid_mse = RandomizedSearchCV(gbm, param_distributions=gbm_param_grid, n_iter=5, scoring='roc_auc',
                              n_jobs=-1, cv=5, verbose=3, random_state=42)
grid_mse.fit(X_train, y_train.ravel())
print("Best parameters found: ", grid_mse.best_params_)
# best_score_ is the mean cross-validated ROC AUC (higher is better), not an error measure.
print("Best ROC AUC found: ", grid_mse.best_score_)

In [None]:
# Build a fresh, unfitted classifier from the best hyper-parameters found by the search.
gbm_best = xgb.XGBClassifier(**grid_mse.best_params_)

In [None]:
# Fit the tuned classifier on the training split.
gbm_best.fit(X_train,y_train)

In [None]:
# Predicted class labels for the test split.
y_pred_xgb = gbm_best.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred_xgb)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Test-set confusion matrix for the tuned XGBoost model (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred_xgb)

In [None]:
# Test-set report for the tuned XGBoost model.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them reversed transposes the
# confusion matrix and swaps precision with recall in the classification report.
# ROC AUC is computed from the predicted probability of the positive class, not from hard 0/1 labels.
y_proba_xgb_test = gbm_best.predict_proba(X_test)[:, 1]
print("Test set accuracy : {}\nConfusion matrix :\n {}\nFull Report :\n{}\nroc_auc_score : {}".format(
    accuracy_score(y_test, y_pred_xgb),
    confusion_matrix(y_test, y_pred_xgb),
    classification_report(y_test, y_pred_xgb),
    roc_auc_score(y_test, y_proba_xgb_test),
))

In [None]:
# ROC curve for the tuned XGBoost model on the test split.
# The curve is built from the predicted probability of the positive class: hard 0/1
# predictions give only one non-trivial threshold, which reduces the curve to a single
# operating point and misstates the AUC.
y_proba_xgb = gbm_best.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba_xgb)
roc_auc = roc_auc_score(y_test, y_proba_xgb)
plt.figure(figsize=(15, 6))
# The legend value is the area under the ROC curve.
plt.plot(fpr, tpr, label='AUC = %0.3f' % roc_auc)
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('FALSE POSITIVE RATE')
plt.ylabel('TRUE POSITIVE RATE')
plt.title('ROC curve for test data')
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC curve on the test split: tuned XGBoost model versus a constant "no skill" baseline.

# Baseline that scores every test row 0 (i.e. always predicts the majority class).
ns_probs = [0] * len(y_test)

# Model score: predicted probability of the positive class (second predict_proba column).
lr_probs = gbm_best.predict_proba(X_test)[:, 1]

# Area under each ROC curve.
ns_auc, lr_auc = roc_auc_score(y_test, ns_probs), roc_auc_score(y_test, lr_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('XGBOOST: ROC AUC=%.3f' % (lr_auc))

# Curve coordinates; the thresholds are not needed for plotting.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw both curves on one figure.
plt.figure(figsize=(15, 6))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='XGB')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.title("ROC_AUC curve for test data")
plt.show()

In [None]:
# Precision-recall curve for the tuned XGBoost model on the test split.
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)

# F1 comes from the hard predictions; the AUC here is the area under the precision-recall curve.
# Note that this reassigns `lr_auc`.
lr_f1 = f1_score(y_test, y_pred_xgb)
lr_auc = auc(lr_recall, lr_precision)

# Baseline precision: the share of positive rows in the test labels.
positive_rows = y_test[y_test == 1]
no_skill = len(positive_rows) / len(y_test)

print('XGBOOST: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))

plt.figure(figsize=(15, 6))
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.plot(lr_recall, lr_precision, marker='.', label='XGB')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()