In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a> <br>
# 1. Import Necessary Libraries

In [204]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides any warnings raised by later fitting/search cells,
# so failed or deprecated configurations will not be reported.
warnings.filterwarnings('ignore')

<a id="2"></a> <br>
# 2. Import Dataset

In [205]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(wine_csv_path)

<a id="3"></a> <br>
# 3. Exploratory Data Analysis

In [206]:
# Show five randomly selected rows as a quick sanity check of the loaded data.
data.sample(n=5)

In [207]:
# Number of missing values in each column.
data.isna().sum()

In [208]:
# Per-column summary: column name, number of distinct values, and dtype.
column_summaries = [
    {'col': name, 'nunique': data[name].nunique(), 'type': data[name].dtypes}
    for name in data.columns
]
df_nunique = pd.DataFrame(column_summaries, columns=['col', 'nunique', 'type'])
df_nunique

In [209]:
# Number of wines at each quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x='quality', data=data, ax=ax)

In [210]:
# Bar plot of fixed acidity grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='fixed acidity', ax=ax)

In [211]:
# Bar plot of volatile acidity grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='volatile acidity', ax=ax)

In [212]:
# Bar plot of citric acid grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='citric acid', ax=ax)

In [213]:
# Bar plot of residual sugar grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='residual sugar', ax=ax)

In [214]:
# Bar plot of chlorides grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='chlorides', ax=ax)

Chloride content goes down as the quality of the wine goes up.

In [215]:
# Bar plot of free sulfur dioxide grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='free sulfur dioxide', ax=ax)

In [216]:
# Bar plot of total sulfur dioxide grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='total sulfur dioxide', ax=ax)

In [217]:
# Bar plot of density grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='density', ax=ax)

In [218]:
# Bar plot of pH grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='pH', ax=ax)

In [219]:
# Bar plot of sulphates grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='sulphates', ax=ax)

In [220]:
# Bar plot of alcohol grouped by quality score.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=data, x='quality', y='alcohol', ax=ax)

In [221]:
# Collapse the numeric quality score into two classes.
def _quality_label(score):
    """Return 'bad' for scores below 6.5 and 'good' otherwise."""
    if score < 6.5:
        return 'bad'
    return 'good'


data['quality'] = data['quality'].map(_quality_label)

In [222]:
# Class balance after binarising the target.
data['quality'].value_counts()

In [223]:
from sklearn.preprocessing import LabelEncoder
# Replace the string class labels in 'quality' with integer codes.
le = LabelEncoder().fit(data['quality'])
data['quality'] = le.transform(data['quality'])

In [224]:
# Separate the target column from the feature columns.
y = data['quality']
X = data.drop(columns='quality')

In [225]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column using statistics of the full feature matrix.
sc = StandardScaler()
sc.fit(X)
X_scaled = sc.transform(X)

<a id="4"></a> <br>
# 4. Modeling

**For the modeling part, we will compare 7 well-known algorithms and evaluate their average accuracy using k-fold cross-validation:**

1: Logistic Regression

2: KNN

3: SVC

4: Decision Tree

5: Random Forest

6: Extra Trees

7: Gradient Boosting

In [226]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

**Apply Cross Validation:**

In [227]:
# Shared 3-fold splitter and the candidate classifiers, in the order that the
# later cells index them (0 = Logistic Regression ... 6 = Gradient Boosting).
kf = KFold(n_splits=3, shuffle=True, random_state=42)
Classifiers = [
    LogisticRegression(random_state=42),
    KNeighborsClassifier(),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]


In [228]:
# Cross-validated accuracy for every candidate classifier.
# The scaler is placed in a pipeline with each classifier so that it is re-fitted
# on the training part of every fold only. Scaling the whole dataset up front
# would let the validation fold's mean/variance leak into training and bias the
# accuracy estimate.
from sklearn.pipeline import make_pipeline

cv_resuts = []
for Classifier in Classifiers:
    scaled_classifier = make_pipeline(StandardScaler(), Classifier)
    cv_resuts.append(
        cross_val_score(scaled_classifier, X, y, cv=kf, scoring='accuracy', n_jobs=-1)
    )

In [229]:
# Mean accuracy across folds for each classifier, in the same order as cv_resuts.
cv_mean = [np.mean(fold_scores) for fold_scores in cv_resuts]

In [230]:
# Build a DataFrame of algorithm names and their mean CV accuracy, best first.
# The names must stay in the same order as the entries of cv_mean.
algorithm_names = [
    'LogisticRegression',
    'KNN',
    'SVC',
    'DecisionTree',
    'RandomForest',
    'ExtraTrees',
    'GradientBoosting',
]
cv_table = pd.DataFrame({'algorithms': algorithm_names, 'CV_mean': cv_mean})
cv_res = cv_table.sort_values(by='CV_mean', ascending=False)

In [231]:
# Display the cross-validation summary table (sorted by mean accuracy, descending).
cv_res

In [232]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [233]:
# Re-fit the scaler on the training split only, then apply it to both splits.
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

**Apply GridSearchCV:**

In [242]:
# Hyper-parameter search spaces consumed by the GridSearchCV cells.

# Logistic Regression: given as a list of grids because not every solver supports
# every penalty ('lbfgs' and 'newton-cg' handle 'l2' only; 'liblinear' handles
# both 'l1' and 'l2'). The solver name is spelled 'newton-cg' with a hyphen;
# 'newton_cg' is not a valid solver and would make every such candidate fail.
param_grid_LR = [
    {'penalty': ['l2'],
     'solver': ['lbfgs', 'newton-cg', 'liblinear'],
     'C': np.logspace(-3, 3, 7)},
    {'penalty': ['l1'],
     'solver': ['liblinear'],
     'C': np.logspace(-3, 3, 7)},
]

# K-nearest neighbours: neighbourhood size, vote weighting and distance metric.
param_grid_KNN = {'n_neighbors': range(1, 31),
                  'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan']}

# Support vector classifier: regularisation strength, kernel coefficient and kernel.
param_grid_SVC = {'C': np.logspace(-3, 3, 7),
                  'gamma': [1, 0.1, 0.01, 0.001],
                  'kernel': ['rbf', 'poly', 'sigmoid']}

# Tree-based models: 'auto' is intentionally omitted from max_features. For these
# classifiers it was an alias of 'sqrt' and it is rejected by recent scikit-learn
# releases, so dropping it keeps the same search space and stays version-safe.
param_grid_DT = {'criterion': ['gini', 'entropy'],
                 'max_depth': np.arange(3, 15),
                 'max_features': ['sqrt', 'log2']}

param_grid_RF = {'n_estimators': [100, 200, 300, 400, 500],
                 'max_features': ['sqrt', 'log2'],
                 'max_depth': np.arange(3, 15),
                 'criterion': ['gini', 'entropy']}

param_grid_ET = {'n_estimators': [100, 200, 300, 400],
                 'max_features': ['sqrt', 'log2'],
                 'max_depth': np.arange(3, 15),
                 'criterion': ['gini', 'entropy']}

# Gradient boosting: shrinkage, row subsampling, number of stages and tree depth.
param_grid_GB = {'learning_rate': [0.01, 0.02, 0.03, 0.04],
                 'subsample': [0.9, 0.5, 0.2, 0.1],
                 'n_estimators': [100, 200, 300],
                 'max_depth': np.arange(5, 15)}

In [236]:
from sklearn.model_selection import GridSearchCV

In [237]:
# Exhaustive hyper-parameter search for Logistic Regression on the scaled training split.
grid_LR = GridSearchCV(estimator=Classifiers[0], param_grid=param_grid_LR, cv=kf, n_jobs=-1)
grid_LR.fit(X_train_scaled, y_train)
print(grid_LR.best_params_)

In [238]:
# Exhaustive hyper-parameter search for KNN on the scaled training split.
grid_KNN = GridSearchCV(estimator=Classifiers[1], param_grid=param_grid_KNN, cv=kf, n_jobs=-1)
grid_KNN.fit(X_train_scaled, y_train)
print(grid_KNN.best_params_)

In [239]:
# Exhaustive hyper-parameter search for SVC on the scaled training split.
grid_SVC = GridSearchCV(estimator=Classifiers[2], param_grid=param_grid_SVC, cv=kf, n_jobs=-1)
grid_SVC.fit(X_train_scaled, y_train)
print(grid_SVC.best_params_)

In [240]:
# Exhaustive hyper-parameter search for the Decision Tree on the scaled training split.
grid_DT = GridSearchCV(estimator=Classifiers[3], param_grid=param_grid_DT, cv=kf, n_jobs=-1)
grid_DT.fit(X_train_scaled, y_train)
print(grid_DT.best_params_)

In [241]:
# Exhaustive hyper-parameter search for the Random Forest on the scaled training split.
grid_RF = GridSearchCV(estimator=Classifiers[4], param_grid=param_grid_RF, cv=kf, n_jobs=-1)
grid_RF.fit(X_train_scaled, y_train)
print(grid_RF.best_params_)

In [243]:
# Exhaustive hyper-parameter search for Extra Trees on the scaled training split.
grid_ET = GridSearchCV(estimator=Classifiers[5], param_grid=param_grid_ET, cv=kf, n_jobs=-1)
grid_ET.fit(X_train_scaled, y_train)
print(grid_ET.best_params_)

In [244]:
# Exhaustive hyper-parameter search for Gradient Boosting on the scaled training split.
grid_GB = GridSearchCV(estimator=Classifiers[6], param_grid=param_grid_GB, cv=kf, n_jobs=-1)
grid_GB.fit(X_train_scaled, y_train)
print(grid_GB.best_params_)

In [245]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

In [246]:
# Evaluation helper shared by all model cells.
def measure(y_true, y_pred):
    """Score predictions against the true labels with four classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    pandas.Series
        Accuracy, recall, precision and F1, each rounded to 4 decimals and
        indexed by 'accuracy_score', 'recall_score', 'precision_score' and
        'f1_score' (in that order).
    """
    # Label/function pairs are resolved at call time and evaluated in order.
    scorers = (
        ('accuracy_score', accuracy_score),
        ('recall_score', recall_score),
        ('precision_score', precision_score),
        ('f1_score', f1_score),
    )
    return pd.Series({label: round(scorer(y_true, y_pred), 4) for label, scorer in scorers})

In [247]:
# Train Logistic Regression with fixed hyper-parameters and score it on the test split.
model_LR = LogisticRegression(random_state=42, solver='liblinear', penalty='l2', C=0.001)
model_LR.fit(X_train_scaled, y_train)
y_pred = model_LR.predict(X_test_scaled)
acc_LR = measure(y_test, y_pred)
acc_LR

In [248]:
# Train KNN with fixed hyper-parameters and score it on the test split.
model_KNN = KNeighborsClassifier(n_neighbors=24, weights='distance', metric='manhattan')
model_KNN.fit(X_train_scaled, y_train)
y_pred = model_KNN.predict(X_test_scaled)
acc_KNN = measure(y_test, y_pred)
acc_KNN

In [249]:
# Train the SVC with fixed hyper-parameters and score it on the test split.
model_SVC = SVC(kernel='rbf', C=10, gamma=1, random_state=42)
model_SVC.fit(X_train_scaled, y_train)
y_pred = model_SVC.predict(X_test_scaled)
acc_SVC = measure(y_test, y_pred)
acc_SVC

In [250]:
# Train the Decision Tree with fixed hyper-parameters and score it on the test split.
# max_features='sqrt' is used instead of the former 'auto': for this classifier
# 'auto' meant sqrt(n_features), and 'auto' is no longer accepted by recent
# scikit-learn releases.
model_DT = DecisionTreeClassifier(criterion='gini', max_depth=6, max_features='sqrt', random_state=42)
model_DT = model_DT.fit(X_train_scaled, y_train)
y_pred = model_DT.predict(X_test_scaled)
acc_DT = measure(y_test, y_pred)
acc_DT

In [251]:
# Train the Random Forest with fixed hyper-parameters and score it on the test split.
# max_features='sqrt' is used instead of the former 'auto': for this classifier
# 'auto' meant sqrt(n_features), and 'auto' is no longer accepted by recent
# scikit-learn releases.
model_RF = RandomForestClassifier(criterion='gini', max_depth=11, max_features='sqrt',
                                  n_estimators=200, random_state=42)
model_RF = model_RF.fit(X_train_scaled, y_train)
y_pred = model_RF.predict(X_test_scaled)
acc_RF = measure(y_test, y_pred)
acc_RF

In [252]:
# Train Extra Trees with fixed hyper-parameters and score it on the test split.
# max_features='sqrt' is used instead of the former 'auto': for this classifier
# 'auto' meant sqrt(n_features), and 'auto' is no longer accepted by recent
# scikit-learn releases.
model_ET = ExtraTreesClassifier(criterion='entropy', max_depth=14, max_features='sqrt',
                                n_estimators=300, random_state=42)
model_ET = model_ET.fit(X_train_scaled, y_train)
y_pred = model_ET.predict(X_test_scaled)
acc_ET = measure(y_test, y_pred)
acc_ET

In [253]:
# Train Gradient Boosting with fixed hyper-parameters and score it on the test split.
model_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.04, max_depth=10,
                                      subsample=0.5, random_state=42)
model_GB.fit(X_train_scaled, y_train)
y_pred = model_GB.predict(X_test_scaled)
acc_GB = measure(y_test, y_pred)
acc_GB

In [254]:
# Collect the test-set accuracy of every fitted model into one labelled Series.
test_metrics_by_model = {
    'Logistic Regression': acc_LR,
    'KNN': acc_KNN,
    'SVC': acc_SVC,
    'Decision Tree': acc_DT,
    'Random Forest': acc_RF,
    'Extra Trees': acc_ET,
    'Gradient Boosting': acc_GB,
}
final_result = pd.Series(
    {label: metrics['accuracy_score'] for label, metrics in test_metrics_by_model.items()}
)

In [255]:
# Turn the accuracies into a one-column table ordered from best to worst.
accuracy_table = pd.DataFrame(final_result, columns=['Accuracy'])
final_result = accuracy_table.sort_values(by='Accuracy', ascending=False)

In [256]:
# Display the final comparison table.
final_result

**As we can see, Extra Trees has the best performance compared with the other algorithms.**