In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
from scipy.stats import zscore

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import warnings
warnings.simplefilter("ignore")
import joblib

In [None]:
# Red-wine quality dataset, fetched from the public GitHub mirror.
DATA_URL = 'https://raw.githubusercontent.com/dsrscientist/DSData/master/winequality-red.csv'
df = pd.read_csv(DATA_URL)
df

# EDA

In [None]:
# List the column names of the loaded dataset.
df.columns

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Print the frame summary reported by pandas (df.info()).
df.info()

In [None]:
# Summary statistics for every column; used below to spot large 75%-to-max gaps.
df.describe()

As per observation:

There is a big gap between 75% and max values of residual sugar column.
There is a big gap between 75% and max values of free sulfur dioxide column.
There is a huge gap between 75% and max value of total sulfur dioxide column.
All these gaps indicate that there are outliers present in our dataset which might need to be treated so as to get a better model accuracy later.

In [None]:
# Skewness of every column (the target `quality` is included but ignored in the discussion).
df.skew()

We will ignore quality since it is our target label in the dataset.
We see that fixed acidity, volatile acidity, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, sulphates and alcohol are all outside the acceptable skewness range of +/-0.5. This skewness suggests that outliers are present in our dataset, which may need to be treated.

# Visualization

In [None]:
# Class distribution of the target: number of rows per quality score.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='quality', ax=ax)
ax.set_xlabel('Quality of Red Wine')
ax.set_ylabel('Count of Rows in the dataset')
plt.show()

We see the various categories of red wine quality, and the plot shows that the number of rows for quality scores 5 and 6 is far higher than for their counterparts. This indicates a class imbalance which needs to be rectified.

In [None]:
# Bar plot of each feature against the quality score (one figure per feature).
labels = df['quality']
features = df.drop('quality', axis=1)

for col in features.columns:
    plt.figure(figsize=(10,5))
    sns.barplot(x=labels, y=col, data=df, color="deeppink")
    # Lay out and render inside the loop so every figure is handled,
    # not only the last one created.
    plt.tight_layout()
    plt.show()

Observations: -
-  fixed acidity vs quality - no fixed pattern
-  volatile acidity vs quality - there is a decreasing trend
-  citric acid vs quality - there is an increasing trend
-  residual sugar vs quality - no fixed pattern
-  chlorides vs quality - there is a decreasing trend
-  free sulfur dioxide vs quality - no fixed pattern as it is increasing then decreasing
-  total sulfur dioxide vs quality - no fixed pattern as it is increasing then decreasing
-  density vs quality - no pattern at all
-  pH vs quality - no pattern at all
-  sulphates vs quality - there is an increasing trend
-  alcohol vs quality - there is an increasing trend

We can conclude that to get better quality wine citric acid, sulphates and alcohol columns play a major role.

In [None]:
# One box plot per column on a 2 x 6 grid of axes.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(15, 10))
ax = ax.flatten()
for index, col in enumerate(df.columns):
    sns.boxplot(data=df, y=col, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()

We ignore the continuous runs of outliers, but the outliers that are single values far away from the whiskers of the boxplot may need to be treated.
We try to retain as much of the data as possible in the given dataset.

In [None]:
# Shaded kernel-density estimate of every column on a 2 x 6 grid of axes.
# sns.kdeplot(fill=True) replaces the deprecated sns.distplot(hist=False,
# kde_kws={"shade": True}) call.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(15,10))
ax = ax.flatten()
# zip stops at the shorter sequence, so a frame with more columns than
# axes no longer raises IndexError.
for axis, col in zip(ax, df.columns):
    sns.kdeplot(data=df, x=col, ax=axis, color="g", fill=True)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()

The distribution plots show that few of the columns are in normal distribution category showing a proper bell shape curve. However, we do see skewness in most of the feature columns like citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, sulphates and alcohol columns. We are going to ignore the label column since it is a categorical column

# Correlation using a Heatmap

-  Positive correlation - A correlation of +1 indicates a perfect positive correlation, meaning that both variables move in the same direction together.

-  Negative correlation - A correlation of -1 indicates a perfect negative correlation, meaning that as one variable goes up, the other goes down.

In [None]:
# Correlation heatmap showing only the upper triangle.
corr = df.corr()  # computed once and reused for both the mask and the plot
# Boolean mask: True hides a cell. Built from the matrix shape rather than
# from the correlation values, so a correlation of exactly 0 in the lower
# triangle is still hidden.
lower_triangle = np.tril(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(15,10))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True, square=True, fmt='0.3f',
            annot_kws={'size':10}, cmap="Spectral", mask=lower_triangle)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

The columns fixed acidity and citric acid are positively correlated with a value of 0.672, which is close to 1. Similarly, the columns fixed acidity and density are positively correlated with a value of 0.668, again close to 1. The other two columns that are positively correlated are free sulfur dioxide and total sulfur dioxide, with a value of 0.668, which is close to 1. The only strongly negatively correlated columns are fixed acidity and pH, with a value of -0.683, which is close to -1.

# Dropping a column

In [None]:
# Remove the 'free sulfur dioxide' feature; 'total sulfur dioxide' is kept.
df = df.drop(columns=['free sulfur dioxide'])
df

Free sulfur dioxide and total sulfur dioxide both describe the same underlying feature (sulfur dioxide), so we drop the free option and keep only the total option in our dataset.

# Outlier removal

In [None]:
# Confirm the number of rows and columns before removing the outliers.

df.shape

In [None]:
# Z score method: keep only the rows in which every column lies strictly
# within `threshold` standard deviations of that column's mean.
# Note: the z-scores are computed over all columns currently in df,
# which at this point still includes the target column 'quality'.

threshold = 3
z = np.abs(zscore(df))

df = df[(z < threshold).all(axis=1)]
df

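We use the Z-score method to remove outliers: any row in which at least one column has an absolute Z score of 3 or more is dropped. Removing these extreme values is intended to reduce the skewness noted earlier, which was outside the acceptable range of +/-0.5.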

In [None]:
# Check the number of rows after applying outlier removal.
df.shape

In [None]:
# Percentage of Data Loss
# Derive the row counts from the data instead of hard-coding them, so the
# figure stays correct if the dataset or the outlier threshold changes.
# `z` was computed on the frame before filtering, so its length is the
# pre-removal row count; `df` is the filtered frame.

rows_before = len(z)
rows_after = len(df)
data_loss = (rows_before - rows_after) / rows_before * 100
data_loss

# Splitting dataframe

In [None]:
# Target label first, then every remaining column as the feature matrix.
Y = df['quality']
X = df.drop(columns=['quality'])

X represents all the feature columns and Y represents the target label column.

# Creating the training and testing the dataset

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=21,
    test_size=0.2,
)

In [None]:
# Classification Model Function

def classify(model, X, Y, test_size=0.2, random_state=21, cv=5):
    """Fit ``model`` on a hold-out split and print its evaluation metrics.

    The data is split with ``train_test_split``, the model is fitted on the
    training part, and the following are printed:

    * accuracy on the test part (as a percentage),
    * the classification report for the test part,
    * the mean ``cross_val_score`` over the full ``X``/``Y`` (as a percentage),
    * the difference between the accuracy and the cross-validation score.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X : feature matrix passed to ``train_test_split`` and ``cross_val_score``.
    Y : target labels aligned with ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 21
        Seed for the train/test split.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None
        Results are only printed.
    """
    # Local split: these names shadow any notebook-level X_train/X_test.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Training the model
    model.fit(X_train, Y_train)

    # Predicting Y_test
    pred = model.predict(X_test)

    # Accuracy Score
    acc_score = (accuracy_score(Y_test, pred))*100
    print("Accuracy Score:", acc_score)

    # Classification Report
    class_report = classification_report(Y_test, pred)
    print("\nClassification Report:\n", class_report)

    # Cross Validation Score (computed on the full X, Y, not only the train part)
    cv_score = (cross_val_score(model, X, Y, cv=cv).mean())*100
    print("Cross Validation Score:", cv_score)

    # Result of accuracy minus cv scores
    result = acc_score - cv_score
    print("\nAccuracy Score - Cross Validation Score is", result)

In [None]:
# Logistic Regression with default hyperparameters, evaluated via classify()

model=LogisticRegression()
classify(model, X, Y)

In [None]:
# Support Vector Classifier (RBF kernel, C=1.0, gamma='auto'), evaluated via classify()

model=SVC(C=1.0, kernel='rbf', gamma='auto', random_state=42)
classify(model, X, Y)

In [None]:
# Decision Tree Classifier (depth capped at 15), evaluated via classify()

model=DecisionTreeClassifier(random_state=21, max_depth=15)
classify(model, X, Y)

In [None]:
# Random Forest Classifier (tree depth capped at 15), evaluated via classify()

model=RandomForestClassifier(max_depth=15, random_state=111)
classify(model, X, Y)

In [None]:
# Choosing Support Vector Classifier
# Hyperparameter grid handed to GridSearchCV for SVC; every combination of
# the listed values is a candidate.
# NOTE(review): 'verbose', 'random_state' and 'probability' look like
# logging/seed/output switches rather than tuning knobs; including them
# multiplies the number of candidates - confirm they are intended.

svc_param = {'kernel' : ['poly', 'sigmoid', 'rbf'],
             'gamma' : ['scale', 'auto'],
             'shrinking' : [True, False],
             'random_state' : [21,42,104],
             'probability' : [True, False],
             'decision_function_shape' : ['ovo', 'ovr'],
             'verbose' : [True, False]}

In [None]:
# 5-fold grid search over svc_param. The search must be fitted before
# `best_params_` exists; without fit() the next cell raises AttributeError.
GSCV = GridSearchCV(SVC(), svc_param, cv=5)
GSCV.fit(X_train, Y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
GSCV.best_params_

In [None]:
# Final SVC with hard-coded hyperparameters, trained on the training split
# and scored on the held-out test split.
Final_Model = SVC(
    kernel='rbf',
    gamma='scale',
    shrinking=True,
    probability=True,
    decision_function_shape='ovo',
    random_state=21,
    verbose=True,
)
Classifier = Final_Model.fit(X_train, Y_train)
fmod_pred = Final_Model.predict(X_test)
fmod_acc = 100 * accuracy_score(Y_test, fmod_pred)
print("Accuracy score for the Best Model is:", fmod_acc)

# AUC ROC Curve

In [None]:
# One-vs-rest ROC curves, one per quality class.
# metrics.plot_roc_curve only accepted binary classifiers and was removed in
# scikit-learn 1.2, so the curves are built from the class probabilities with
# metrics.roc_curve / metrics.auc instead. Final_Model is created with
# probability=True, so predict_proba is available.
class_scores = Final_Model.predict_proba(X_test)
y_true = np.asarray(Y_test)

fig, ax = plt.subplots(figsize=(10, 7))
for class_idx, class_label in enumerate(Final_Model.classes_):
    is_class = y_true == class_label
    # A ROC curve needs both positives and negatives in the test split.
    if is_class.all() or not is_class.any():
        continue
    fpr, tpr, thresholds = metrics.roc_curve(is_class, class_scores[:, class_idx])
    ax.plot(fpr, tpr, label=f"quality {class_label} (AUC = {metrics.auc(fpr, tpr):.2f})")

# Diagonal reference line for a classifier with no discriminative power.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
fig.suptitle("ROC Curve")
plt.show()

We generated the ROC curve for the final model; it shows an AUC score of 98% for the final model.

# Saving the model 

In [None]:
# Persist the fitted final model to disk with joblib (path relative to the working directory).
filename = "FinalModel_3.pkl"
joblib.dump(Final_Model, filename)