<a href="https://colab.research.google.com/github/Abhishekpratapsingh321/Wine_Quality_Dataset/blob/main/wine_quality_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Author:** Abhishek Pratap Singh

In [None]:
### importing Pandas and Numpy

import pandas as pd
import numpy as np

### importing Visualisation libraries

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### For Q-Q Plot

import scipy.stats as stats

### To ignore warnings

import warnings
warnings.filterwarnings('ignore')

### Machine Learning libraries

import sklearn
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### To be able to see maximum columns on screen

pd.set_option('display.max_columns', 400)


In [None]:
# Source of the red-wine quality data; the file uses ';' as its field separator.
WINE_RED_CSV_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

df = pd.read_csv(WINE_RED_CSV_URL, sep=';')
df.head()

In [None]:
# getting null values and datatypes of all features
df.info()

In [None]:
# statistical info
df.describe()

In [None]:
df.describe().T

In [None]:
### getting unique values for quality feature
df.quality.unique()

In [None]:
### getting count of record for each unique value in quality
df.quality.value_counts()

In [None]:
### checking duplicate values
df.duplicated().sum()


In [None]:
### dropping duplicate values
# Removes fully duplicated rows from df itself (inplace=True), so every later
# cell works on the de-duplicated frame. The row index is not reset here, so it
# keeps gaps where rows were removed.
df.drop_duplicates(inplace=True)

In [None]:
df.info()

In [None]:
### getting null values in each feature
df.isnull().sum()

In [None]:
### getting correlation between independent and dependent features
# Pairwise correlation of every column with every other, shown to 2 decimals.
corr = df.corr().round(2)
corr

In [None]:
### getting list of numerical features
features=df.columns
features

In [None]:
### getting count of unique value in each feature
# nunique() on the selected columns yields one distinct-value count per column,
# in column order; print one line per column.
for feature, n_unique in df[features].nunique().items():
    print(f"'{feature}' has '{n_unique}' number of unique features")

In [None]:
### visualising count of quality which is discrete feature
sns.countplot(data=df, x='quality')

In [None]:
sns.set(rc={'figure.figsize':(10,10)})

In [None]:
sns.pairplot(df)

In [None]:
### Getting list of continuous features as only discrete feature is quality
# Keep a handle on the target column, then take every other column label as the
# set of continuous features.
qual = df["quality"]
continuous_features = df.columns.drop("quality")

In [None]:
continuous_features

In [None]:
### Checking distribution of Continuous numerical features
# One figure per feature: histogram on the left, normal Q-Q plot on the right.
for feature in continuous_features:
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 6))

    sns.histplot(data=df, x=feature, ax=hist_ax)
    hist_ax.set_title("{}'s distribution".format(feature), fontweight="bold", fontsize=15)

    # probplot draws onto the given axes and sets its own default title,
    # which is then replaced by the feature-specific one.
    stats.probplot(df[feature], dist='norm', plot=qq_ax)
    qq_ax.set_title("{}'s Q-Q Plot".format(feature), fontweight="bold", fontsize=15)
    plt.show();

In [None]:
### Comparing Continuous numerical features with quality feature
# 6-colour palette, presumably one colour per quality grade.
palette1 = sns.color_palette("tab10", 6)
plt.figure(figsize=(15, 45))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 15x45 size.
sns.set(rc={'figure.figsize': (7, 7)})
# One filled KDE panel per feature on a 6x2 grid, split by quality.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.kdeplot(data=df, x=feature, hue='quality', palette=palette1, fill=True)
    plt.title("{} Vs quality".format(feature), fontweight="bold", fontsize=15)

In [None]:

### Checking outliers in numerical features
plt.figure(figsize=(20, 40))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x40 size.
sns.set(rc={'figure.figsize': (10, 6)})
# One horizontal box plot per feature on a 6x2 grid.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.boxplot(data=df, x=feature, color='y')
    plt.title("{}".format(feature), fontweight="bold", fontsize=15)

In [None]:
### getting outliers in features for each unique value in quality feature
plt.figure(figsize=(20, 40))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x40 size.
sns.set(rc={'figure.figsize': (10, 6)})
# One panel per feature on a 6x2 grid: feature values boxed per quality grade.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.boxplot(data=df, y=feature, x='quality')
    plt.title("{} vs quality".format(feature), fontsize=15, fontweight="bold")

In [None]:
### visualising data scatter in each continuous feature with respect to quality
plt.figure(figsize=(20, 50))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x50 size.
sns.set(rc={'figure.figsize': (7, 8)})
# One strip plot per feature on a 6x2 grid, points grouped by quality grade.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.stripplot(data=df, y=feature, x='quality')
    plt.title("{} Vs quality".format(feature), fontsize=15, fontweight="bold")

In [None]:
### plotting regplot for features vs quality
# At this point 'quality' still holds the original grades; the rare-grade
# re-binning happens in a later cell.
plt.figure(figsize=(20, 55))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x55 size.
sns.set(rc={'figure.figsize': (8, 9)})
# One scatter + fitted regression line per feature on a 6x2 grid.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.regplot(data=df, x=feature, y='quality')
    plt.xlabel(feature)
    plt.ylabel("quality")
    plt.title("{} Vs quality".format(feature), fontweight='bold', fontsize=15)

In [None]:
# Collapse rare quality grades: every grade that makes up less than 20% of the
# rows is replaced by the single catch-all label 9.
quality_share = df['quality'].value_counts(normalize=True)   # grade -> fraction of rows
is_rare = df['quality'].map(quality_share).lt(0.2)            # True where the row's grade is rare
df['quality'] = df['quality'].mask(is_rare, 9)

In [None]:
df['quality'].value_counts()

In [None]:
### visualising count of modified quality which is discrete feature
sns.set(rc={'figure.figsize':(7,5)})
sns.countplot(data=df, x='quality')

In [None]:
### Comparing Continuous numerical features with modified quality feature
# 3-colour palette, presumably one colour per class left after re-binning.
palette1 = sns.color_palette("tab10", 3)
plt.figure(figsize=(15, 45))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 15x45 size.
sns.set(rc={'figure.figsize': (7, 7)})
# One filled KDE panel per feature on a 6x2 grid, split by quality.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.kdeplot(data=df, x=feature, hue='quality', palette=palette1, fill=True)
    plt.title("{} Vs quality".format(feature), fontweight="bold", fontsize=15)

In [None]:
### getting outliers in features for each unique value in modified quality feature
plt.figure(figsize=(20, 40))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x40 size.
sns.set(rc={'figure.figsize': (10, 6)})
# One panel per feature on a 6x2 grid: feature values boxed per quality class.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.boxplot(data=df, y=feature, x='quality')
    plt.title("{} vs quality".format(feature), fontsize=15, fontweight="bold")

In [None]:
### getting correlation between independent and modified dependent features
# Recomputed because 'quality' was re-binned above; shown to 2 decimals.
corr = df.corr().round(2)
corr

In [None]:
### Plotting heatmap for visualising the correlation between features
sns.set(rc={'figure.figsize':(15,10)})
sns.heatmap(data=corr, annot=True, vmin=-1, vmax=1, cmap="YlGnBu")

In [None]:
### plotting regplot for features vs modified quality
plt.figure(figsize=(20, 40))
# Applied once instead of on every loop iteration. It only sets the default size
# for figures created afterwards; the figure above keeps its explicit 20x40 size.
sns.set(rc={'figure.figsize': (8, 7)})
# One scatter + fitted regression line per feature on a 6x2 grid.
for panel, feature in enumerate(continuous_features, start=1):
    plt.subplot(6, 2, panel)
    sns.regplot(data=df, x=feature, y='quality')
    plt.xlabel(feature)
    plt.ylabel("quality")
    plt.title("{} Vs quality".format(feature), fontweight='bold', fontsize=15)

In [None]:
### exporting dataset to csv
# index=False: the row index only carries the gapped row numbers left behind by
# drop_duplicates; writing it would add a meaningless unnamed first column that
# shows up as an extra feature when the file is read back.
df.to_csv('winedataset.csv', index=False)

In [None]:

df['quality'].value_counts()


**Splitting data into independent (feature) and dependent (target) variables**

In [None]:
# Features are every column except the target; the target is the re-binned
# 'quality' label. Selecting by name (rather than "last column" by position)
# keeps the split correct even if the column order of df ever changes.
X = df.drop(columns='quality')
y = df['quality']
X.head(3)

In [None]:
y.head(3)

In [None]:
# Hold out 25% of the rows as the test set; random_state=10 pins the shuffle so
# the same split is produced on every run. No stratification is requested.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=10)

In [None]:
X_train.head(3)

In [None]:
y_train.head(3)


In [None]:
X_test.head(3)

**Feature scaling for the models trained on standardized data (Logistic Regression and Support Vector Classifier)**

In [None]:
### Creating copy of test and training data for feature scaling
# .copy() gives an independent frame. Plain assignment would only bind a second
# name to the same object, so any in-place edit would also change X_train.
X_train1 = X_train.copy()
X_train1.head(3)

In [None]:
# Independent copy of the test features for the scaled-model pipeline
# (plain assignment would only alias X_test).
X_test1 = X_test.copy()
X_test1.head(3)

In [None]:
y_train1=y_train

In [None]:
y_train1.head(3)

In [None]:
y_test1=y_test
y_test1.head(3)

In [None]:
scalar=StandardScaler()
scalar


In [None]:
# Fit the scaler on the untouched training frame rather than on X_train1 itself.
# Feeding X_train1 back into fit_transform makes the cell non-repeatable: a second
# run would refit the scaler on already-standardized data, and the test set would
# then be transformed with the wrong mean/variance.
X_train1 = scalar.fit_transform(X_train)
X_train1


In [None]:
X_test1=scalar.transform(X_test)
X_test1

**Model Training and Prediction**

**Decision Tree Classifier**

In [None]:
dtc=DecisionTreeClassifier()
dtc

In [None]:
dtc.fit(X_train, y_train)

In [None]:
dtc_prediction=dtc.predict(X_test)
dtc_prediction

In [None]:

print(classification_report(y_test,dtc_prediction ))

In [None]:
### Visualising the Decision Tree
# Plot the tree that was already fitted and scored above. Calling fit() again
# here would train a new, unseeded tree, so the picture could show a different
# model from the one whose classification report was printed.
# (`tree` and `plt` come from the import cell at the top of the notebook.)
fig = plt.figure(figsize=(25,20))
clf = dtc  # same fitted estimator, no refit
tree.plot_tree(clf, filled=True)
plt.show();

**Logistic Regression**

In [None]:
log_reg=LogisticRegression()
log_reg

In [None]:
log_reg.fit(X_train1, y_train1)

In [None]:
log_reg_pred=log_reg.predict(X_test1)
log_reg_pred

In [None]:
print(classification_report(y_test1, log_reg_pred))

**Support Vector Classifier**

In [None]:
svc=SVC()
svc


In [None]:
svc.fit(X_train1, y_train1)

In [None]:
svc_pred=svc.predict(X_test1)
svc_pred

In [None]:
print(classification_report(y_test1, svc_pred))

**Random Forest Classifier**

In [None]:
rand_for=RandomForestClassifier()
rand_for

In [None]:
rand_for.fit(X_train, y_train)

In [None]:
rand_for_pred=rand_for.predict(X_test)
rand_for_pred

In [None]:
print(classification_report(y_test, rand_for_pred))

**Hyper-Parameter Tuning of Models**

**Hyper-Parameter Tuning: Decision Tree Classifier Model**

In [None]:
dtc=DecisionTreeClassifier()
dtc

In [None]:

grid_param = {
 'criterion': ['gini', 'entropy'],
 'max_depth' : range(2,32,1),
 'min_samples_leaf' : range(1,10,1),
 'min_samples_split': range(2,10,1),
 'splitter' : ['best', 'random']
}


In [None]:
grid=GridSearchCV(estimator=dtc,param_grid=grid_param, cv=3)

In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# Build the tuned tree from the search result itself instead of hand-copied
# values: the search is unseeded, so the winning combination can change between
# runs and hard-coded numbers would silently go stale.
dtc_best_par = DecisionTreeClassifier(**grid.best_params_)
dtc_best_par

In [None]:
dtc_best_par.fit(X_train, y_train)

In [None]:
dtc_best_par_pred=dtc_best_par.predict(X_test)
dtc_best_par_pred

In [None]:
print(classification_report(y_test,dtc_best_par_pred ))

In [None]:
# Plot the tuned tree exactly as it was fitted and scored above. Refitting here
# would train a fresh, unseeded tree that can differ from the evaluated one.
fig = plt.figure(figsize=(25,20))
clf = dtc_best_par  # same fitted estimator, no refit
tree.plot_tree(clf, filled=True, fontsize=10)
plt.show();

**Hyper-Parameter Tuning: Random Forest Classifier Model**

In [None]:
rand_for=RandomForestClassifier()
rand_for

In [None]:
rand_for=RandomForestClassifier()
rand_for


In [None]:
# Search space for the random forest: four hyperparameters are tuned, each given
# the list/range of candidate values to try (3 x 2 x 9 x 5 combinations).
grid_param = {
 "n_estimators" : [90,100,110],
 'criterion': ['gini', 'entropy'],
 'max_depth' : range(2,20,2),
 'min_samples_leaf' : range(1,10,2),
}


In [None]:
grid=GridSearchCV(estimator=rand_for,param_grid=grid_param, verbose=1, cv=2)

In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# Build the tuned forest from the search result itself instead of hand-copied
# values: the search is unseeded, so the winning combination can change between
# runs. This also carries over min_samples_leaf, which the hand-copied call left
# at its default.
rand_for_best_par = RandomForestClassifier(**grid.best_params_)
rand_for_best_par


In [None]:

rand_for_best_par.fit(X_train, y_train)

In [None]:
rand_for_best_par_pred=rand_for_best_par.predict(X_test)
rand_for_best_par_pred

In [None]:
print(classification_report(y_test,rand_for_best_par_pred ))

**Hyper-Parameter Tuning: Logistic Regression Model**

In [None]:

log_reg=LogisticRegression()
log_reg

In [None]:
# Search space for logistic regression: 4 penalties x 3 C values x 4 solvers x 2 max_iter.
# NOTE(review): per the scikit-learn docs not every solver supports every penalty
# (of the solvers listed here only 'saga' handles 'l1'/'elasticnet', and
# 'elasticnet' also needs l1_ratio), so some combinations presumably fail to fit
# and are scored NaN by GridSearchCV -- confirm against the installed version.
# NOTE(review): newer scikit-learn releases spell "no penalty" as penalty=None
# rather than the string 'none' -- verify for the installed version.
grid_param = {
 "penalty" : ['l1', 'l2', 'elasticnet', 'none'],
 'C': np.logspace(-4, 4, 3),  # three values: 1e-4, 1, 1e4
 "solver": ['newton-cg', 'lbfgs', 'sag', 'saga'],
 'max_iter' : [100, 200]
}

In [None]:
grid=GridSearchCV(estimator=log_reg,param_grid=grid_param, verbose=1, cv=2)

In [None]:
grid.fit(X_train1, y_train1)


In [None]:

grid.best_params_

In [None]:
grid.best_estimator_


In [None]:
# Build the tuned model from the search result itself instead of hand-copied
# values, so it always matches what the grid search just selected (including C,
# which the hand-copied call left at its default).
log_reg_best_par = LogisticRegression(**grid.best_params_)
log_reg_best_par

In [None]:
log_reg_best_par.fit(X_train1, y_train1)

In [None]:
log_reg_best_par_pred=log_reg_best_par.predict(X_test1)
log_reg_best_par_pred


In [None]:
print(classification_report(y_test1,log_reg_best_par_pred))

**Hyper-Parameter Tuning: Support Vector Classifier Model**

In [None]:
svc=SVC()
svc

In [None]:
# Search space for the SVC, split into two sub-grids. 'degree' only affects the
# polynomial kernel (other kernels ignore it), so crossing it with 'rbf' and
# 'sigmoid' would just repeat identical fits four times each.
grid_param = [
    {'C': [10, 100, 200, 500], 'kernel': ['rbf', 'sigmoid']},
    {'C': [10, 100, 200, 500], 'kernel': ['poly'], 'degree': [2, 3, 4, 5]},
]

In [None]:
grid=GridSearchCV(estimator=svc,param_grid=grid_param, verbose=1, cv=2)

In [None]:
grid.fit(X_train1, y_train1)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# Build the tuned SVC from the search result itself instead of hand-copied
# values, so it always matches what the grid search just selected.
svc_best_par = SVC(**grid.best_params_)
svc_best_par

In [None]:
svc_best_par.fit(X_train1, y_train1)

In [None]:
svc_best_par_pred=svc_best_par.predict(X_test1)
svc_best_par_pred

In [None]:

print(classification_report(y_test1,svc_best_par_pred ))