<a href="https://www.kaggle.com/code/bhavkaur/red-wine-interactive-eda-baseline-modelling?scriptVersionId=206453401" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="background-color: #FF777988; padding: 20px; border-radius: 16px; border: 3px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #2B3A55; font-weight: bold; font-size: 42px;">
    🍷 Red Wine | Interactive EDA & Baseline Modelling
    </h1>
</div>

In [126]:
import numpy as np 
import pandas as pd 
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# **Reading data**

In [2]:
wf=pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [3]:
wf.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
wf.shape

(1599, 12)

# **Exploratory Data Analysis**

In [113]:
fig = px.histogram(wf, x="quality", title='1. Distribution of Quality',color_discrete_sequence=['#90D5FF'],text_auto=True)
fig.update_traces(marker_line_width=3,marker_line_color="#5178C1")
fig.show(renderer='iframe_connected')

In [114]:
fig = px.scatter(wf, x="citric acid", y='pH', 
                 title='2. Distribution of pH vs Citric Acid',
                 trendline="ols",trendline_color_override="#FA163F",
                 color_discrete_sequence=['#fc8a9e']
                )
fig.show(renderer='iframe_connected')

In [115]:
fig = px.scatter(wf, x="fixed acidity", y='volatile acidity',
                 title='3. Distribution of Fixed Acidity vs Volatile Acidity',
                 trendline="ols",trendline_color_override="#427D9D",
                 color_discrete_sequence=['#93b3c9']
                )
fig.show(renderer='iframe_connected')

In [119]:
fig = px.scatter(wf, x="fixed acidity", y='density',
                 title='4. Distribution of Fixed Acidity vs Density',
                 trendline="ols",trendline_color_override="#EC8F5E",
                 color_discrete_sequence=['#f5c7ae']
                )
fig.show(renderer='iframe_connected')

In [120]:
fig = px.scatter(wf, x="alcohol", y='density',
                 title='4. Distribution of Alcohol vs Density',
                 trendline="ols",trendline_color_override="#025DAD",
                 color_discrete_sequence=['#73a3d1']
                )
fig.show(renderer='iframe_connected')

In [123]:
fig = px.scatter(wf, x="pH", y='density',
                 title='5. Distribution of pH vs Density',
                 trendline="ols",trendline_color_override="#FA163F",
                 color_discrete_sequence=['#fc8a9e']
                )
fig.show(renderer='iframe_connected')

In [124]:
fig = px.imshow(wf.corr(), title='6. Correlation Heatmap',text_auto=".1f",aspect="auto")
fig.show(renderer='iframe_connected')

# **Data Preprocessing**

In [37]:
x=wf.drop(['quality'], axis=1)
y=wf['quality']

#handle unbalanced data
#to do this, we use oversampling

os =SMOTE()           # Synthetic Minority Over-sampling Technique
x_res,y_res= os.fit_resample(x, y)

le = LabelEncoder()   # Fix 'y' using Label Encoder
y = le.fit_transform(y)

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x_res,y_res,test_size=0.2, random_state=0)

In [39]:
stdscale = StandardScaler().fit(x_train)
x_train_std = stdscale.transform(x_train)
x_test_std = stdscale.transform(x_test)

# **Logistic Regression**

In [125]:
lr = LogisticRegression()
lr.fit(x_train_std, y_train)
predictions = lr.predict(x_test_std)
lr_acc = accuracy_score(y_test, predictions)
print("Accuracy of Log Regression = ",lr_acc)

Accuracy of Log Regression =  0.5904645476772616


In [131]:
cnf_matrix = confusion_matrix(y_test, predictions)
fig = px.imshow(cnf_matrix, title='Confusion Matrix for Logistic Regression',text_auto=".0f",aspect="auto")
fig.show(renderer='iframe_connected')

# **Decision Tree Classifier**

In [132]:
dt = DecisionTreeClassifier()
dt.fit(x_train_std, y_train)
predictions = dt.predict(x_test_std)
dt_acc = accuracy_score(y_test, predictions)
print("Accuracy of Decision Tree = ",dt_acc)

Accuracy of Decision Tree =  0.7787286063569682


In [133]:
cnf_matrix = confusion_matrix(y_test, predictions)
fig = px.imshow(cnf_matrix, title='Confusion Matrix for Decision Tree Classifier',text_auto=".0f",aspect="auto")
fig.show(renderer='iframe_connected')

# **Random Forest Classifier**

In [134]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(x_train_std, y_train)
predictions = rf.predict(x_test_std)
rf_acc = accuracy_score(y_test, predictions)
print("Accuracy of Random Forest = ",rf_acc)

Accuracy of Random Forest =  0.8508557457212714


In [135]:
cnf_matrix = confusion_matrix(y_test, predictions)
fig = px.imshow(cnf_matrix, title='Confusion Matrix for Random Forest Classifier',text_auto=".0f",aspect="auto")
fig.show(renderer='iframe_connected')

# **XGB Classifier**

In [136]:
xgb = XGBClassifier(random_state=42)
xgb.fit(x_train_std, y_train)
predictions = xgb.predict(x_test_std)
xgb_acc = accuracy_score(y_test, predictions)
print("Accuracy of eXtreme Gradient Boosting = ",xgb_acc)

Accuracy of eXtreme Gradient Boosting =  0.8557457212713936


In [137]:
cnf_matrix = confusion_matrix(y_test, predictions)
fig = px.imshow(cnf_matrix, title='Confusion Matrix for XGB Classifier',text_auto=".0f",aspect="auto")
fig.show(renderer='iframe_connected')

# **LGBM Classifier**

In [138]:
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train_std, y_train)
predictions = lgbm.predict(x_test_std)
lgbm_acc = accuracy_score(y_test, predictions)
print("Accuracy of Light Gradient Boosting Machine = ",lgbm_acc)

Accuracy of Light Gradient Boosting Machine =  0.8643031784841075


In [139]:
cnf_matrix = confusion_matrix(y_test, predictions)
fig = px.imshow(cnf_matrix, title='Confusion Matrix for LGBM Classifier',text_auto=".0f",aspect="auto")
fig.show(renderer='iframe_connected')