# IMPORT DEPENDENCIES

In [257]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# LOADING DATASET

In [258]:
# Read the wine-quality CSV (path is relative to the notebook's working directory).
DATA_FILE = "winequality-red.csv"
df = pd.read_csv(DATA_FILE)

In [259]:
# Peek at 10 randomly chosen rows (no seed is given, so the rows differ between runs).
df.sample(n=10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
200,9.6,0.32,0.47,1.4,0.056,9.0,24.0,0.99695,3.22,0.82,10.3,7
707,7.4,0.49,0.19,3.0,0.077,16.0,37.0,0.9966,3.37,0.51,10.5,5
455,11.3,0.62,0.67,5.2,0.086,6.0,19.0,0.9988,3.22,0.69,13.4,8
466,10.3,0.5,0.42,2.0,0.069,21.0,51.0,0.9982,3.16,0.72,11.5,6
369,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7
1043,9.5,0.39,0.41,8.9,0.069,18.0,39.0,0.99859,3.29,0.81,10.9,7
494,6.5,0.39,0.23,8.3,0.051,28.0,91.0,0.9952,3.44,0.55,12.1,6
152,7.5,0.6,0.03,1.8,0.095,25.0,99.0,0.995,3.35,0.54,10.1,5
933,7.4,0.61,0.01,2.0,0.074,13.0,38.0,0.99748,3.48,0.65,9.8,5
234,8.2,1.0,0.09,2.3,0.065,7.0,37.0,0.99685,3.32,0.55,9.0,6


# DATA PREPROCESSING

In [260]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [261]:
# Cast the integer quality score to float; all other columns keep their dtype.
df = df.astype({"quality": float})

In [262]:
# Number of missing values in each column.
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [263]:
# Flag rows that repeat an earlier row; the first occurrence itself is not flagged.
df.duplicated(keep="first")

0       False
1       False
2       False
3       False
4        True
        ...  
1594    False
1595    False
1596     True
1597    False
1598    False
Length: 1599, dtype: bool

In [264]:
# Remove repeated rows, keeping the first occurrence. The index is not reset,
# so the surviving rows keep their original labels.
df = df.drop_duplicates()

In [265]:
# Re-check after de-duplication: every entry is expected to be False now.
df.duplicated(keep="first")

0       False
1       False
2       False
3       False
5       False
        ...  
1593    False
1594    False
1595    False
1597    False
1598    False
Length: 1359, dtype: bool

In [266]:
# Pearson correlation of every column with the quality score.
quality_corr = df.corr(method="pearson")["quality"]
quality_corr

fixed acidity           0.119024
volatile acidity       -0.395214
citric acid             0.228057
residual sugar          0.013640
chlorides              -0.130988
free sulfur dioxide    -0.050463
total sulfur dioxide   -0.177855
density                -0.184252
pH                     -0.055245
sulphates               0.248835
alcohol                 0.480343
quality                 1.000000
Name: quality, dtype: float64

In [267]:
# First 15 rows of the de-duplicated frame.
df.head(n=15)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6.0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5.0
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5.0
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7.0
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7.0
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5.0
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5.0


In [268]:
# Binarize the target: scores at or above the threshold become 1, all others 0.
GOOD_QUALITY_THRESHOLD = 7

# Guard against re-running this cell: once the column holds only 0/1 labels,
# applying the threshold again would silently turn every label into 0.
if not df["quality"].isin([0, 1]).all():
    # Vectorized comparison instead of a row-wise Python lambda. NaN compares
    # as False and therefore maps to 0, exactly as the lambda did.
    df["quality"] = (df["quality"] >= GOOD_QUALITY_THRESHOLD).astype("int64")
df["quality"]

0       0
1       0
2       0
3       0
5       0
       ..
1593    0
1594    0
1595    0
1597    0
1598    0
Name: quality, Length: 1359, dtype: int64

In [269]:
# Random sample of 15 rows to spot-check the binarized quality column.
df.sample(n=15)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
670,6.9,0.4,0.24,2.5,0.083,30.0,45.0,0.9959,3.26,0.58,10.0,0
878,8.8,0.61,0.19,4.0,0.094,30.0,69.0,0.99787,3.22,0.5,10.0,0
1443,6.9,0.58,0.2,1.75,0.058,8.0,22.0,0.99322,3.38,0.49,11.7,0
887,10.7,0.52,0.38,2.6,0.066,29.0,56.0,0.99577,3.15,0.79,12.1,1
862,7.5,0.42,0.32,2.7,0.067,7.0,25.0,0.99628,3.24,0.44,10.4,0
97,7.0,0.5,0.25,2.0,0.07,3.0,22.0,0.9963,3.25,0.63,9.2,0
378,11.4,0.625,0.66,6.2,0.088,6.0,24.0,0.9988,3.11,0.99,13.3,0
253,7.7,0.775,0.42,1.9,0.092,8.0,86.0,0.9959,3.23,0.59,9.5,0
1429,7.9,0.18,0.4,2.2,0.049,38.0,67.0,0.996,3.33,0.93,11.3,0
1069,8.0,0.62,0.35,2.8,0.086,28.0,52.0,0.997,3.31,0.62,10.8,0


# EXPLORATORY DATA ANALYSIS

In [270]:
# Pairwise relationship grid over all columns of the frame.
sns.pairplot(data=df)

<seaborn.axisgrid.PairGrid at 0x2543528c130>

In [271]:
# Bar chart of how many rows fall into each quality class.
# The column is passed by keyword: seaborn 0.11 emits a FutureWarning when it
# is given positionally, and from seaborn 0.12 on the first positional
# argument is interpreted as `data` rather than `x`.
sns.countplot(x="quality", data=df)



<AxesSubplot:xlabel='quality', ylabel='count'>

In [272]:
# Heatmap of the pairwise correlation matrix, with each cell's value annotated.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

<AxesSubplot:>

# SPLITTING DATA

In [273]:
# Model inputs, selected by column name instead of by position so the selection
# stays correct (or fails loudly with a KeyError) if the CSV column order changes.
# These are the same eight columns the positional indices 0,1,2,4,6,7,9,10 picked.
FEATURE_COLUMNS = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "chlorides",
    "total sulfur dioxide",
    "density",
    "sulphates",
    "alcohol",
]
x = df[FEATURE_COLUMNS]

In [274]:
# Display the selected feature columns to confirm the selection.
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol
0,7.4,0.700,0.00,0.076,34.0,0.99780,0.56,9.4
1,7.8,0.880,0.00,0.098,67.0,0.99680,0.68,9.8
2,7.8,0.760,0.04,0.092,54.0,0.99700,0.65,9.8
3,11.2,0.280,0.56,0.075,60.0,0.99800,0.58,9.8
5,7.4,0.660,0.00,0.075,40.0,0.99780,0.56,9.4
...,...,...,...,...,...,...,...,...
1593,6.8,0.620,0.08,0.068,38.0,0.99651,0.82,9.5
1594,6.2,0.600,0.08,0.090,44.0,0.99490,0.58,10.5
1595,5.9,0.550,0.10,0.062,51.0,0.99512,0.76,11.2
1597,5.9,0.645,0.12,0.075,44.0,0.99547,0.71,10.2


In [275]:
# Target selected by name rather than by position. The double brackets keep it
# a single-column DataFrame, the same shape the positional slice 11:12 produced.
y = df[["quality"]]

In [276]:
# Random sample of 10 target labels.
y.sample(n=10)

Unnamed: 0,quality
986,1
454,0
1315,0
598,0
1478,0
1411,0
796,0
523,0
1139,0
195,0


In [277]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [278]:
# Display the training features produced by the split.
x_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol
177,7.5,0.52,0.42,0.087,38.0,0.99720,0.61,10.5
1366,7.3,0.74,0.08,0.094,45.0,0.99576,0.50,9.8
617,11.5,0.31,0.51,0.079,28.0,0.99820,0.93,9.8
814,12.6,0.41,0.54,0.103,41.0,0.99939,0.76,11.3
1170,9.2,0.36,0.34,0.062,12.0,0.99667,0.67,10.5
...,...,...,...,...,...,...,...,...
1285,11.3,0.37,0.50,0.090,47.0,0.99734,0.57,10.5
1329,7.4,0.60,0.26,0.083,91.0,0.99616,0.56,9.8
1526,6.8,0.47,0.08,0.064,38.0,0.99553,0.65,9.6
1011,8.9,0.32,0.31,0.088,19.0,0.99570,0.55,10.4


In [279]:
# Display the training labels produced by the split.
y_train

Unnamed: 0,quality
177,0
1366,0
617,0
814,0
1170,0
...,...
1285,0
1329,0
1526,0
1011,0


# DECISION TREE

In [280]:
# Seed the classifier: scikit-learn randomly permutes the candidate features at
# every split, so an unseeded tree (and its accuracy) can differ between runs
# even on identical training data.
# NOTE: outputs recorded from earlier, unseeded runs may differ slightly.
DT = DecisionTreeClassifier(random_state=42)

In [281]:
# Fit the tree on the training split.
DT.fit(X=x_train, y=y_train)

In [282]:
# Predict labels for the held-out rows, then report the fraction predicted correctly.
y_pred = DT.predict(X=x_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.8529411764705882

# PICKLING THE MODEL

In [283]:
import pickle

# Serialize the fitted tree to model.pkl. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the previous
# bare open(...) call left the handle open.
with open("model.pkl", "wb") as model_file:
    pickle.dump(DT, model_file)

In [284]:
# Random sample of 15 training rows.
x_train.sample(n=15)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol
1488,5.6,0.54,0.04,0.049,13.0,0.9942,0.58,11.4
336,8.9,0.43,0.45,0.052,16.0,0.9948,0.7,12.5
529,9.9,0.63,0.24,0.077,33.0,0.9974,0.57,9.4
1497,6.9,0.74,0.03,0.054,16.0,0.99508,0.63,11.5
169,7.5,0.705,0.24,0.36,63.0,0.9964,1.59,9.5
516,12.5,0.6,0.49,0.1,14.0,1.001,0.74,11.9
839,6.0,0.5,0.04,0.092,26.0,0.99647,0.47,10.0
399,8.7,0.765,0.22,0.064,42.0,0.9963,0.55,9.4
468,11.4,0.36,0.69,0.09,21.0,1.0,0.62,9.2
214,7.8,0.735,0.08,0.092,41.0,0.9974,0.71,9.8
