In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('winequality.csv')
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
df.shape

(6497, 13)

In [4]:
df.isnull().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
df.duplicated().sum()

1168

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated().sum()

0

In [10]:
df.shape

(5295, 13)

In [11]:
df.dtypes

type                     object
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [12]:
df.drop(['type'],axis=1,inplace=True)

In [13]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [14]:
d1=df['quality'].value_counts()
d1

6    2311
5    1745
7     852
4     204
8     148
3      30
9       5
Name: quality, dtype: int64

In [16]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]
print(type(x))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [17]:
print(x.shape)
print(y.shape)

(5295, 11)
(5295,)


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3971, 11)
(1324, 11)
(3971,)
(1324,)


In [21]:
def gen_cls_metrics(ytest,ypred):
    print('Accuracy Score',accuracy_score(ytest,ypred))
    cm=confusion_matrix(ytest,ypred)
    print(cm)
    print(classification_report(ytest,ypred,zero_division=0))

def train_test_score(model):
    print('Training Score',model.score(x_train,y_train))
    print('Testing Score',model.score(x_test,y_test))

## SVM

In [22]:
m1=SVC(kernel='linear',C=1)
m1.fit(x_train,y_train)

SVC(C=1, kernel='linear')

In [23]:
train_test_score(m1)

Training Score 0.537899773356837
Testing Score 0.5392749244712991


In [24]:
ypred_m1=m1.predict(x_test)
print(ypred_m1)

[6 6 5 ... 6 6 6]


In [25]:
print('Metrics for SVM Classifier')
gen_cls_metrics(y_test,ypred_m1)

Metrics for SVM Classifier
Accuracy Score 0.5392749244712991
[[  0   0   2   3   0   0]
 [  0   0  37  14   0   0]
 [  0   0 265 164   0   0]
 [  0   0 136 449   0   0]
 [  0   0   6 214   0   0]
 [  0   0   2  32   0   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        51
           5       0.59      0.62      0.60       429
           6       0.51      0.77      0.61       585
           7       0.00      0.00      0.00       220
           8       0.00      0.00      0.00        34

    accuracy                           0.54      1324
   macro avg       0.18      0.23      0.20      1324
weighted avg       0.42      0.54      0.47      1324



In [31]:
print('Accuracy Score for Decision Tree Classifier:',accuracy_score(y_test,ypred_m1)*100)

Accuracy Score for Decision Tree Classifier: 53.92749244712991


## Decision Tree

In [26]:
m2=DecisionTreeClassifier(criterion='gini',max_depth=7,min_samples_split=12)
m2.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=7, min_samples_split=12)

In [27]:
train_test_score(m2)

Training Score 0.6325862503147822
Testing Score 0.5120845921450151


In [28]:
ypred_m2=m2.predict(x_test)
print(ypred_m2)

[5 5 5 ... 6 6 6]


In [29]:
print('Metrics for Decision Tree Classifier')
gen_cls_metrics(y_test,ypred_m2)

Metrics for Decision Tree Classifier
Accuracy Score 0.5120845921450151
[[  0   0   1   2   2   0]
 [  0   1  37   8   5   0]
 [  0   3 250 162  14   0]
 [  0   4 162 346  73   0]
 [  0   0  19 120  81   0]
 [  0   0   0  21  13   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.12      0.02      0.03        51
           5       0.53      0.58      0.56       429
           6       0.53      0.59      0.56       585
           7       0.43      0.37      0.40       220
           8       0.00      0.00      0.00        34

    accuracy                           0.51      1324
   macro avg       0.27      0.26      0.26      1324
weighted avg       0.48      0.51      0.49      1324



In [30]:
print('Accuracy Score for Decision Tree Classifier:',accuracy_score(y_test,ypred_m2)*100)

Accuracy Score for Decision Tree Classifier: 51.20845921450151
