# Importing Necessary Packages 

In [153]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [154]:
# Load the wine dataset from the CSV file next to the notebook and preview it.
df = pd.read_csv(filepath_or_buffer='winequalityN.csv')
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [155]:
# Count the missing values in each column.
df.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

# Null values imputation

In [156]:
# Impute missing numeric values with the mean of their own column.
# The result is reassigned instead of using inplace=True, so the step stays
# chainable and does not rely on in-place mutation of the frame.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed further down, so held-out rows influence the
# imputed values. Consider computing the means on the training split only.
df = df.fillna(df.mean(numeric_only=True))

# Confirm that no missing values remain.
df.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

# Encoding of non-numeric columns (type)

In [157]:
# Inspect the column dtypes to spot non-numeric columns that need encoding.
df.dtypes

type                     object
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [158]:
# Encode the wine type as an integer: 0 for red, 1 for white.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on the already-encoded column is a no-op (a plain
# .map(dict) would turn the 0/1 codes into NaN on a second run).
type_codes = {'red': 0, 'white': 1}
df['type'] = df['type'].map(lambda wine_type: type_codes.get(wine_type, wine_type))

# Fail loudly on an unexpected label instead of carrying it into the models.
assert df['type'].isin([0, 1]).all(), "unexpected value in 'type' column"

df.dtypes

type                      int64
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

# Implementing feature selection using a decision tree

In [159]:
# Number of wines per raw quality score, ordered by score.
df['quality'].value_counts().sort_index()


quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
Name: count, dtype: int64

In [160]:
# Binary target derived from the raw quality score:
#   1 -> good wine     (quality >= 6)
#   0 -> not good wine (quality <  6)
# The comparison is vectorised; the boolean mask is cast to int64 labels.
df['wine_quality_category'] = (df['quality'] >= 6).astype('int64')
df


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_quality_category
0,1,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,6,1
1,1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,6,1
2,1,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,6,1
3,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,1
4,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,5,0
6493,0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,6,1
6494,0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,6,1
6495,0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,5,0


In [161]:
# Re-check the column dtypes now that wine_quality_category has been added.
df.dtypes

type                       int64
fixed acidity            float64
volatile acidity         float64
citric acid              float64
residual sugar           float64
chlorides                float64
free sulfur dioxide      float64
total sulfur dioxide     float64
density                  float64
pH                       float64
sulphates                float64
alcohol                  float64
quality                    int64
wine_quality_category      int64
dtype: object

In [162]:
# Rank the predictors with an entropy-based decision tree fitted on all rows.
# Predictors: every column except the raw score and the label derived from it.
X = df.drop(['quality', 'wine_quality_category'], axis=1)
y = df[['wine_quality_category']]

# A fixed seed makes the fitted tree, and therefore the importances read from
# it, reproducible across runs: scikit-learn permutes the candidate features
# at every split, so equally good splits are otherwise resolved at random.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, y)

# Feature names in the order seen during fit; clf.feature_importances_ uses
# the same order.
clf.feature_names_in_

array(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype=object)

In [163]:
# Pair every feature name with the importance the fitted tree assigned to it.
feature_scores = pd.DataFrame({
    'Name': clf.feature_names_in_,
    'Importance': clf.feature_importances_,
})
feature_scores

Unnamed: 0,Name,Importance
0,type,0.001935
1,fixed acidity,0.066685
2,volatile acidity,0.129016
3,citric acid,0.06705
4,residual sugar,0.062425
5,chlorides,0.085693
6,free sulfur dioxide,0.087599
7,total sulfur dioxide,0.080463
8,density,0.068592
9,pH,0.068211


# Creating Naive Bayes Classifier Model

In [164]:
# Inputs for the Naive Bayes classifier: the twelve predictor columns and the
# binary quality label.
X_encoded = df.loc[:, [
    'type',
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]]
y_encoded = df['wine_quality_category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [165]:
# Fit Gaussian Naive Bayes on the training split (fit returns the estimator
# itself) and predict the held-out rows.
clf1 = GaussianNB().fit(X_train, y_train)
y_pred = clf1.predict(X_test)

# Report accuracy, the confusion matrix and the per-class metrics, one per line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.6961538461538461
[[245 223]
 [172 660]]
              precision    recall  f1-score   support

           0       0.59      0.52      0.55       468
           1       0.75      0.79      0.77       832

    accuracy                           0.70      1300
   macro avg       0.67      0.66      0.66      1300
weighted avg       0.69      0.70      0.69      1300

