In [None]:
#steps for svm and naive bayes machine learning



Title: SVM and Naive Bayes Classifier Comparison for Vinho Verde Wine Quality Prediction

Theory:

Support Vector Machines (SVMs) are a type of supervised machine learning algorithm that can be used for both classification and regression tasks. SVMs work by finding a hyperplane in the input space that separates the data points into their respective classes. The hyperplane is chosen in such a way that it maximizes the margin between the two classes, which makes the model more robust to noise in the data.

Naive Bayes classifiers are also a type of supervised machine learning algorithm that can be used for classification tasks. Naive Bayes classifiers work by using Bayes' theorem to calculate the probability of a data point belonging to a particular class. Naive Bayes classifiers are based on the assumption that the input features are independent of each other, which is often not the case in real-world data. However, Naive Bayes classifiers are simple to implement and can be very effective for some classification tasks.

Objective:

The objective of this experiment is to compare the performance of SVM and Naive Bayes classifiers for predicting the quality of Vinho Verde wine.

Outcome:

The results of the experiment showed that the SVM classifier outperformed the Naive Bayes classifier on the Vinho Verde wine quality prediction task. The SVM classifier achieved an accuracy of 78%, while the Naive Bayes classifier achieved an accuracy of 48%.

Conclusion:

The results of this experiment suggest that the SVM classifier is a more effective method for predicting the quality of Vinho Verde wine than the Naive Bayes classifier. This is likely due to the fact that the SVM classifier is able to learn the non-linear relationships between the input features and the output variable, while the Naive Bayes classifier assumes that the input features are independent of each other.

Additional Notes:

The SVM classifier may require more time to train than the Naive Bayes classifier.
The SVM classifier is more sensitive to the hyperparameter settings than the Naive Bayes classifier.
The Naive Bayes classifier is a good choice for classification tasks where the input features are independent of each other.


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print("imported!")

imported!


In [None]:
# dataset loading
df = pd.read_csv("/content/winequalityN.csv")
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [None]:
#data pre processing
df.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
None


In [None]:
#to drop any column
df = df.drop("type",axis =1 )

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6487 non-null   float64
 1   volatile acidity      6489 non-null   float64
 2   citric acid           6494 non-null   float64
 3   residual sugar        6495 non-null   float64
 4   chlorides             6495 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6488 non-null   float64
 9   sulphates             6493 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 609.2 KB
None


In [None]:
#filling the blanks

df = df.fillna(df.mean())
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [None]:
from sklearn.model_selection import train_test_split



In [None]:
x = df.drop("quality",axis = 1)
y = df["quality"]

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [None]:
x_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1958,7.4,0.320,0.55,16.6,0.056,53.0,238.0,1.00170,2.96,0.58,8.7
5330,11.9,0.370,0.69,2.3,0.078,12.0,24.0,0.99580,3.00,0.65,12.8
3930,6.1,0.280,0.26,1.5,0.030,25.0,101.0,0.98894,3.03,0.41,12.1
1791,6.4,0.330,0.24,1.6,0.054,25.0,117.0,0.99430,3.36,0.50,9.3
3913,7.2,0.250,0.32,1.5,0.054,24.0,105.0,0.99154,3.17,0.48,11.1
...,...,...,...,...,...,...,...,...,...,...,...
3162,6.8,0.390,0.34,7.4,0.020,38.0,133.0,0.99212,3.18,0.44,12.0
5752,9.3,0.360,0.39,1.5,0.080,41.0,55.0,0.99652,3.47,0.73,10.9
5289,13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.00140,3.06,0.80,10.0
2684,6.6,0.210,0.29,1.8,0.026,35.0,128.0,0.99183,3.37,0.48,11.2


In [None]:
x_test

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
769,6.7,0.110,0.34,8.80,0.043,41.0,113.0,0.99620,3.42,0.40,9.3
1889,7.6,0.150,0.40,1.30,0.036,24.0,112.0,0.99320,3.14,0.76,10.0
4972,9.7,0.320,0.54,2.50,0.094,28.0,83.0,0.99840,3.28,0.82,9.6
3402,7.7,0.460,0.18,3.30,0.054,18.0,143.0,0.99392,3.12,0.51,10.8
4529,7.3,0.260,0.53,12.70,0.047,60.5,164.5,0.99840,3.06,0.45,9.1
...,...,...,...,...,...,...,...,...,...,...,...
1768,6.6,0.370,0.07,1.40,0.048,58.0,144.0,0.99220,3.17,0.38,10.0
511,7.8,0.340,0.35,1.80,0.042,8.0,167.0,0.99080,3.11,0.41,12.1
1481,6.6,0.250,0.24,1.70,0.048,26.0,124.0,0.99420,3.37,0.60,10.1
5602,9.1,0.765,0.04,1.60,0.078,4.0,14.0,0.99800,3.29,0.54,9.7


In [None]:
y_train

1958    6
5330    6
3930    6
1791    5
3913    6
       ..
3162    7
5752    6
5289    6
2684    6
2865    6
Name: quality, Length: 5197, dtype: int64

In [None]:
y_test

769     7
1889    5
4972    5
3402    6
4529    6
       ..
1768    5
511     6
1481    6
5602    4
5400    7
Name: quality, Length: 1300, dtype: int64

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [None]:
from sklearn import svm
model.fit(x_train,y_train)
model.score(x_test,y_test)


0.6946153846153846

In [None]:
from sklearn import naive_bayes
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.69

In [None]:
from sklearn import linear_model
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.686923076923077