# Wine dataset analysis


## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the dataset

In [1]:
from sklearn.datasets import load_wine
wine = load_wine()

In [3]:
print(wine['DESCR'])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [4]:
dataset = pd.DataFrame(wine['data'], columns=wine['feature_names'])
dataset['target'] = wine['target']
dataset

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [5]:
dataset.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [6]:
X = dataset.iloc[:, :-1].values
y = dataset['target'].values

In [7]:
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [8]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

## Spliting the dataset into train and test set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

## Fitting the logistic regression model

In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
y_pred = model.predict(X_test)
y_pred

array([1, 2, 2, 2, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 2, 2, 1, 1,
       2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 0, 2])

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 8,  1,  0],
       [ 0, 16,  0],
       [ 0,  1, 10]])

In [15]:
print(accuracy_score(y_test, y_pred))

0.9444444444444444


In [16]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.97 
Standard Deviation: 2.67 


## Fitting the SVM Model

In [29]:
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [30]:
y_pred = svm.predict(X_test)

In [31]:
confusion_matrix(y_test, y_pred)

array([[ 9,  0,  0],
       [ 2, 14,  0],
       [ 0,  1, 10]])

In [32]:
print(accuracy_score(y_test, y_pred))

0.9166666666666666


In [33]:
accuracies = cross_val_score(estimator=svm, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.96 
Standard Deviation: 2.63 


## Fitting the Naivebayes model

In [34]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
y_pred = nb_model.predict(X_test)

In [36]:
confusion_matrix(y_test, y_pred)

array([[ 8,  1,  0],
       [ 0, 16,  0],
       [ 0,  0, 11]])

In [37]:
print(accuracy_score(y_test, y_pred))


0.9722222222222222


In [38]:
accuracies = cross_val_score(estimator=nb_model, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.97 
Standard Deviation: 1.41 


## Fitting the DecisionTree Model

In [39]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [40]:
y_pred = dt_model.predict(X_test)

In [41]:
confusion_matrix(y_test, y_pred)

array([[ 9,  0,  0],
       [ 1, 14,  1],
       [ 0,  1, 10]])

In [42]:
accuracy_score(y_test, y_pred)

0.9166666666666666

In [43]:
accuracies = cross_val_score(estimator=dt_model, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.86 
Standard Deviation: 6.37 


## Fitting the RandomForest Model

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=50)
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [45]:
y_pred = rf_model.predict(X_test)

In [46]:
confusion_matrix(y_test, y_pred)

array([[ 9,  0,  0],
       [ 1, 15,  0],
       [ 0,  0, 11]])

In [47]:
accuracy_score(y_test, y_pred)

0.9722222222222222

In [48]:
accuracies = cross_val_score(estimator=rf_model, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

Accuracy: 0.97 
Standard Deviation: 2.59 


## Fitting the catboost model

In [49]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train)

97s	remaining: 5.01s
614:	learn: 0.0067725	total: 7.98s	remaining: 4.99s
615:	learn: 0.0067519	total: 7.99s	remaining: 4.98s
616:	learn: 0.0067424	total: 8.01s	remaining: 4.97s
617:	learn: 0.0067345	total: 8.04s	remaining: 4.97s
618:	learn: 0.0067223	total: 8.05s	remaining: 4.96s
619:	learn: 0.0067123	total: 8.06s	remaining: 4.94s
620:	learn: 0.0066990	total: 8.08s	remaining: 4.93s
621:	learn: 0.0066851	total: 8.09s	remaining: 4.92s
622:	learn: 0.0066731	total: 8.12s	remaining: 4.91s
623:	learn: 0.0066634	total: 8.12s	remaining: 4.89s
624:	learn: 0.0066508	total: 8.14s	remaining: 4.88s
625:	learn: 0.0066354	total: 8.15s	remaining: 4.87s
626:	learn: 0.0066236	total: 8.16s	remaining: 4.86s
627:	learn: 0.0066112	total: 8.18s	remaining: 4.84s
628:	learn: 0.0066013	total: 8.19s	remaining: 4.83s
629:	learn: 0.0065889	total: 8.2s	remaining: 4.82s
630:	learn: 0.0065816	total: 8.22s	remaining: 4.8s
631:	learn: 0.0065674	total: 8.23s	remaining: 4.79s
632:	learn: 0.0065574	total: 8.24s	remaining:

<catboost.core.CatBoostClassifier at 0x7f91d504eed0>

In [50]:
y_pred = cat_model.predict(X_test)

In [51]:
confusion_matrix(y_test, y_pred)

array([[ 9,  0,  0],
       [ 1, 15,  0],
       [ 0,  0, 11]])

In [52]:
accuracy_score(y_test, y_pred)

0.9722222222222222

In [53]:
accuracies = cross_val_score(estimator=cat_model, X=X_train, y=y_train, cv=5)
print("Accuracy: {:.2f} ".format(accuracies.mean()))
print("Standard Deviation: {:.2f} ".format(accuracies.std()*100))

emaining: 3.95s
615:	learn: 0.0082250	total: 6.32s	remaining: 3.94s
616:	learn: 0.0082095	total: 6.32s	remaining: 3.92s
617:	learn: 0.0081951	total: 6.33s	remaining: 3.91s
618:	learn: 0.0081793	total: 6.34s	remaining: 3.9s
619:	learn: 0.0081696	total: 6.35s	remaining: 3.89s
620:	learn: 0.0081550	total: 6.35s	remaining: 3.88s
621:	learn: 0.0081357	total: 6.36s	remaining: 3.87s
622:	learn: 0.0081210	total: 6.38s	remaining: 3.86s
623:	learn: 0.0081069	total: 6.38s	remaining: 3.85s
624:	learn: 0.0080959	total: 6.39s	remaining: 3.84s
625:	learn: 0.0080802	total: 6.4s	remaining: 3.82s
626:	learn: 0.0080634	total: 6.41s	remaining: 3.81s
627:	learn: 0.0080503	total: 6.42s	remaining: 3.8s
628:	learn: 0.0080353	total: 6.42s	remaining: 3.79s
629:	learn: 0.0080202	total: 6.43s	remaining: 3.78s
630:	learn: 0.0080091	total: 6.44s	remaining: 3.77s
631:	learn: 0.0079978	total: 6.45s	remaining: 3.75s
632:	learn: 0.0079828	total: 6.46s	remaining: 3.74s
633:	learn: 0.0079643	total: 6.46s	remaining: 3.73s