In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Get our dataset

In [2]:

# Load the wine dataset that ships with scikit-learn (see the sklearn.datasets import above).
wine_data = load_wine()

In [3]:
# List the attributes available on the loaded dataset object.
dir(wine_data)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

# Look at its characteristics

In [4]:
# Print the free-text description bundled with the dataset.
print(wine_data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [5]:
# Pull the feature matrix and the class labels out of the dataset object, then
# wrap the features in a DataFrame so every column carries its feature name.
features, target = wine_data.data, wine_data.target
wine_df = pd.DataFrame(data=features, columns=wine_data.feature_names)



Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


# Train our model

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    wine_df,
    wine_data.target,
    test_size=0.3,
    random_state=42,
)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import KFold, cross_val_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost.sklearn import XGBClassifier

In [9]:
# Candidate ensemble classifiers to compare, each paired with a short label and
# left at its default hyper-parameters.
# NOTE(review): no random_state is passed to any of them, so scores may differ
# from run to run -- confirm whether a fixed seed is wanted here.
models = [
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('ET', ExtraTreesClassifier()),
    ('RF', RandomForestClassifier()),
    ('XGB', XGBClassifier()),
]

In [10]:
# Score every candidate model in two ways:
#   results[name]  -> (mean, std) of the 10-fold cross-validated accuracy on the training split
#   accuracy[name] -> accuracy on the held-out test split after fitting on the whole training split
results = {}
accuracy = {}

# The splitter does not depend on the model, so build it once rather than on
# every loop iteration.
kfold = KFold(n_splits=10)

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results[name] = (cv_results.mean(), cv_results.std())

    # Fit on the full training split, then score on the held-out test split.
    model.fit(X_train, y_train)
    # Use a real name instead of `_`: IPython reserves `_` for the previous
    # cell output, and assigning to it here would clobber that.
    y_pred = model.predict(X_test)
    accuracy[name] = accuracy_score(y_test, y_pred)

In [11]:
# Show the (mean, std) cross-validation accuracy recorded for each model label.
results

{'AB': (0.887820512820513, 0.08883468309420073),
 'GBM': (0.9198717948717947, 0.05081922557815023),
 'ET': (0.9762820512820513, 0.049987671063799755),
 'RF': (0.9685897435897436, 0.051637390700375194),
 'XGB': (0.9525641025641025, 0.06222903127954666)}

In [12]:
# Show the held-out test accuracy recorded for each model label.
accuracy

{'AB': 0.9259259259259259,
 'GBM': 0.9074074074074074,
 'ET': 1.0,
 'RF': 1.0,
 'XGB': 0.9444444444444444}

In [13]:
import matplotlib.pyplot as plt

# Train a fresh ExtraTrees classifier on the training split and keep it as
# `best_model`.
best_model = ExtraTreesClassifier().fit(X_train, y_train)

# Importance of each feature as reported by the fitted ensemble.
feature_importance = best_model.feature_importances_

# Per-feature standard deviation of the importances across the individual
# trees of the ensemble (axis 0 runs over the trees).
# NOTE(review): despite its name, this variable holds a spread measure, not a
# normalised importance; the name is kept so later cells keep working.
per_tree_importances = [est.feature_importances_ for est in best_model.estimators_]
feature_importance_normalized = np.std(per_tree_importances, axis=0)


alcohol 0.10004141812906875
malic_acid 0.05503066785868711
ash 0.02446827568899026
alcalinity_of_ash 0.05237086622935617
magnesium 0.04029384421338456
total_phenols 0.0802756666852924
flavanoids 0.12438061438946629
nonflavanoid_phenols 0.023044905791050067
proanthocyanins 0.04861831300381969
color_intensity 0.11633632327962189
hue 0.08122441236113437
od280/od315_of_diluted_wines 0.10256708362100582
proline 0.1097889025902571


# Persist Model

In [14]:
import joblib

# Serialise the fitted model to disk in the current working directory.
# The call stays as the cell's final expression so the list of written
# file names is displayed as the cell output.
joblib.dump(value=best_model, filename='extratrees.joblib')

['extratrees.joblib']