In [47]:

import pandas
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import GridSearchCV
import itertools


In [48]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
names = ['class','Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 
         'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
         'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
dataframe = pandas.read_csv(url, names=names)


In [49]:
dataframe.head()

Unnamed: 0,class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [50]:
array = dataframe.values
X = array[:,1:14]
Y = array[:,0]
X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=None)

In [51]:
seed = 7
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('NeuralNetworks',MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)))

In [52]:

results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    cv_results2 = model_selection.cross_val_score(model, X_test, Y_test, cv=kfold, scoring=scoring)
    results.append(cv_results)
    results.append(cv_results2)
    names.append(name)
    msg = "%s: Train: %f (%f) Test: %f (%f)" % (name, cv_results.mean(), cv_results.std(), cv_results2.mean(), cv_results2.std())
    print(msg)

LR: Train: 0.961538 (0.062017) Test: 0.885000 (0.184459)
LDA: Train: 0.992857 (0.021429) Test: 0.905000 (0.161941)
KNN: Train: 0.720879 (0.109802) Test: 0.725000 (0.232648)
CART: Train: 0.901648 (0.059404) Test: 0.835000 (0.176139)
NB: Train: 0.969231 (0.051025) Test: 0.975000 (0.075000)
QDA: Train: 0.985165 (0.029696) Test: 0.395000 (0.221867)
RandomForest: Train: 0.976923 (0.035251) Test: 0.915000 (0.105000)
NeuralNetworks: Train: 0.441758 (0.154477) Test: 0.415000 (0.262726)


In [53]:

model = RandomForestClassifier()
parameters = [{"n_estimators": [250, 500, 1000]}]
# Returns the best configuration for a model using crosvalidation
# and grid search
clf = GridSearchCV(model, parameters,n_jobs=-1)
print(clf.fit(X,Y))       
print(clf.best_score_)                               
print(clf.best_estimator_)
cross_val_score(clf, X, Y)


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'n_estimators': [250, 500, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
0.949438202247
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            rand

array([ 0.93333333,  0.91666667,  0.98275862])