In [228]:

import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import itertools


In [229]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
names = ['class','Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 
         'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
         'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
dataframe = pandas.read_csv(url, names=names)


In [230]:
dataframe.head()

Unnamed: 0,class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [231]:
array = dataframe.values
X = array[:,1:14]
Y = array[:,0]
X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=None)
results = []

**Logistic Regression**

In [240]:
logistic_clf = LogisticRegression()
logistic_clf.fit(X_train, Y_train)
logistic_clf_predictions_train = logistic_clf.predict(X_train)
logistic_clf_predictions_test = logistic_clf.predict(X_test)
logistic_clf_score = logistic_clf.score(X_train,Y_train)

## K-Fold Cross-Validation
logistic_kfold = model_selection.KFold(n_splits=10, random_state=7)
logistic_cv_results = model_selection.cross_val_score(logistic_clf, X_train, Y_train, cv=logistic_kfold, 
                                                      scoring='accuracy')
results.append(logistic_cv_results)


Accuracy: 0.96 (+/- 0.07)


** K Nearest Neighbours **

In [233]:

KNN_clf = KNeighborsClassifier()
KNN_clf.fit(X_train, Y_train)
KNN_clf_predictions_train = KNN_clf.predict(X_train)
KNN_clf_predictions_test = KNN_clf.predict(X_test)
KNN_clf_score = KNN_clf.score(X_train,Y_train)

## K-Fold Cross-Validation
KNN_kfold = model_selection.KFold(n_splits=10, random_state=7)
KNN_cv_results = model_selection.cross_val_score(KNN_clf, X_train, Y_train, cv=KNN_kfold, scoring='accuracy')
results.append(KNN_cv_results)

**Neural Networks**

In [239]:
MLPClassifier_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(9, 6), random_state=1)
MLPClassifier_clf.fit(X_train, Y_train)
MLPClassifier_clf_predictions_train = MLPClassifier_clf.predict(X_train)
MLPClassifier_clf_predictions_test = MLPClassifier_clf.predict(X_test)
MLPClassifier_clf_score = MLPClassifier_clf.score(X_train,Y_train)

## K-Fold Cross-Validation
MLPClassifier_kfold = model_selection.KFold(n_splits=10, random_state=7)
MLPClassifier_cv_results = model_selection.cross_val_score(MLPClassifier_clf, 
                                                           X_train, Y_train, 
                                                           cv=MLPClassifier_kfold, 
                                                           scoring='accuracy')

array([ 0.35714286,  0.64285714,  0.71428571,  0.61538462,  0.84615385,
        0.84615385,  0.69230769,  0.69230769,  0.69230769,  0.69230769])

**Performance Comparison**

In [241]:
print("Results for Logistic Regression classifier")
print("Accuracy on the train data is " + str(np.mean(logistic_clf_predictions_train == Y_train)*100) + "%")
print("Accuracy on the test data is " + str(np.mean(logistic_clf_predictions_test == Y_test)*100) + "%")
print("Accuracy of the Cross-Validation: %0.2f (+/- %0.2f)" % (logistic_cv_results.mean(), 
                                                               logistic_cv_results.std() * 2))

print("Results for K Nearest Neighbour")
print("Accuracy on the train data is " + str(np.mean(KNN_clf_predictions_train == Y_train)*100) + "%")
print("Accuracy on the test data is " + str(np.mean(KNN_clf_predictions_test == Y_test)*100) + "%")
print("Accuracy of the Cross-Validation: %0.2f (+/- %0.2f)" % (KNN_cv_results.mean(), 
                                                               KNN_cv_results.std() * 2))

print("Results for Neural Networks")
print("Accuracy on the train data is " + str(np.mean(MLPClassifier_clf_predictions_train == Y_train)*100) + "%")
print("Accuracy on the test data is " + str(np.mean(MLPClassifier_clf_predictions_test == Y_test)*100) + "%")
print("Accuracy of the Cross-Validation: %0.2f (+/- %0.2f)" % (MLPClassifier_cv_results.mean(), 
                                                               MLPClassifier_cv_results.std() * 2))


Results for Logistic Regression classifier
Accuracy on the train data is 98.4962406015%
Accuracy on the test data is 93.3333333333%
Accuracy of the Cross-Validation: 0.96 (+/- 0.07)
Results for K Nearest Neighbour
Accuracy on the train data is 76.6917293233%
Accuracy on the test data is 64.4444444444%
Accuracy of the Cross-Validation: 0.72 (+/- 0.20)
Results for Neural Networks
Accuracy on the train data is 72.1804511278%
Accuracy on the test data is 60.0%
Accuracy of the Cross-Validation: 0.68 (+/- 0.26)


**Tuning the neural network model**

In [242]:
#Grid Search
parameters = {'solver':['lbfgs'], 'alpha':[1e-5], 'random_state':[1],'hidden_layer_sizes':
              list(itertools.product([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))}
nn = MLPClassifier()
neural_net_grid = GridSearchCV(nn, parameters)
neural_net_grid.fit(X_train, Y_train)
neural_net_grid_pred_train = neural_net_grid.predict(X_train)
neural_net_grid_pred_test = neural_net_grid.predict(X_test)
print("Results for Neural Networks")
print("Accuracy on the train data is " + str(np.mean(neural_net_grid_pred_train == Y_train)*100) + "%")
print("Accuracy on the train data is " + str(np.mean(neural_net_grid_pred_test == Y_test)*100) + "%")
print("The best estimator for the hidden layers is - " + str(neural_net_grid.best_estimator_))

Results for Neural Networks
Accuracy on the train data is 94.7368421053%
Accuracy on the train data is 88.8888888889%
The best estimator for the hidden layers is - MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [237]:
print("Results for tuned Neural Network")
print("Accuracy on the train data is " + str(np.mean(neural_net_grid_pred_train == Y_train)*100) + "%")
print(classification_report(y_true=Y_train, y_pred=neural_net_grid_pred_train))
print("Accuracy on the test data is " + str(np.mean(neural_net_grid_pred_test == Y_test)*100) + "%")
print(classification_report(y_true=Y_test, y_pred=neural_net_grid_pred_test))
print("Confusion Matrix for Train Data")
print(confusion_matrix(Y_train, neural_net_grid_pred_train))
print("Confusion Matrix for Test Data")
print(confusion_matrix(Y_test, neural_net_grid_pred_test))
print("--------------------------------------------------------------------------------------------------")


Results for tuned Neural Network
Accuracy on the train data is 94.7368421053%
             precision    recall  f1-score   support

        1.0       0.97      0.88      0.93        43
        2.0       0.93      0.97      0.95        58
        3.0       0.94      1.00      0.97        32

avg / total       0.95      0.95      0.95       133

Accuracy on the test data is 88.8888888889%
             precision    recall  f1-score   support

        1.0       0.93      0.81      0.87        16
        2.0       0.92      0.85      0.88        13
        3.0       0.84      1.00      0.91        16

avg / total       0.89      0.89      0.89        45

Confusion Matrix for Train Data
[[38  4  1]
 [ 1 56  1]
 [ 0  0 32]]
Confusion Matrix for Test Data
[[13  1  2]
 [ 1 11  1]
 [ 0  0 16]]
--------------------------------------------------------------------------------------------------
