# Bus241 datasets: Wine quality data example
* Source:  ML repository
* Wine quality by experts (0 - 10)
* Predictors:  Wine chemical composition
* This can be done as either classification, or regression


## Load tools

In [3]:
%matplotlib inline
# Import lots of tools
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [4]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

## Load and process data

In [5]:
# load default data set
wineall = pd.read_csv("winequality-red.csv")
# print(wineall.shape)
print(wineall.head())
X = wineall.values[:,0:11].copy()
y = wineall.quality.values
print(X.shape)
print([x for x in wineall])

   fixedAcidity  volatileAcidity  citricAcid  residualSugar  chlorides  \
0           7.4             0.70        0.00            1.9      0.076   
1           7.8             0.88        0.00            2.6      0.098   
2           7.8             0.76        0.04            2.3      0.092   
3          11.2             0.28        0.56            1.9      0.075   
4           7.4             0.70        0.00            1.9      0.076   

   freeSulfurDioxide  totalSulfurDioxide  density    pH  sulphates  alcohol  \
0               11.0                34.0   0.9978  3.51       0.56      9.4   
1               25.0                67.0   0.9968  3.20       0.68      9.8   
2               15.0                54.0   0.9970  3.26       0.65      9.8   
3               17.0                60.0   0.9980  3.16       0.58      9.8   
4               11.0                34.0   0.9978  3.51       0.56      9.4   

   quality  
0        5  
1        5  
2        5  
3        6  
4        5  
(1

# Training Set and Test Set

In [6]:
# print(wineall.iloc[:10, :13])
size = int(len(wineall)* 0.8)
Train_X = wineall.iloc[:size, :11]
Test_X = wineall.iloc[size:, :11]
Train_Y = wineall.iloc[:size, 11]
Test_Y = wineall.iloc[size:, 11]


# 3 Regression Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=5, random_state=0)
tree.fit(Train_X, Train_Y)

print("Accuracy on training set: {:.3f}".format(tree.score(Train_X, Train_Y)))
print("Accuracy on test set: {:.3f}".format(tree.score(Test_X, Test_Y)))


Accuracy on training set: 0.679
Accuracy on test set: 0.581


# 4 Lasso & Ridge

In [8]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# ridge
ridge = Ridge(alpha=0.05)
ridge.fit(Train_X, Train_Y)

print("Ridge Train: ", ridge.score(Train_X,Train_Y))
print("Ridge Test: ", ridge.score(Test_X,Test_Y))

# lasso
lasso = Lasso(alpha=0.05)
lasso.fit(Train_X, Train_Y)
print("Lasso Train: ", lasso.score(Train_X,Train_Y))
print("Lasso Test: ", lasso.score(Test_X, Test_Y))

Ridge Train:  0.3693102258825097
Ridge Test:  0.28741462976163346
Lasso Train:  0.2788392551518688
Lasso Test:  0.1156891074875579


# 5 LinearSVC

In [9]:
from sklearn.svm import LinearSVC

# LinearSVCMod = LinearSVC(C=10000.)
LinearSVCMod = LinearSVC(C=10., random_state=2, max_iter=10000)
LinearSVCMod.fit(Train_X,Train_Y)
print('Linear SVC Train accuracy:', LinearSVCMod.score(Train_X,Train_Y))
print('Linear SVC Test accuracy:', LinearSVCMod.score(Test_X,Test_Y))
print('fraction y = 1:', np.mean(y))

LinearSVCMod = LinearSVC(C=10., random_state=4, max_iter=10000)
LinearSVCMod.fit(Train_X,Train_Y)
print('Linear SVC Train accuracy:', LinearSVCMod.score(Train_X,Train_Y))
print('Linear SVC Test accuracy:', LinearSVCMod.score(Test_X,Test_Y))
print('fraction y = 1:', np.mean(y))

LinearSVCMod = LinearSVC(C=10., random_state=6, max_iter=10000)
LinearSVCMod.fit(Train_X,Train_Y)
print('Linear SVC Train accuracy:', LinearSVCMod.score(Train_X,Train_Y))
print('Linear SVC Test accuracy:', LinearSVCMod.score(Test_X,Test_Y))
print('fraction y = 1:', np.mean(y))

Linear SVC Train accuracy: 0.5254104769351056
Linear SVC Test accuracy: 0.521875
fraction y = 1: 5.6360225140712945
Linear SVC Train accuracy: 0.5136825645035183
Linear SVC Test accuracy: 0.49375
fraction y = 1: 5.6360225140712945
Linear SVC Train accuracy: 0.49726348709929635
Linear SVC Test accuracy: 0.425
fraction y = 1: 5.6360225140712945


# 6 monte-carlo Comparing two methods

In [10]:
# Gradient Boosting 
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(n_estimators=100,max_depth=5,learning_rate=0.01)
cvf = ShuffleSplit(n_splits=25, test_size=0.25)
scores1 = cross_val_score(gbrt, Train_X, Train_Y, cv=cvf)
print("Gradient Boosting Mean Score of Training Set: ", np.mean(scores1))

# KNN 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
cvf = ShuffleSplit(n_splits=25, test_size=0.25)
scores2 = cross_val_score(knn, Train_X, Train_Y, cv=cvf)
print("KNN Mean Score:", np.mean(scores2))

print("Difference of Training Set: ", abs(np.mean(scores1) - np.mean(scores2)))

Gradient Boosting Mean Score of Training Set:  0.6325000000000001
KNN Mean Score: 0.508
Difference of Training Set:  0.12450000000000006


# 7 Random Forest vs Single Tree

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest
forest = RandomForestClassifier(n_estimators=20,max_features=4, max_depth=5)
cvf = ShuffleSplit(n_splits=25, test_size=0.25)
scores = cross_val_score(forest, Train_X, Train_Y, cv=cvf)
print("Random Forest: ", np.mean(scores))


# Single Tree: Decision Tree 
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
tree.fit(Train_X, Train_Y)
print("Single Tree: {:.3f}".format(tree.score(Test_X, Test_Y)))

Random Forest:  0.61425
Single Tree: 0.581


# 8 Two ML Methods

In [14]:
# SVM with kernal
from sklearn.svm import SVC
svc = SVC(C=10,kernel='rbf',gamma=0.05)
svc.fit(Train_X, Train_Y)

print("Accuracy on training set: {:.3f}".format(svc.score(Train_X, Train_Y)))
print("Accuracy on test set: {:.3f}".format(svc.score(Test_X, Test_Y)))

# Naive Bayes With Gaussian
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(priors=None)
trainFit = gnb.fit(Train_X, Train_Y)
print("Accuracy on training set: ", trainFit.score(Train_X, Train_Y))
print("Accuracy on test set: ", trainFit.score(Test_X, Test_Y))


Accuracy on training set: 0.880
Accuracy on test set: 0.447
Accuracy on training set:  0.5684128225175918
Accuracy on test set:  0.6


# 9 SVM 

In [13]:
# C=1000
from sklearn.svm import SVC
svc = SVC(C=1000,kernel='rbf',gamma=1.)
svc.fit(Train_X, Train_Y)

# print("Accuracy on training set: {:.3f}".format(svc.score(Train_X, Train_Y)))
print("Parameters C=1000 + gamma=1 : {:.3f}".format(svc.score(Test_X, Test_Y)))

# C=10000
svc = SVC(C=10000,kernel='rbf',gamma=1.)
svc.fit(Train_X, Train_Y)

print("Parameters C=10000 + gamma=1 : {:.3f}".format(svc.score(Test_X, Test_Y)))

# SVM with kernal 3 
svc = SVC(C=1000,kernel='rbf',gamma=0.1)
svc.fit(Train_X, Train_Y)

print("Parameters C=1000 + gamma=0.1 : {:.3f}".format(svc.score(Test_X, Test_Y)))

# SVM with kernal 3 
svc = SVC(C=1000,kernel='rbf',gamma=0.01)
svc.fit(Train_X, Train_Y)

print("Parameters C=1000 + gamma=0.01 : {:.3f}".format(svc.score(Test_X, Test_Y)))


Parameters C=1000 + gamma=1 : 0.487
Parameters C=10000 + gamma=1 : 0.487
Parameters C=1000 + gamma=0.1 : 0.463
Parameters C=1000 + gamma=0.01 : 0.444


# 10 Neural Network

In [17]:
# MLP 1 
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=10)
mlp.fit(Train_X, Train_Y)

print("MLP 1 training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP 1 test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# MLP 2 
mlp = MLPClassifier(random_state=20)
mlp.fit(Train_X, Train_Y)

print("MLP 2 training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP 2 test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# MLP 3
mlp = MLPClassifier(random_state=30)
mlp.fit(Train_X, Train_Y)

print("MLP 3 training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP 3 test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# MLP 4
mlp = MLPClassifier(random_state=40)
mlp.fit(Train_X, Train_Y)

print("MLP 4 training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP 4 test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# MLP 5 
mlp = MLPClassifier(random_state=50)
mlp.fit(Train_X, Train_Y)

print("MLP 5 training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP 5 test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))




MLP 1 training set: 0.62
MLP 1 test set: 0.57
MLP 2 training set: 0.59
MLP 2 test set: 0.52
MLP 3 training set: 0.60
MLP 3 test set: 0.58




MLP 4 training set: 0.61
MLP 4 test set: 0.53
MLP 5 training set: 0.60
MLP 5 test set: 0.56


# 11 Width & Depth

In [18]:
# 1
mlp = MLPClassifier(max_iter=1000,hidden_layer_sizes=[20])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))

# 2
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=[40])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# 3
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=[60])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# 4
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=[20, 20])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# 5
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=[20, 20, 20])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))


# 6
mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=[20, 20, 20, 20])
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))



MLP training set: 0.59
MLP test set: 0.57
MLP training set: 0.58
MLP test set: 0.56
MLP training set: 0.61
MLP test set: 0.58
MLP training set: 0.61
MLP test set: 0.59
MLP training set: 0.61
MLP test set: 0.57
MLP training set: 0.66
MLP test set: 0.58


# 12 Overfitting

In [19]:
# 1
mlp = MLPClassifier(solver='lbfgs', activation='tanh',
                    random_state=10, hidden_layer_sizes=[40, 40, 40, 40], 
                    max_iter=10000, alpha=0.001)
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))

# 2
mlp = MLPClassifier(solver='lbfgs', activation='tanh',
                    random_state=20, hidden_layer_sizes=[40, 40, 40, 40], 
                    max_iter=10000, alpha=0.001)
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))

# 3
mlp = MLPClassifier(solver='lbfgs', activation='tanh',
                    random_state=30, hidden_layer_sizes=[40, 40, 40, 40], 
                    max_iter=10000, alpha=0.001)
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))

# 4
mlp = MLPClassifier(solver='lbfgs', activation='tanh',
                    random_state=40, hidden_layer_sizes=[40, 40, 40, 40], 
                    max_iter=10000, alpha=0.001)
mlp.fit(Train_X, Train_Y)

print("MLP training set: {:.2f}".format(mlp.score(Train_X, Train_Y)))
print("MLP test set: {:.2f}".format(mlp.score(Test_X, Test_Y)))

MLP training set: 0.97
MLP test set: 0.50
MLP training set: 0.90
MLP test set: 0.49
MLP training set: 0.82
MLP test set: 0.57
MLP training set: 0.97
MLP test set: 0.50
