In [3]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


In [6]:
# Load the wine table from disk: ';'-separated values, first row skipped
# (presumably the column header), always returned as a 2-D float array.
data_np = np.loadtxt(
    "winequality-white.csv",
    delimiter=";",
    skiprows=1,
    dtype=float,
    ndmin=2,
)

# Sanity check: overall dimensions, then the first three rows.
print(data_np.shape)
print(data_np[:3])

(4898, 12)
[[7.000e+00 2.700e-01 3.600e-01 2.070e+01 4.500e-02 4.500e+01 1.700e+02
  1.001e+00 3.000e+00 4.500e-01 8.800e+00 6.000e+00]
 [6.300e+00 3.000e-01 3.400e-01 1.600e+00 4.900e-02 1.400e+01 1.320e+02
  9.940e-01 3.300e+00 4.900e-01 9.500e+00 6.000e+00]
 [8.100e+00 2.800e-01 4.000e-01 6.900e+00 5.000e-02 3.000e+01 9.700e+01
  9.951e-01 3.260e+00 4.400e-01 1.010e+01 6.000e+00]]


In [7]:
# Separate features from the target and hold out 20% of the rows for testing.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X, y = data_np[:, :-1], data_np[:, -1]
print(X.shape)
print(y.shape)

# random_state is fixed so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)



(4898, 11)
(4898,)


In [8]:
# Baseline: linear SVM (hinge loss) on standardized features.
#
# Per the scikit-learn docs, LinearSVC shuffles the data with a pseudo random
# generator when it solves the dual problem, so without a seed the fitted model
# (and therefore the accuracy) can differ between runs. random_state is pinned
# to the same seed value used for train_test_split to keep the score repeatable.

svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=100, loss="hinge", random_state=11)),
])

svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

print(f"accuracy_score: {accuracy_score(y_test, y_pred):.2}")

# Result noted by the author from an earlier, unseeded run
# (the exact figure may shift slightly now that the seed is fixed):
# ("linear_svc", LinearSVC(C=100, loss="hinge"))
# >>> accuracy_score: 0.47
# >>> 0.6s

accuracy_score: 0.47




In [9]:
# Linear SVM on explicit degree-3 polynomial features.
from sklearn.preprocessing import PolynomialFeatures

# Pipeline order: expand features -> standardize -> linear SVM with hinge loss.
# LinearSVC's dual solver shuffles the data pseudo-randomly (see scikit-learn
# docs), so random_state is pinned to make the reported accuracy repeatable.
svm_poly_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_linear_clf", LinearSVC(C=10, loss="hinge", random_state=11)),
])

svm_poly_clf.fit(X_train, y_train)
y_pred = svm_poly_clf.predict(X_test)

print(f"accuracy_score: {accuracy_score(y_test, y_pred):.2}")

# Result noted by the author from an earlier, unseeded run:
# ("poly_features", PolynomialFeatures(degree=3)),
# ("svm_linear_clf", LinearSVC(C=10, loss="hinge"))
# >>> accuracy_score: 0.5
# >>> 8.3s
# NOTE(review): the saved cell output shows 0.44, not 0.5 -- this run-to-run
# drift is what the fixed random_state above is meant to eliminate. Re-run and
# update this note with the seeded score.

accuracy_score: 0.44




In [61]:
# Degree-6 polynomial *kernel* SVC on standardized features, instead of
# materializing the polynomial features explicitly.

svm_poly_kernel_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("kernel_SVC", SVC(C=10, kernel="poly", degree=6, coef0=1)),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred = svm_poly_kernel_clf.fit(X_train, y_train).predict(X_test)

print(f"accuracy_score: {accuracy_score(y_true=y_test, y_pred=y_pred):.2}")

# Result noted by the author:
#   ("kernel_SVC", SVC(kernel="poly", degree=6, coef0=1, C=10))
#   >>> accuracy_score: 0.64
#   >>> 2.8s

accuracy_score: 0.64


In [33]:
# Load the train and test sets for use in "Wine_StackedVoting.ipynb".
# NOTE: this rebinds X_train / y_train / X_test / y_test, which the
# train_test_split cell earlier in this notebook also assigns.

def _read_features_and_target(csv_path):
    """Read a ','-separated numeric table and return ``(features, target)``.

    The first row of the file is skipped. The last column is returned as the
    1-D target vector; all preceding columns form the 2-D feature matrix.
    """
    table = np.loadtxt(csv_path, dtype=float, delimiter=",", skiprows=1, ndmin=2)
    return table[:, :-1], table[:, -1]


X_train, y_train = _read_features_and_target("train.csv")
X_test, y_test = _read_features_and_target("test.csv")

In [34]:
# Gaussian RBF kernel SVC (gamma=1, C=50) on standardized features.

svm_rbf_kernel_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("rbf_kernel_SVC", SVC(C=50, kernel="rbf", gamma=1)),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred = svm_rbf_kernel_clf.fit(X_train, y_train).predict(X_test)

print(f"accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test):.2}")

# Result noted by the author:
#   ("rbf_kernel_SVC", SVC(kernel="rbf", gamma=1, C=50))
#   accuracy_score: 0.68
#   1.2s
# NOTE(review): the saved cell output and the pickle filename used later in
# this notebook both say 0.67 -- presumably this note was recorded on a
# different train/test split; confirm and update.

accuracy_score: 0.67


In [38]:
# Persist the fitted RBF pipeline to disk, then reload it to verify the
# round trip works.
import pickle

filename = "svm_rbf_kernel_0.67_G1_C50.svm"

# Use context managers so each file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open(filename, "wb") as model_file:
    pickle.dump(svm_rbf_kernel_clf, model_file)

# Test dumping by reloading.
# CAUTION: pickle.load can execute arbitrary code. It is acceptable here only
# because the file was written by this notebook a moment ago; never unpickle
# model files from untrusted sources.
with open(filename, "rb") as model_file:
    reloaded_svm = pickle.load(model_file)

# Predict on the first test row, reshaped to the (1, n_features) layout that
# predict() expects for a single sample.
print(f"reloaded_svm prediction: {reloaded_svm.predict(X_test[0].reshape(1,-1))}")


reloaded_svm prediction: [8.]
