# In [1]:
#------------------------------------<imports and functions definition>-------------------------------------------------#

import libs
import sklearn as sk
from sklearn import neighbors
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def make_meshgrid(x, y, h=.02):
    """Return coordinate matrices for a regular grid around the points (x, y).

    The grid extends one unit beyond the minimum and maximum of each input
    and is sampled every ``h`` units along both axes.

    Args:
        x: Array of values for the horizontal axis (must provide ``min``/``max``).
        y: Array of values for the vertical axis (must provide ``min``/``max``).
        h: Step between neighbouring grid points.

    Returns:
        A pair ``(xx, yy)`` of coordinate matrices as produced by ``np.meshgrid``.
    """
    horizontal_axis = np.arange(x.min() - 1, x.max() + 1, h)
    vertical_axis = np.arange(y.min() - 1, y.max() + 1, h)
    xx, yy = np.meshgrid(horizontal_axis, vertical_axis)
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    """Draw the filled decision regions of a classifier on the given axes.

    Every grid point of (xx, yy) is passed to ``clf.predict`` as a
    two-column row; the predictions are reshaped back to the grid and
    handed to ``ax.contourf``.

    Args:
        ax: Object exposing ``contourf`` (a matplotlib axes in this file).
        clf: Object exposing ``predict`` on an (n_points, 2) array.
        xx: Grid of horizontal coordinates.
        yy: Grid of vertical coordinates, same shape as ``xx``.
        **params: Extra keyword arguments forwarded to ``ax.contourf``.

    Returns:
        Whatever ``ax.contourf`` returns.
    """
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predicted = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predicted, **params)



def do_svm(kernel_type):
    """Sweep the SVC parameter C, plot each model, and report the best one.

    For every candidate C an ``svm.SVC`` with the given kernel and
    ``gamma="auto"`` is fitted on the module-level ``X_train``/``y_train``,
    its decision regions are drawn over the first two training columns, and
    it is scored on ``X_validate``/``y_validate``. A bar chart of the
    validation accuracies is then shown, and the classifier with the highest
    validation accuracy (the first one wins ties) is scored on
    ``X_test``/``y_test``.

    Args:
        kernel_type: Kernel name forwarded to ``svm.SVC``.

    Returns:
        None. Results are shown as plots and printed.
    """
    C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    svm_scores = []
    svm_max_score = 0
    best_C = 0
    best_svm_clf = None

    # The plotting grid depends only on the training data, so build it once
    # instead of once per candidate C.
    X0, X1 = X_train[:, 0], X_train[:, 1]
    xx, yy = make_meshgrid(X0, X1)

    for c in C:
        model = svm.SVC(C=c, kernel=kernel_type, gamma="auto")
        svm_clf = model.fit(X_train, y_train)

        fig, ax = plt.subplots()
        # title for the plots
        title = ('SVC data and boundaries for C= %2f ' % (c))
        plot_contours(ax, svm_clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
        ax.scatter(X0, X1, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
        ax.set_ylabel('y label here')
        ax.set_xlabel('x label here')
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(title)
        plt.show()

        score = svm_clf.score(X_validate, y_validate)
        svm_scores.append(score)
        # Accept the first model unconditionally so that a best classifier
        # always exists, even when every validation accuracy is 0.
        if best_svm_clf is None or score > svm_max_score:
            best_C = c
            svm_max_score = score
            best_svm_clf = svm_clf

    # Plot a graph showing how the accuracy on the validation set varies when changing C.
    # Labels are derived from C so they cannot drift out of sync with the sweep.
    svm_objects = tuple('C=%s' % c for c in C)
    y_pos = np.arange(len(svm_objects))
    plt.bar(y_pos, svm_scores, align='center', alpha=0.5)
    plt.xticks(y_pos, svm_objects)
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs C')
    plt.show()

    svm_test_accuracy = best_svm_clf.score(X_test, y_test)
    print("Best C: %2f, max clf accuracy: %2f, accuracy on test: %2f" % (best_C, svm_max_score, svm_test_accuracy))
    
def do_svm_params(kernel_type, c, gamma):
    """Fit one SVC with explicit hyper-parameters, plot it, and print its test accuracy.

    The classifier is fitted on the module-level ``X_train``/``y_train``, its
    decision regions are drawn over the first two training columns, and its
    accuracy on ``X_test``/``y_test`` is printed.

    Args:
        kernel_type: Kernel name forwarded to ``svm.SVC``.
        c: Value for the SVC ``C`` parameter.
        gamma: Value for the SVC ``gamma`` parameter.

    Returns:
        None. The result is shown as a plot and printed.
    """
    classifier = svm.SVC(kernel=kernel_type, C=c, gamma=gamma)
    classifier.fit(X_train, y_train)

    figure, axes = plt.subplots()

    # Grid spanning the two plotted training columns.
    first_col = X_train[:, 0]
    second_col = X_train[:, 1]
    grid_x, grid_y = make_meshgrid(first_col, second_col)

    # Decision regions first, training points on top.
    plot_contours(axes, classifier, grid_x, grid_y, cmap=plt.cm.coolwarm, alpha=0.8)
    axes.scatter(first_col, second_col, c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors='k')

    axes.set_title('SVC data and boundaries for C= %2f ' % (c))
    axes.set_xlabel('x label here')
    axes.set_ylabel('y label here')
    axes.set_xticks(())
    axes.set_yticks(())
    plt.show()

    accuracy = classifier.score(X_test, y_test)
    print("Accuracy for K-fold: %2f" % (accuracy))


#--------------------------------------------<DB ANALYSIS AND SAMPLES>--------------------------------------------------#

# Load the wine dataset: X holds the attribute values, y the class labels.
X, y = load_wine(return_X_y=True)

# Keep only the first two attributes so that X is a 2D dataset.
X = X[:, :2]

#---------------------------------------------------<K-NEIGHBORS>-------------------------------------------------------#

# Candidate neighbour counts for the K-neighbors classifier.
kn_values = [1, 3, 5, 7]

# Split sizes: 50% train, 20% validation, 30% test, rounded to whole samples.
n_samples = len(X)
train_size = round(n_samples * 0.5)
validate_size = round(n_samples * 0.2)
test_size = round(n_samples * 0.3)

# Carve off the test set first; what remains is train + validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=1)

# The scaler is fitted before the validation split, i.e. on the training and
# validation samples together.
scaler = StandardScaler().fit(X_train)

X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=validate_size, random_state=1)

# Apply the same fitted scaling to all three splits.
X_train, X_validate, X_test = (
    scaler.transform(split) for split in (X_train, X_validate, X_test))

# Colour maps: light shades for the decision regions, bold ones for the points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# analysis: fit one K-neighbors classifier per value in kn_values, plot its
# decision regions, and keep the one with the best validation accuracy.
scores = []  # validation accuracy per k, in kn_values order
max_score = 0
best_k = 0
best_clf = None

# The plotting grid depends only on the training data, so build it once
# instead of once per k.
X0, X1 = X_train[:, 0], X_train[:, 1]
xx, yy = make_meshgrid(X0, X1)

for kn in kn_values:
    neigh = neighbors.KNeighborsClassifier(n_neighbors=kn)
    neigh.fit(X_train, y_train)
    Z = neigh.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X0, X1, c=y_train, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Report the actual number of training rows rather than a nominal size.
    plt.title("3-Class classification (k = %i, training samples = %i)" % (kn, len(X_train)))
    plt.show()
    score = neigh.score(X_validate, y_validate)
    print("score: %2f, k: %2d" % (score, kn))
    scores.append(score)
    # ">=" means that on a tie the later (larger) k is kept.
    if score >= max_score:
        best_k = kn
        max_score = score
        best_clf = neigh

# Plot a graph showing how the accuracy on the validation set varies when changing K.
# Labels are derived from kn_values so they cannot drift out of sync with it.
objects = tuple('k=%d' % kn for kn in kn_values)
y_pos = np.arange(len(objects))
plt.bar(y_pos, scores, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy')
plt.title('Accuracy vs Kn')

plt.show()
test_accuracy = best_clf.score(X_test, y_test)
print("Best k: %2d, max clf accuracy: %2f, accuracy on test: %2f" % (best_k, max_score, test_accuracy))

#-------------------------------------------------------<SVM>-----------------------------------------------------------#

# Run the C sweep with a linear kernel first, then with an RBF kernel.
for kernel_name in ("linear", "rbf"):
    do_svm(kernel_name)

#-------------------------------------------------- <MANUAL-TUNING>-----------------------------------------------------#

# Manual grid search over (C, gamma) for an RBF SVC, scored on the validation split.
gamma = [1e-3, 1e-4]
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
max_score = 0
max_svm = None

for c in C:
    for g in gamma:
        model = svm.SVC(C=c, kernel="rbf", gamma=g)
        svm_clf = model.fit(X_train, y_train)
        score = svm_clf.score(X_validate, y_validate)
        # Accept the first model unconditionally so max_svm is never None,
        # even when every validation accuracy is 0; later models must be
        # strictly better, so the first one wins ties.
        if max_svm is None or score > max_score:
            max_score = score
            max_svm = svm_clf

# Refit with the winning parameters, plot the boundary, and print the test accuracy.
do_svm_params("rbf", max_svm.C, max_svm.gamma)

#------------------------------------------------------<K-FOLD>---------------------------------------------------------#

# Hyper-parameter grids searched with 5-fold cross-validation: an RBF kernel
# over (gamma, C) and a linear kernel over C alone.
c_grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': list(c_grid)},
    {'kernel': ['linear'], 'C': list(c_grid)},
]

# Each metric is used, macro-averaged, as the model-selection criterion in turn.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # One line per parameter combination: mean score and twice its standard deviation.
    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate the search's predictions on the held-out test split.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Merge the training and validation splits so the final model is fitted on
# both. np.concatenate returns a new array and leaves its inputs untouched,
# so the result has to be rebound; do_svm_params reads the module-level
# X_train / y_train.
X_train = np.concatenate((X_train, X_validate))
y_train = np.concatenate((y_train, y_validate))
# NOTE(review): C=100, gamma=0.001 are hard-coded, presumably copied from the
# grid-search output above; confirm they still match clf.best_params_.
do_svm_params("rbf", 100, 0.001)


# TypeError: inner() got multiple values for argument 'ax'