In [49]:
using HDF5
# Python Code source: Gaël Varoquaux
#                     Andreas Müller
# Julia adaptation: Cédric St-Jean
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

using ScikitLearn
using PyCall
using PyPlot
using ScikitLearn.CrossValidation: train_test_split
@pyimport matplotlib.colors as mplc

@sk_import decomposition: PCA
@sk_import preprocessing: (StandardScaler,MinMaxScaler)
@sk_import datasets: (make_moons, make_circles, make_classification)
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC
@sk_import tree: DecisionTreeClassifier
@sk_import ensemble: (RandomForestClassifier, AdaBoostClassifier)
@sk_import naive_bayes: GaussianNB
@sk_import discriminant_analysis: (LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis)
@sk_import decomposition: KernelPCA
@sk_import preprocessing: (LabelEncoder, OneHotEncoder)
@sk_import linear_model: LogisticRegression
@sk_import metrics: classification_report
@sk_import feature_selection: SelectKBest
@sk_import metrics: roc_curve

using ScikitLearn.Utils: meshgrid
using ScikitLearn, PyPlot
using ScikitLearn.GridSearch: GridSearchCV
using ScikitLearn.Pipelines: Pipeline, named_steps, FeatureUnion



In [2]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Linear Discriminant Analysis",
    "Quadratic Discriminant Analysis", "LogisticRegression"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.5),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=40, max_features=1),
    AdaBoostClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(C=100)]

# dimred = KernelPCA(kernel="poly")
dimred = PCA(n_components = 0.9999)
# lenc = OneHotEncoder()
X_ = h5read("out/isc|res_x.h5", "data")'
y_ = h5read("out/isc|res_y.h5", "data")
# X_ = h5read("out/rnd|res_x.h5", "data")'
# y_ = h5read("out/rnd|res_y.h5", "data")
map!(x->isnan(x)?0:x,X_);

In [None]:
lenc = LabelEncoder()
y2 = fit_transform!(lenc, y_)
# y2 = map(x->x[1] == 'V',y_)

combined_features = FeatureUnion([("pca", PCA(n_components = 700)), ("univ_select", SelectKBest(k = 30))])
pip = Pipeline([("features", combined_features),("scale",MinMaxScaler()),("svc", SVC(C=4000))])
top = GridSearchCV(pip, Dict("svc__C"=>[4000]))

scor = 0.
    X_train, X_test, y_train, y_test = train_test_split(X_, y2, test_size=.2)
    fit!(top, X_train, y_train)
    scor = score(top, X_test, y_test)    

y_pred = predict(top, X_test)
print(classification_report(y_test, y_pred))
scor

([0.0,0.122905,1.0],[0.0,0.942424,1.0],[2,1,0])

In [None]:
4000
                precision    recall  f1-score   support

          0       0.77      0.69      0.73        90
          1       0.69      0.80      0.74        87
          2       0.72      0.67      0.70        85
          3       0.67      0.63      0.65        82
          4       0.73      0.65      0.69        84
          5       0.59      0.68      0.63        81

avg / total       0.69      0.69      0.69       509

3900

             precision    recall  f1-score   support

          0       0.65      0.71      0.68        72
          1       0.72      0.70      0.71       109
          2       0.58      0.76      0.66        58
          3       0.71      0.64      0.67        88
          4       0.81      0.73      0.77        85
          5       0.67      0.65      0.66        97

avg / total       0.70      0.69      0.69       509


In [4]:
y_pred = predict(ests[findmax(sc)[2]], X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.95      0.82      0.88       170
       True       0.91      0.98      0.94       339

avg / total       0.92      0.92      0.92       509


In [48]:
tr = SelectKBest(k = 2)
yb = map(x->x[1] == 'V',y_)
Xkb = fit_transform!(tr, X_, yb)
sc = scatter(Xkb[:,1],Xkb[:,2], c = Int.(yb))
legend(sc,("S","V"))

LoadError: PyError (:PyObject_Call) <class 'TypeError'>
TypeError("'PathCollection' object is not iterable",)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py", line 3798, in legend
    ret = gca().legend(*args, **kwargs)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py", line 564, in legend
    self.legend_ = mlegend.Legend(self, handles, labels, **kwargs)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/matplotlib/legend.py", line 283, in __init__
    handles = list(handles)


In [3]:
# Grid search
Cs = logspace(3,4.5, 4)
shrinks = linspace(.6,1,8)

pca = PCA()

# Maybe some original features where good, too?
selection = SelectKBest()

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

feat_dict = Dict(
                "features__pca__n_components"=>[700,1000,1300],
                  "features__univ_select__k"=>[30,40]
                )

nms = ["Regr","LDA","SVM"]

pipes = [Pipeline([("features", combined_features),("scale",StandardScaler()),("log", LogisticRegression())]),
         Pipeline([("features", combined_features),("scale",StandardScaler()),("lda", LinearDiscriminantAnalysis())]),
         Pipeline([("features", combined_features),("scale",StandardScaler()),("svc", SVC(C=1.0))])
        ]

ests = [GridSearchCV(pipes[1], merge(Dict("log__C"=> Cs),feat_dict)),
        GridSearchCV(pipes[2], feat_dict),
        GridSearchCV(pipes[3], merge(Dict("svc__C"=> Cs),feat_dict))
        ]

# preprocess dataset, split into training and test part

y = map(x-> x[1]=='V', y_);

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=.2)
sc = Float64[]

for (name,est) in zip(nms,ests)
    fit!(est,X_train, y_train)    
    scor = score(est, X_test, y_test)    
    print("$name \t $scor\n")
    display(est.best_estimator_)
    push!(sc, scor)
end

print("Winner is: $(findmax(sc)[2])")

y_pred = predict(ests[findmax(sc)[2]], X_test)
print(classification_report(y_test, y_pred))

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("features",ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700.0, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=30.0, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,nothing)),("scale",PyObject StandardScaler(copy=True, with_mean=True, with_std=True)),("log",PyObject LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],Any[ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700.0, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=30.0, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,n

  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  Q = random_state.normal(size=(A.shape[1], size))
  return V[:n_components, :].T, s[:n_co

Regr 	 0.8958742632612967


ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("features",ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=40, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,nothing)),("scale",PyObject StandardScaler(copy=True, with_mean=True, with_std=True)),("lda",PyObject LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001))],Any[ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=40, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,nothing),PyObject StandardScaler(copy=True, with_mean=True, with_std=True),PyObject LinearDiscriminantAnalysis(n_components=None, priors=N

LDA 	 0.9037328094302554


ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("features",ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700.0, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=30.0, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,nothing)),("scale",PyObject StandardScaler(copy=True, with_mean=True, with_std=True)),("svc",PyObject SVC(C=3162.2776601683795, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],Any[ScikitLearn.Skcore.FeatureUnion(Tuple{Any,Any}[("pca",PyObject PCA(copy=True, iterated_power='auto', n_components=700.0, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)),("univ_select",PyObject SelectKBest(k=30.0, score_func=<function f_classif at 0x7f2ba8ea9400>))],1,nothing),PyObject StandardScal

he future
  explained_variance_ratio_[:n_components]
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  self.noise_variance_ = explained_variance_[n_components:].mean()
  self.components_ = components_[:n_components]
  self.explained_variance_ = explained_variance_[:n_components]
  explained_variance_ratio_[:n_components]
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  self.noise_variance_ = explained_variance_[n_components:].mean()
  self.components_ = components_[:n_components]
  self.explained_variance_ = explained_variance_[:n_components]
  explained_variance_ratio_[:n_components]
  mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
  self.noise_variance_ = explained_variance_[n_components:].mean()
  self.components_ = components_[:n_components]
  self.explained_variance_ = explained_variance_[:n_components]
  explained_variance_ratio_[:n_components]
  self.explained_variance_ = explained_variance_[:n_components]
  explained_variance_ratio_[:n_component

SVM 	 0.9233791748526523
Winner is: 3

LoadError: PyError (:PyObject_Call) <class 'sklearn.exceptions.NotFittedError'>
NotFittedError("This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.",)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py", line 573, in predict
    y = super(BaseSVC, self).predict(X)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py", line 310, in predict
    X = self._validate_for_predict(X)
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/sklearn/svm/base.py", line 457, in _validate_for_predict
    check_is_fitted(self, 'support_')
  File "/home/ashedko/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py", line 690, in check_is_fitted
    raise _NotFittedError(msg % {'name': type(estimator).__name__})


In [None]:
LogisticRegression
Dict{Symbol,Any} with 3 entries:
  :features__pca__n_components => 700.0
  :log__C                      => 1000.0
  :features__univ_select__k    => 30.0
LDA
Dict{Symbol,Any} with 2 entries:
  :features__pca__n_components => 700
  :features__univ_select__k    => 40
SVM classifier
Dict{Symbol,Any} with 3 entries:
  :features__pca__n_components => 700.0
  :svc__C                      => 3162.28
  :features__univ_select__k    => 30.0

In [None]:
X = fit_transform!(dimred, X_)
# y = fit_transform!(lenc, y);

# preprocess dataset, split into training and test part
X = fit_transform!(StandardScaler(), X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
sc = Float64[]
# iterate over classifiers
for (name, clf) in zip(names, classifiers)
    fit!(clf, X_train, y_train)
    scor = score(clf, X_test, y_test)    
    print("$name \t& $scor\n")
    push!(sc, scor)
end

y_pred = predict(classifiers[findmax(sc)[2]], X_test)
print(classification_report(y_test, y_pred))

Nearest Neighbors 	& 0.46194225721784776

Linear SVM 	& 0.6115485564304461

RBF SVM 	& 0.5564304461942258

Decision Tree 	& 0.55249343832021

Random Forest 	& 0.55249343832021

AdaBoost 	& 0.3123359580052493

Naive Bayes 	& 0.3083989501312336

Linear Discriminant Analysis 	& 0.636482939632546

Quadratic Discriminant Analysis 	& 0.589238845144357

LogisticRegression 	& 0.6286089238845144

| |   precision  |  recall | f1-score | support|
|----------------------------------------------------|
 |   S1 |      0.71|      0.74|      0.72|       120|
  |     S2 |      0.68 |     0.71   |   0.69 |      143|
   |    V1  |     0.67  |    0.60    |  0.63  |     126|
    |    V2  |     0.53  |    0.52    |  0.52  |     121|
     |   V3   |    0.66   |   0.63  |    0.64   |    139|
      | V4     |  0.56     | 0.62    |  0.59     |  113|
|avg / total    |   0.64    |  0.64   |   0.64    |   762|

In [None]:
Nearest Neighbors 	& 0.43717277486910994
Linear SVM 	& 0.6164921465968587
RBF SVM 	& 0.569371727748691
Decision Tree 	& 0.518324607329843
Random Forest 	& 0.5719895287958116
AdaBoost 	& 0.3023560209424084
Naive Bayes 	& 0.32329842931937175
Linear Discriminant Analysis 	& 0.6230366492146597
Quadratic Discriminant Analysis 	& 0.587696335078534
LogisticRegression 	& 0.6191099476439791
             precision    recall  f1-score   support

         S1       0.60      0.59      0.59       119
         S2       0.71      0.75      0.73       148
         V1       0.66      0.60      0.63       132
         V2       0.55      0.50      0.52       120
         V3       0.59      0.68      0.64       133
         V4       0.61      0.58      0.59       112

avg / total       0.62      0.62      0.62       764

In [None]:
pairs = collect(zip(y_test, y_pred));
eqfst = map(x-> x[1][1]==x[2][1], pairs);
countnz(eqfst)/length(eqfst)

In [45]:
# Plots.histogram(squeeze(var(X_,1),1))
# top = ests[3].best_estimator_
top = pipes[3].set_params(Dict(
  :features__pca__n_components => 700.0
  :svc__C                      => 3162.28
  :features__univ_select__k    => 30.0))

LoadError: type Pipeline has no field set_params

In [27]:
(x->display(x.best_params_)).(ests)

Dict{Symbol,Any} with 3 entries:
  :features__pca__n_components => 700.0
  :log__C                      => 1000.0
  :features__univ_select__k    => 30.0

Dict{Symbol,Any} with 2 entries:
  :features__pca__n_components => 700
  :features__univ_select__k    => 40

Dict{Symbol,Any} with 3 entries:
  :features__pca__n_components => 700.0
  :svc__C                      => 3162.28
  :features__univ_select__k    => 30.0

3-element Array{Void,1}:
 nothing
 nothing
 nothing

In [None]:
Nearest Neighbors 	& 0.4671916010498688
Linear SVM 	& 0.6509186351706037
RBF SVM 	& 0.6023622047244095
Decision Tree 	& 0.594488188976378
Random Forest 	& 0.6023622047244095
AdaBoost 	& 0.31627296587926507
Naive Bayes 	& 0.31496062992125984
Linear Discriminant Analysis 	& 0.6601049868766404
Quadratic Discriminant Analysis 	& 0.5997375328083989
LogisticRegression 	& 0.652230971128609
             precision    recall  f1-score   support

         S1       0.68      0.75      0.71       128
         S2       0.75      0.65      0.69       141
         V1       0.67      0.66      0.67       141
         V2       0.54      0.57      0.55       115
         V3       0.76      0.66      0.71       131
         V4       0.56      0.67      0.61       106

avg / total       0.67      0.66      0.66       762