### 1. Pipelines in scikit-learn

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline

pipe_long = Pipeline([("scaler", MinMaxScaler), ("svm", SVC())])

pipe_short = make_pipeline(MinMaxScaler(), SVC())

In [2]:
pipe_short.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC())]

In [3]:
from sklearn.preprocessing import Normalizer

pipe = make_pipeline(MinMaxScaler(), Normalizer(), MinMaxScaler())
pipe.steps

[('minmaxscaler-1', MinMaxScaler()),
 ('normalizer', Normalizer()),
 ('minmaxscaler-2', MinMaxScaler())]

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=42)

pipe = make_pipeline(MinMaxScaler(), SVC())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.9790209790209791

In [5]:
from sklearn.model_selection import GridSearchCV
param_grid = {'svc__C': [0.01, 0.1, 1, 10, 100],
              'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test, y_test))
print("Best Parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9741450068399453
Test set score: 0.986013986013986
Best Parameters: {'svc__C': 10, 'svc__gamma': 0.1}


### 2. The KFold Class

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
iris = load_iris()
svm = SVC()
cross_val_score(svm, iris.data, iris.target)

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [7]:
from sklearn.model_selection import KFold
kf = KFold(shuffle=True, random_state=42)
kf

KFold(n_splits=5, random_state=42, shuffle=True)

In [8]:
kf.get_n_splits()

5

In [10]:
for rest_index, fold_index in kf.split(iris.data):
    print(rest_index)
    print(fold_index)
    X_rest, X_fold = iris.data[rest_index], iris.data[fold_index]
    y_rest, y_fold = iris.target[rest_index], iris.target[fold_index]
    svm.fit(X_rest, y_rest)
    print(svm.score(X_fold, y_fold))

[  0   1   2   3   4   5   6   7   8  10  11  13  14  15  16  17  20  21
  22  23  24  25  27  28  32  33  34  35  37  38  39  40  41  42  43  44
  46  47  48  49  50  51  52  53  54  57  58  59  60  61  62  63  65  66
  67  70  71  72  74  75  77  79  80  81  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 105 106 107 109 111
 112 113 114 115 116 117 119 120 121 122 123 124 125 126 129 130 133 134
 135 136 137 138 139 140 142 144 146 147 148 149]
[  9  12  18  19  26  29  30  31  36  45  55  56  64  68  69  73  76  78
  82 104 108 110 118 127 128 131 132 141 143 145]
1.0
[  1   2   3   5   6   7   8   9  12  13  14  17  18  19  20  21  23  24
  25  26  29  30  31  33  34  35  36  37  38  39  41  43  45  46  47  48
  49  50  52  53  54  55  56  57  58  59  61  62  63  64  68  69  70  71
  72  73  74  76  77  78  79  80  82  83  84  87  88  89  90  91  92  93
  94  95  97  98  99 100 101 102 103 104 106 107 108 110 111 112 113 114
 115 116 117 118 119

### 3. Multiclass classficiation using SVC

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=6, random_state=42)
print(set(iris.target))
svm = SVC()
svm.fit(X_train, y_train)
svm.decision_function(X_test[0:2])

{np.int64(0), np.int64(1), np.int64(2)}


array([[-0.2158595 ,  2.18568275,  1.1223534 ],
       [ 2.20412187,  0.93986322, -0.19206811]])

In [13]:
print(y_test[0:2])

[1 0]


### 4. Probabilistic classification using a neural net

### 5. Validity of a conformal predictor

In [14]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)

In [15]:
import math
import numpy as np
def dist(x1, x2):
    return np.linalg.norm(x1-x2)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
dist_own = math.inf * np.ones(n_train)
dist_other = math.inf * np.ones(n_train)

for i in range(n_train-1):
    for j in range(i+1, n_train):
        current_dist = dist(X_train[i], X_train[j])
        if y_train[i] == y_train[j]:
            if (current_dist < dist_own[i]):
                dist_own[i] = current_dist
            if (current_dist < dist_own[j]):
                dist_own[j] = current_dist
        else:
            if (current_dist < dist_other[i]):
                dist_other[i] = current_dist
            if (current_dist < dist_other[j]):
                dist_other[j] = current_dist

In [None]:
score = np.zeros(n_train+1)
p = np.zeros((n_test, 3))
for j in range(n_test):
    for l in range(3):
        aug_dist_own = np.append(dist_own, math.inf)
        aug_dist_other = np.append(dist_other, math.inf)
        for i in range(n_train):
            current_dist = dist(X_train[i], X_test[j])
            if y_train[i] == l:
                if (current_dist < aug_dist_own[i]):
                    aug_dist_own[i] = current_dist
                if (current_dist < aug_dist_own[j]):
                    aug_dist_own[j] = current_dist
            else:
                if (current_dist < aug_dist_other[i]):
                    aug_dist_other[i] = current_dist
                if (current_dist < aug_dist_other[j]):
                    aug_dist_other[j] = current_dist
        
        for i in range(n_train + 1):
            if aug_dist_own[i] == 0:
                score[i] = math.inf
                if(aug_dist_other[i] == 0):
                    score[i] = 0
            else:
                score[i] = aug_dist_other[i] / aug_dist_own[i]
        p[j, l] = np.mean(score<=score[n_train])

0.22360679774997896
0.14142135623730925
2.4124676163629633
0.3464101615137755
0.5385164807134503
0.4242640687119281
0.6403124237432849
0.2828427124746193
2.3302360395462087
0.3464101615137753
2.1424285285628555
0.10000000000000053
1.2247448713915887
0.1414213562373093
0.33166247903553975
0.3741657386773937
0.6
0.1414213562373093
0.648074069840786
0.24494897427831722
1.568438714135812
0.6708203932499366
0.6557438524301997
0.14142135623730964
1.878829422805593
0.26457513110645964
0.5830951894845302
0.1999999999999993
2.0904544960366875
0.09999999999999998
2.004993765576342
0.26457513110645964
0.7071067811865478
0.24494897427831766
2.046948949045872
0.14142135623730995
2.130727575266252
0.14142135623730917
1.841195263952197
0.3605551275463988
1.3228756555322954
0.3605551275463989
0.36055512754639907
0.17320508075688762
1.7088007490635062
0.4472135954999579
1.740689518552921
0.17320508075688765
1.6401219466856725
0.264575131106459
0.7211102550927979
0.17320508075688815
1.9672315572906
0.09

  score[i] = aug_dist_other[i] / aug_dist_own[i]
