In [14]:
import numpy as np
import pandas as pd
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

In [15]:
data = pd.read_csv('VLagun_Phys_Years3.csv')
data

Unnamed: 0,PSU,O2,temp.,SS,DOC,TPOC,Windspeedinsitu,Depth,Years
0,3.757624,9.46,18.3,52.00,7.5000,8.5650,3.5,3.3,0
1,3.504707,9.89,19.1,50.00,7.8600,8.5200,0.0,3.6,0
2,3.757624,9.66,18.1,59.00,8.1720,8.4915,1.0,3.4,0
3,3.107266,10.36,19.5,46.00,7.8480,8.8320,0.0,2.9,0
4,2.619498,11.56,19.0,42.00,7.5360,9.2400,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...
115,4.500000,9.93,16.3,117.50,7.0180,13.0645,8.5,3.2,1
116,4.600000,10.02,16.2,115.00,7.0905,11.4115,10.2,3.3,1
117,4.500000,9.93,16.3,113.75,7.0615,11.7595,9.5,3.4,1
118,4.200000,10.30,16.0,102.50,7.1050,12.4410,10.5,3.3,1


In [16]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X.shape, y.shape

((120, 8), (120,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
features, targets = X, y

In [19]:
train_features, test_features, train_targets, test_targets = train_test_split(features, targets, train_size=0.7, test_size=0.3, random_state=23, stratify=targets)

### Automatyczne porównanie skuteczności metod Cross Validation: k-fold, stratified i Monte Carlo (shuffle) w Logistic Regression, K-Nearest Neighbors, Support Vector Machines i Decision Tree models

K-NN acc. Bez CV

In [20]:
classifier = KNeighborsClassifier()
classifier.fit(train_features, train_targets)
predicitons_targets = classifier.predict(test_features)
print('Accuracy:', end=' ')
print(np.sum(predicitons_targets == test_targets) / float(len(test_targets)))

Accuracy: 0.8333333333333334


K-NN Class. + CV

In [21]:
scores = cross_val_score(classifier, features, targets, cv=3)
print('Cross validation scores', scores)
print('Mean score:', np.mean(scores))

Cross validation scores [0.75  0.725 0.6  ]
Mean score: 0.6916666666666668


KFold (3)

In [22]:
cv = KFold(n_splits=3, shuffle=True)
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores', scores)
print('Mean score:', np.mean(scores))

Cross validation scores [0.875 0.825 0.825]
Mean score: 0.8416666666666667


Stratified (3)

In [23]:
cv = StratifiedKFold(n_splits=3, shuffle=True)
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores', scores)
print('Mean score:', np.mean(scores))

Cross validation scores [0.9  0.8  0.85]
Mean score: 0.8500000000000001


Shuffle MC (3)

In [24]:
cv = ShuffleSplit(n_splits=3, test_size=0.3)
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores', scores)
print('Mean score:', np.mean(scores))

Cross validation scores [0.72222222 0.86111111 0.83333333]
Mean score: 0.8055555555555557


In [25]:
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('KNeighbotsClassifier', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))

Dla modelu K-NN (cv=3) najwyższy acc. score uzyskał Stratified.

KFold (5)

In [26]:
cv = KFold(n_splits=5, shuffle=True, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model:{0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var()))

Model:Logistic Regression, Score: mean=0.95833, var=0.00069
Model:KNeighbotsClassifier, Score: mean=0.85000, var=0.00250
Model:SVC, Score: mean=0.75833, var=0.00375
Model:DecisionTreeClassifier, Score: mean=0.87500, var=0.00625


Stratified (5)

In [27]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model:{0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var())) 

Model:Logistic Regression, Score: mean=0.95000, var=0.00167
Model:KNeighbotsClassifier, Score: mean=0.81667, var=0.00111
Model:SVC, Score: mean=0.76667, var=0.00458
Model:DecisionTreeClassifier, Score: mean=0.88333, var=0.00097


Shuffle MC (5)

In [28]:
cv = ShuffleSplit(n_splits=5, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model:{0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var())) 

Model:Logistic Regression, Score: mean=0.95000, var=0.00444
Model:KNeighbotsClassifier, Score: mean=0.86667, var=0.00444
Model:SVC, Score: mean=0.75000, var=0.00833
Model:DecisionTreeClassifier, Score: mean=0.90000, var=0.00389


Dla modeli (cv=5) największe acc. score uzyskały kolejno:
LR - KFold
K-NN - Shuffle MC
SVM - Stratified
DT - Shuffle MC

### Cross validation of SVM kernels porównanie

Linear

In [29]:
clf = SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9444444444444444

In [30]:
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.83333333, 0.91666667, 0.91666667, 1.        , 0.95833333])

In [31]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.9250 accuracy with a standard deviation of 0.0553


In [32]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
scores

array([0.83216783, 0.91608392, 0.91666667, 1.        , 0.95826087])

In [33]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.9246 accuracy with a standard deviation of 0.0557


In [34]:
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.94444444, 0.94444444, 0.91666667, 0.97222222, 0.97222222])

In [35]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.9500 accuracy with a standard deviation of 0.0208


In [37]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.91666667, 0.91666667, 0.95833333, 0.91666667, 0.95833333])

In [38]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.9333 accuracy with a standard deviation of 0.0204


Poly

In [39]:
clf = SVC(kernel='poly', C=1, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

0.75

In [40]:
scores = cross_val_score(clf, X, y, cv=5)
scores

array([1.        , 0.79166667, 0.79166667, 0.66666667, 0.625     ])

In [41]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7750 accuracy with a standard deviation of 0.1307


In [42]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
scores

array([1.        , 0.77229602, 0.78221416, 0.59663866, 0.56363636])

In [43]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7430 accuracy with a standard deviation of 0.1562


In [44]:
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.75      , 0.75      , 0.63888889, 0.77777778, 0.83333333])

In [45]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7500 accuracy with a standard deviation of 0.0633


In [46]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.79166667, 0.70833333, 0.91666667, 0.79166667, 0.83333333])

In [47]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.8083 accuracy with a standard deviation of 0.0677


Radial

In [51]:
clf = SVC(kernel='rbf', C=1, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

0.6944444444444444

In [52]:
scores = cross_val_score(clf, X, y, cv=5)
scores

array([1.        , 0.79166667, 0.75      , 0.625     , 0.625     ])

In [53]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7583 accuracy with a standard deviation of 0.1379


In [54]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
scores

array([1.        , 0.77229602, 0.74285714, 0.60798548, 0.56363636])

In [55]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7374 accuracy with a standard deviation of 0.1531


In [56]:
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.69444444, 0.77777778, 0.58333333, 0.77777778, 0.77777778])

In [57]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7222 accuracy with a standard deviation of 0.0766


In [58]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.75      , 0.625     , 0.875     , 0.79166667, 0.79166667])

In [59]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.7667 accuracy with a standard deviation of 0.0816


Sigmoid

In [60]:
clf = SVC(kernel='sigmoid', C=1, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

0.3055555555555556

In [61]:
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.        , 0.20833333, 0.29166667, 0.375     , 0.5       ])

In [62]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.2750 accuracy with a standard deviation of 0.1679


In [63]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
scores

array([0.        , 0.17241379, 0.29043478, 0.36507937, 0.33333333])

In [64]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.2323 accuracy with a standard deviation of 0.1332


In [65]:
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.30555556, 0.33333333, 0.44444444, 0.30555556, 0.36111111])

In [66]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.3500 accuracy with a standard deviation of 0.0515


In [67]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)
scores

array([0.25      , 0.375     , 0.125     , 0.33333333, 0.33333333])

In [68]:
print('%0.4f accuracy with a standard deviation of %0.4f' % (scores.mean(), scores.std()))

0.2833 accuracy with a standard deviation of 0.0890


Typy CV z najwyższym Acc. Score dla poszczególnych kernels:
linear - ShuffleSplit
poly - StratifiedKFold
radial - StratifiedKFold
sigmoid - ShuffleSplit