In [10]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import gc
from sklearn.preprocessing import label_binarize

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble   import RandomForestClassifier
from sklearn.svm import NuSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate,cross_val_predict
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer,classification_report
from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import  VotingClassifier

In [2]:
pd.set_option('display.max_columns',         500)
pd.set_option('display.width',       1000)
pd.set_option('display.max_colwidth',        -1)

In [3]:
df_noohe = pd.read_pickle(
  '..\\data\\no_OHE.pkl'
)
df_noohe.drop('soil', axis=1, inplace=True)

In [4]:
df_noohe.head()

Unnamed: 0,elevation,aspect,slope,hor_dist_to_roadway,hor_dist_to_ignition,cover_type,dist_to_hydr,hillshade,wilderness_area,geo_soil,climate_soil
0,2596,51,3,510,6279,5,258.0,352.943338,rawah,7,7
1,2590,56,2,390,6225,5,212.084889,355.56434,rawah,7,7
2,2804,139,9,3180,6121,2,275.769832,360.034721,rawah,4,7
3,2785,155,18,3090,6211,2,269.235956,358.011173,rawah,7,7
4,2595,45,2,391,6172,5,153.003268,354.479901,rawah,7,7


In [13]:
Classifiers = [
    ("knn", KNeighborsClassifier(n_neighbors=7)),
    ("logistic", LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')),
    ("random_forest", RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                             max_depth=12, min_samples_leaf=3, criterion='entropy', random_state=42)),
    ("svc",NuSVC(nu=0.01, kernel='poly', gamma=0.001,
           random_state=42, probability=True))
]

In [7]:
X = df_noohe.drop(['cover_type','wilderness_area'],axis=1)
Y = df_noohe['cover_type']

In [8]:
scoring = {
    'f1_micro': 'f1_micro',
    'precison_micro': 'precision_micro',
    'roc_auc_micro': make_scorer(roc_auc_score, average='micro', needs_proba=True)
}
y_bin = label_binarize(Y, classes=[0, 1, 2, 3, 4, 5, 6])
tpr_all= pd.DataFrame()
fpr_all= pd.DataFrame()
roc_auc_all = pd.DataFrame()
n_classes = 7
for clf_name,clf  in Classifiers:
    clf = make_pipeline(MinMaxScaler(), clf)
#     cv = ShuffleSplit(n_splits=5, test_size=0.4, random_state=42)
    cv = StratifiedKFold(n_splits=5, random_state=42)
    y_score = cross_val_predict(clf, X,
                                Y, cv=cv, verbose=2,
                                n_jobs=-1, method='predict_proba')
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    tpr_all[clf_name] = tpr
    fpr_all[clf_name] = fpr
    roc_auc_all[clf_name] = roc_auc
    print(clf_name)
    print(roc_auc)
#     print(classification_report(Y, scores))
#     print('NAME ', clf_name , ' Scores ',  scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.3min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.5min finished


knn
{0: nan, 1: 0.5143248106067271, 2: 0.4480225404073755, 3: 0.5444143071335364, 4: 0.45984626425600716, 5: 0.4768894300408067, 6: 0.46105438706987556, 'micro': 0.5325773466352328}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   48.0s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   48.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   48.9s finished


logistic
{0: nan, 1: 0.3364981086788552, 2: 0.6487951913378498, 3: 0.9682074207730627, 4: 0.4033876074775494, 5: 0.718054911868977, 6: 0.08027515860470867, 'micro': 0.557768857821737}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.5min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


random_forest
{0: nan, 1: 0.43953283641737767, 2: 0.6200318932239318, 3: 0.9378765667380552, 4: 0.22193162264819577, 5: 0.7208643972689657, 6: 0.09094398799723283, 'micro': 0.5798648618067721}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 31.6min remaining: 47.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 31.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 31.7min finished


svc
{0: nan, 1: 0.468841107792054, 2: 0.48287473902258, 3: 0.9495200401523033, 4: 0.8581116375730872, 5: 0.5861780820988723, 6: 0.2778848513449658, 'micro': 0.5115716668893722}


In [14]:
voting_clf = VotingClassifier(estimators=Classifiers,voting='soft')

In [None]:
clf = make_pipeline(MinMaxScaler(), voting_clf)
#     cv = ShuffleSplit(n_splits=5, test_size=0.4, random_state=42)
cv = StratifiedKFold(n_splits=5, random_state=42)
y_score = cross_val_predict(clf, X,
                                Y, cv=cv, verbose=2,
                                n_jobs=-1, method='predict_proba')
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
tpr_all[clf_name] = tpr
fpr_all[clf_name] = fpr
roc_auc_all[clf_name] = roc_auc
print(clf_name)
print(roc_auc)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
