In [18]:
import numpy as np
import pickle
from time import gmtime, strftime

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, average_precision_score, f1_score, cohen_kappa_score, recall_score, log_loss
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier    
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Lars, ElasticNet, RidgeClassifier, BayesianRidge
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_selection import SelectKBest, mutual_info_classif


# UTC timestamp taken when this cell runs, formatted as YYYY-MM-DD_HH-MM-SS.
time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())

# Estimators that eval_classifiers() loops over. The commented-out entries are
# alternatives that are currently disabled.
classifiers = [
    # RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=10, max_features=1),
    # ExtraTreesClassifier(criterion='entropy', n_estimators=100, random_state=0),
    # GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0),
    KNeighborsClassifier(n_neighbors=1),
]

# Cut-off read by custom_score(): predictions >= threshold become 1, the rest 0.
threshold = 0.5

# Define a custom scoring function
def custom_score(y_true, y_pred, fn=accuracy_score):
    """Binarise ``y_pred`` at the module-level ``threshold`` and score it with ``fn``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; handed to ``fn`` unchanged.
    y_pred : array-like
        Predicted scores. Entries ``>= threshold`` become 1, all others 0.
    fn : callable, default=accuracy_score
        Metric invoked as ``fn(y_true, y_pred_binary)``.

    Returns
    -------
    The value returned by ``fn``.
    """
    # Coerce to ndarray first so plain lists/tuples are compared element-wise;
    # a bare ``list >= float`` comparison raises TypeError.
    scores = np.asarray(y_pred)

    # Apply the threshold to obtain binary predictions.
    y_pred_binary = np.where(scores >= threshold, 1, 0)

    return fn(y_true, y_pred_binary)



def eval_classifiers(X, y, labels, **kwargs):
    """Cross-validate every estimator in the module-level ``classifiers`` list.

    Each estimator is wrapped in a ``StandardScaler -> PCA -> estimator``
    pipeline and scored with shuffled, stratified K-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_validate``.
    y : array-like
        Target labels; also drives the stratification of the folds.
    labels : array-like
        Class labels forwarded to the kappa, F1 and recall scorers.
    **kwargs
        Optional overrides (any other key is ignored):

        * ``n_splits`` (default 100) -- number of CV folds.
        * ``n_components`` (default 200) -- number of PCA components.
        * ``random_state`` (default None) -- seed for the fold shuffling;
          pass an int to make the folds reproducible.

    Returns
    -------
    estimators : sequence
        The per-fold fitted pipelines of the LAST classifier evaluated
        (earlier classifiers' pipelines are not kept). Empty list when
        ``classifiers`` is empty.
    mean_res : pandas.DataFrame
        One row per classifier (indexed by ``str(clf)``), one column per
        ``cross_validate`` result key, holding the mean over folds.
    """
    n_splits = kwargs.get('n_splits', 100)
    n_components = kwargs.get('n_components', 200)
    random_state = kwargs.get('random_state', None)

    cv_scorers = {
        'accuracy_score': make_scorer(accuracy_score),
        # 'cross_entropy_loss': make_scorer(log_loss, labels=labels),
        # 'average_precision_score' : make_scorer(average_precision_score, average='weighted', pos_label =0),
        'cohen_kappa_score': make_scorer(cohen_kappa_score, labels=labels),
        'f1_score': make_scorer(f1_score, average='weighted', labels=labels),
        'recall_score': make_scorer(recall_score, average='weighted', labels=labels),
        # 'roc_auc_score': make_scorer(roc_auc_score, average='weighted', labels=labels, multi_class = 'ovr'),
        # 'specificity_score' : make_scorer(recall_score, pos_label=0, average='binary', labels=labels),
    }

    # Defined up front so the return statement is valid even when
    # ``classifiers`` is empty.
    estimators = []
    # Keyed by str(clf): a repeated classifier overwrites its earlier row.
    mean_rows = {}

    # Iterating the list directly (not enumerate) lets tqdm show a total.
    for base_clf in tqdm(classifiers, desc="Classifiers are running...."):
        clf_key = str(base_clf)

        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=n_components)),
            ('estimator', base_clf),
        ])

        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        cv_scores = cross_validate(
            pipeline, X, y,
            cv=cv,
            scoring=cv_scorers,
            return_train_score=False,
            return_estimator=True,
            n_jobs=-1,
            verbose=2,
        )

        # Separate the fitted pipelines from the numeric results before averaging.
        estimators = cv_scores.pop('estimator')

        mean_rows[clf_key] = {key: np.mean(values) for key, values in cv_scores.items()}

    # Build the frame once from records instead of growing it cell by cell.
    mean_res = pd.DataFrame(list(mean_rows.values()), index=list(mean_rows.keys()))

    return estimators, mean_res


In [19]:
from sklearn.model_selection import train_test_split

# Seed for the train/test split so the held-out set is the same on every run.
RANDOM_STATE = 42

data = pd.read_csv('../features/all/features_train_HSV_GLCM_shape_MC.csv')

# Two distinct label values -> binary task; anything else is treated as the
# 8-class task.
if data['label'].nunique(dropna=False) == 2:
    category_mapping = {'nevus': 1, 'others': 0}  # Should we switch?
    labels = [0, 1]
else:
    category_mapping = {'nev': 0, 'ack': 1, 'bcc': 2, 'bkl': 3, 'def': 4, 'mel': 5, 'scc': 6, 'vac': 7}
    labels = np.arange(8)

# Encode the string labels as integers. Series.map yields NaN for any label
# missing from the mapping, so fail here with a clear message instead of
# letting NaN targets reach scikit-learn.
y = data['label'].map(category_mapping)
unmapped = data.loc[y.isna(), 'label'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Labels missing from category_mapping: {list(unmapped)}")
y = y.astype(int)

# NOTE(review): assumes the first column is an identifier and the last column
# is 'label', so both are excluded from the features -- confirm against the CSV.
X = data.iloc[:, 1:-1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=RANDOM_STATE
)

# Standardization: fit on the training split only, then reuse on the test split.
# eval_classifiers() standardises again inside its pipeline; this outer scaler
# is kept so that X_test_ is transformed the same way as the data the
# pipelines were fitted on.
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train)
X_test_ = scaler.transform(X_test)

estimators, cv_metrics = eval_classifiers(X_train_, y_train, labels=labels)

Classifiers are running....: 0it [00:00, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
Classifiers are running....: 1it [01:22, 82.58s/it]


In [24]:
# Per-class accuracy on the held-out split, averaged over the cross-validation
# pipelines returned by eval_classifiers().

if len(estimators) == 0:
    raise ValueError("No fitted estimators to evaluate on the test split.")

# Accuracy of a uniform random guess over the classes, in percent.
print("null accuracy:", 100 / len(category_mapping))

# Class names and integer codes come from category_mapping, so the binary and
# the 8-class mappings both work and each name is paired with its own code
# rather than with its position in the dict.
y_test_arr = np.asarray(y_test)
class_masks = {name: y_test_arr == code for name, code in category_mapping.items()}

# A class with no test samples has no defined accuracy: report NaN for it.
acc_scores = {
    name: (0.0 if mask.any() else float('nan'))
    for name, mask in class_masks.items()
}

for estimator in tqdm(estimators):
    y_pred = estimator.predict(X_test_)
    for name, mask in class_masks.items():
        if not mask.any():
            continue
        # Accuracy restricted to the samples whose true class is `name`.
        acc_scores[name] += accuracy_score(y_test_arr[mask], y_pred[mask]) * 100

# Turn the accumulated sums into the mean over estimators.
acc_scores = {name: total / len(estimators) for name, total in acc_scores.items()}

print(acc_scores)

null accuracy: 12.5


100%|██████████| 100/100 [02:08<00:00,  1.29s/it]

{'nev': 67.24719585849873, 'ack': 20.96794871794874, 'bcc': 35.978260869565226, 'bkl': 28.411016949152526, 'def': 9.20930232558141, 'mel': 37.8955773955774, 'scc': 13.646017699115053, 'vac': 15.444444444444464}



