In [1]:
import numpy as np
import pickle
from time import gmtime, strftime

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, average_precision_score, f1_score, cohen_kappa_score, recall_score, log_loss
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier    
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Lars, ElasticNet, RidgeClassifier, BayesianRidge
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_selection import SelectKBest, mutual_info_classif


time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())

classifiers = [
                RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=10, max_features=1),
                KNeighborsClassifier(1),
            ]

# Define the threshold for binary classification
threshold = 0.5

# Define a custom scoring function
def custom_score(y_true, y_pred, fn=accuracy_score):
    # Apply the threshold to obtain binary predictions
    y_pred_binary = np.where(y_pred >= threshold, 1, 0)

    # Calculate and return the custom metric
    # Replace this with your own custom metric calculation
    return fn(y_true, y_pred_binary)



def eval_classifiers(X, y, labels, **kwargs):

    cv_scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    # 'cross_entropy_loss': make_scorer(log_loss, labels=labels),
    # 'average_precision_score' : make_scorer(average_precision_score, average='weighted', pos_label =0),
    'cohen_kappa_score' : make_scorer(cohen_kappa_score, labels=labels),
    'f1_score' : make_scorer(f1_score, average='weighted', labels=labels),
    'recall_score' : make_scorer(recall_score, average='weighted', labels=labels),
    # 'roc_auc_score': make_scorer(roc_auc_score, average='weighted', labels=labels, multi_class = 'ovr'),
    # 'specificity_score' : make_scorer(recall_score, pos_label=0, average='binary', labels=labels),
    }
    # Define the list of scoring metrics
    mean_res = pd.DataFrame()
    std_res = pd.DataFrame()
    
    for i, clf in tqdm(enumerate(classifiers), desc="Classifiers are running...."):
        # ax = plt.subplot(len(classifiers) + 1, i)
        clf_key = str(clf)
        
        
        clf = Pipeline(steps=[('scaler',StandardScaler()),
                            ('estimator',clf)])
        
        # Apply cross-validated model here.
        cv = StratifiedKFold(n_splits=100, shuffle=True)  # Specify the number of desired folds
        cv_scores = cross_validate(clf, X, y, cv=cv, scoring=cv_scorers, return_train_score=False, return_estimator=True, n_jobs=-1,verbose=2)  # Specify the list of scoring metrics
        # print(cv_scores)
        # print(np.array(cv_scores.values()))
        estimators = cv_scores['estimator']

        # Delete estimators
        del cv_scores['estimator']
        # Use sklearn metrics AUC.
        for j, key in enumerate(cv_scores.keys()):
            mean_res.loc[clf_key, key] = np.mean(cv_scores[key])
            std_res.loc[clf_key, key] = np.std(cv_scores[key])

    
    return estimators, mean_res


In [2]:
from sklearn.model_selection import train_test_split

data = pd.read_csv('../features/all/features_train_HSV_GLCM_shape_MC.csv')

if len(data['label'].unique()) == 2:
    category_mapping = {'nevus': 1, 'others': 0} # Should we switch?
    labels = [0, 1]

else:
    data_exc = data[data['label'].isin(['mel', 'bcc', 'scc'])]
    category_mapping = {
                        'mel': 0, 
                        'bcc': 1, 
                        'scc': 2, 
                        }
    labels = [0, 1, 2]
    

y_train =  data_exc['label'].astype('category').map(category_mapping)

X = data_exc.iloc[:, 1:-1]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

# Standardization
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X)

# PCA
# pca = PCA(0.75)
# pca.fit(X_train_)
# X_train_ = pca.transform(X_train_)


In [None]:
estimators, mean_res = eval_classifiers(X_train_, y_train, labels = labels) 

In [3]:
val_data = pd.read_csv('../features/all/features_val_HSV_GLCM_shape_MC.csv')

val_data_exc = val_data[val_data['label'].isin(list(category_mapping.keys()))]

y_val =  val_data_exc['label'].astype('category').map(category_mapping)

X = val_data_exc.iloc[:, 1:-1]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

# Standardization
X_val_ = scaler.transform(X)
# X_val_ = pca.transform(X_val_)

In [None]:
print("null accuracy:", 1/3 *100 )   

acc_scores = {'bcc': 0.0, 'mel': 0.0, 'scc': 0.0}
chk_score = 0.0

for estimator in tqdm(estimators):
    y_pred = estimator.predict(X_val_) 
    for i in range(3):
        lab = list(category_mapping.keys())[i]
        y_test_slice = y[y == i]
        y_pred_slice = y_pred[y == i]
        # print(i, len(y_test_slice), len(y_pred_slice), np.unique(y_test_slice), np.unique(y_pred_slice))
        acc_scores[lab] += accuracy_score(y_test_slice, y_pred_slice)
    
    chk_score += cohen_kappa_score(y, y_pred, labels=labels)        

for k, v in acc_scores.items():

    acc_scores[k] = v / len(estimators)

    chk_score /= len(estimators)

print(acc_scores, chk_score)

In [6]:
mean_res

Unnamed: 0,fit_time,score_time,test_accuracy_score,test_cohen_kappa_score,test_f1_score,test_recall_score
"RandomForestClassifier(criterion='entropy', max_depth=20, max_features=1,\n n_estimators=10)",0.491538,0.013985,0.522667,0.062969,0.486542,0.522667
KNeighborsClassifier(n_neighbors=1),0.136219,0.062024,0.530733,0.146475,0.524556,0.530733


In [7]:
# Define the base classifiers
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = SVC(probability=True)

# Create a Voting Classifier using majority voting
voting_clf = VotingClassifier(estimators=[('dt', clf1), ('rf', clf2), ('svm', clf3)], voting='soft')

# # Initialize a StratifiedKFold cross-validator
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Perform cross-validation with the Voting Classifier
# predicted_labels = cross_val_predict(voting_clf, X_train_, y_train, cv=cv, method='predict')

# # Calculate the accuracy across all cross-validation folds
# accuracy = accuracy_score(y_train, predicted_labels)

# print("Cross-validated Accuracy:", accuracy)

# cohen_kappa_ = cohen_kappa_score(y_train, predicted_labels, labels=labels)
# print("Cohen-Kappa Score:", cohen_kappa_)

voting_clf.fit(X_train_, y_train)

y_val_pred = voting_clf.predict(X_val_)

cohen_kappa_ = cohen_kappa_score(y_val, y_val_pred, labels=labels)

print("Cohen-Kappa Score:", cohen_kappa_)

Cohen-Kappa Score: 0.418972008139445


In [15]:
def show_score():
    print("null accuracy:", 1/3 *100 )   

    acc_scores = {'bcc': 0.0, 'mel': 0.0, 'scc': 0.0}

    for i in range(3):
        lab = list(category_mapping.keys())[i]
        y_test_slice = y_val[y_val == i]
        y_pred_slice = y_val_pred[y_val == i]
        # print(i, len(y_test_slice), len(y_pred_slice), np.unique(y_test_slice), np.unique(y_pred_slice))
        acc_scores[lab] = accuracy_score(y_test_slice, y_pred_slice)

        chk_score = cohen_kappa_score(y_val, y_val_pred, labels=labels)        


    print(acc_scores, chk_score)

show_score()

null accuracy: 33.33333333333333
{'bcc': 0.6686746987951807, 'mel': 0.7743362831858407, 'scc': 0.10638297872340426} 0.418972008139445


## Resample

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight

ros = RandomOverSampler(random_state=42)

X_train__, y_train_ = ros.fit_resample(X_train_, y_train)

# Define the base classifiers
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier()
clf3 = SVC(probability=True)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

# Create a dictionary mapping classes to their respective weights
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

print(class_weight_dict)

# Create a Voting Classifier using majority voting
voting_clf = VotingClassifier(estimators=[('dt', clf1), ('rf', clf2), ('svm', clf3)], voting='soft', weights=class_weights)

# Fit & Predict
voting_clf.fit(X_train__, y_train_) 

y_val_pred = voting_clf.predict(X_val_)

{0: 0.6244010320678216, 1: 0.8499749121926744, 2: 4.50531914893617}


In [10]:
def show_score():
    print("null accuracy:", 1/3 *100 )   

    acc_scores = {'bcc': 0.0, 'mel': 0.0, 'scc': 0.0}

    for i in range(3):
        lab = list(category_mapping.keys())[i]
        y_test_slice = y_val[y_val == i]
        y_pred_slice = y_val_pred[y_val == i]
        # print(i, len(y_test_slice), len(y_pred_slice), np.unique(y_test_slice), np.unique(y_pred_slice))
        acc_scores[lab] = accuracy_score(y_test_slice, y_pred_slice)

        chk_score = cohen_kappa_score(y_val, y_val_pred, labels=labels)        


    print(acc_scores, chk_score)

show_score()

null accuracy: 33.33333333333333
{'bcc': 0.6987951807228916, 'mel': 0.799410029498525, 'scc': 0.13829787234042554} 0.4678973445669794


In [11]:
(0.7 + 0.8 + 0.14) / 3

0.5466666666666667