# Ensemble on probabilities (XGBoost)

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE 
from collections import Counter
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, plotting
from sklearn.model_selection import train_test_split
from hpsklearn import HyperoptEstimator
from hyperopt import tpe
from sklearn.metrics import accuracy_score

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


## 1. Create ensemble model dataset


In [2]:
# def create_merge_file(mode):
#     df =pd.read_csv(f'gs://oro-ds-test-bucket/sdd_acne_files/mlflow_prod/f7d0540ba8eb49f2acab131e6548f037/artifacts/all_preds/vit-0-{mode}_ensemble_all_predictions.csv')
#     df =df.merge(pd.read_csv(f'gs://oro-ds-test-bucket/sdd_acne_files/mlflow_prod/f7d0540ba8eb49f2acab131e6548f037/artifacts/all_preds/vit-1-{mode}_ensemble_all_predictions.csv'), on=['src', 'label'], how='outer')
#     df =df.merge(pd.read_csv(f'gs://oro-ds-test-bucket/sdd_acne_files/mlflow_prod/f7d0540ba8eb49f2acab131e6548f037/artifacts/all_preds/bit-0-{mode}_ensemble_all_predictions.csv'), on=['src', 'label'], how='outer')
#     df =df.merge(pd.read_csv(f'gs://oro-ds-test-bucket/sdd_acne_files/mlflow_prod/f7d0540ba8eb49f2acab131e6548f037/artifacts/all_preds/bit-1-{mode}_ensemble_all_predictions.csv'), on=['src', 'label'], how='outer')
#     return df
# df_val = create_merge_file('val')
# df_train = create_merge_file('train')
# df_train =df_train.dropna()
# df_all = pd.concat([df_val,df_train])
# test_data = create_merge_file('test')

In [3]:
#df_val = pd.read_csv('34.152.54.213')
df_train = pd.read_csv('Merged_For_Ensemble_probs.csv')
df_train =df_train.dropna()
test_data = pd.read_csv('test_Merged_For_Ensemble_probs.csv')


Here we want to divide the training set in training and validation set.
We use the file named 'vit_test.csv' to create the validation set.
In fact, all the images contain in this file were use to validate the various models. 

In [4]:
df_val = pd.read_csv('bit_test.csv')
df_val['src'] = df_val['filename'].apply(lambda x: x.split('/')[-1])
masks =df_train['src'].isin(df_val['src'].unique())
df_val= df_train[masks]
df_training = df_train[~masks]
print(f'There are {df_val.shape[0]} images in the validation set.')
print(f'There are {df_training.shape[0]} images in the training set.')

There are 988 images in the validation set.
There are 5593 images in the training set.


The validation set is used to finetuned the hyperparameters of the ensemble model.

In [5]:
columns = set(df_train.columns)
columns.remove('true_label')
columns.remove('src')

Once we find the best combination of hyperparameters, we use all the training + validation images to train the emsemble model.

In [6]:
# The final training set 
train_data= df_train

In [7]:
X_res, y_res = train_data[columns], train_data['true_label']

## 2. Hyperparameter search -> Hyperopt search

In [9]:
def get_top_k_classification_report(df_preds: pd.DataFrame, k: int, classes: list):
    """Generate top-3 classification report

    Args:
        df_preds (pd.DataFrame): predictions
        classes (list): list of classes

    Returns:
        dict: top-3 classification report
    """        
    exp_name= 'top'+str(k)+'_prediction'
    print(exp_name)
    prob_columns = ['prob_' + disease for disease in classes]
    preds_list = []
    for i in range(k):
        str_name = 'Pred'+ str(i+1) 
        str_name_probs = 'Prob' + str(i+1)
        preds_list.append(str_name)
        df_preds[str_name] =df_preds[prob_columns].apply(lambda x: x.sort_values(ascending=False).index[i].replace('prob_', ''), axis=1)
        df_preds[str_name_probs] = df_preds[prob_columns].apply(lambda x: x.sort_values(ascending=False).values[1], axis=1)
    df_preds['labels'] = df_preds['label']
    df_preds[exp_name] = df_preds.apply(lambda row: bool(set([row['labels']]).intersection(set(row[preds_list].values))), axis=1).reset_index(drop=True)
    #print(f"{exp_name} Accuracy: {df_preds[exp_name].value_counts()[True]/df_preds.shape[0]:0.3f}%.")
    y_pred = df_preds.apply(lambda x : x['labels'] if x[exp_name] else x['Pred1'] , axis=1).to_list()
    y_score = df_preds.apply(lambda x : x['labels'], axis=1).to_list()
    report = classification_report( y_score,y_pred, digits=4, output_dict=True)
    #print(classification_report( y_score,y_pred, digits=4, output_dict=False))
    return pd.DataFrame(report).transpose()

We are trying to find the best combination of these hyperparameters: [learning_rate, max_depth, min_child_weight, gamma, subsample, colsample_bytree
]

In [10]:
def optimize():
    space = {
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.3, 0.001),
            # Control complexity (control overfitting)
            # Maximum depth of a tree: default 6 -> range: [0:∞]
            'max_depth':  hp.choice('max_depth', np.arange(5, 10, dtype=int)),
            # Minimum sum of instance weight (hessian) needed in a child: default 1
            'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
            # Minimum loss reduction required: default 0 -> range: [0,∞]
            'gamma': hp.quniform('gamma', 0, 5, 0.5),

            # Add randomness to make training robust to noise (control overfitting)
            # Subsample ratio of the training instance: default 1
            'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
            # Subsample ratio of columns when constructing each tree: default 1
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
            
            # For reproducibility
            'seed': 42,
            # Faster computation
            'tree_method':'gpu_hist'
            }
        
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=500)
    
    return best

We use the validation set to evaluate the models.

In [11]:
X_train, y_train = df_training[columns].reset_index(drop=True),df_training['true_label'].reset_index(drop=True)
X_test, y_test = df_val[columns].reset_index(drop=True), df_val['true_label'].reset_index(drop=True)

In [20]:
def score(params):
    #print(params)
    xg_reg = xgb.XGBClassifier(**params)
    xg_reg.fit(X_train,y_train)
    
    preds_probs = xg_reg.predict_proba(X_test)
    #preds_probs = xg_reg.predict_proba(test_data[columns])  
    le = preprocessing.LabelEncoder()
    le.fit(xg_reg.classes_)
    
    df_final = pd.DataFrame()
    df_final['Pred1'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 0])
    df_final['Pred2'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 1])
    df_final['Pred3'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 2])
    
    df_final['label'] = y_test #test_data['label']
    preds_list =['Pred1', 'Pred2', 'Pred3']
    df_final['is_top_3'] = df_final.apply(lambda row: bool(set([row['label']]).intersection(set(row[preds_list].values))), axis=1)
    df_final['preds'] = df_final.apply(lambda x : x['label'] if x['is_top_3'] else x['Pred1'] , axis=1)

    score = accuracy_score(df_final['label'], df_final['preds'])

    return score

In [21]:
trials = Trials()

best_hyperparams = optimize()
print(f'The best hyperparameters are: {best_hyperparams}')

 ... (more hidden) ...
The best hyperparameters are: {'colsample_bytree': 1.0, 'gamma': 4.5, 'learning_rate': 0.28, 'max_depth': 1, 'min_child_weight': 3.0, 'subsample': 0.5}


Let's visualize the results.

In [22]:
# Save trials as a pandas dataframe
summary_table = pd.DataFrame()

for i in range(len(trials.trials)-1):
    row = pd.concat([pd.DataFrame({'loss':[trials.trials[i]['result']['loss']]}), \
                     pd.DataFrame(trials.trials[i]['misc']['vals'])], axis=1)
    summary_table = summary_table.append(row)

In [23]:
summary_table.sort_values(['loss'])

Unnamed: 0,loss,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,subsample
0,0.782389,1.00,4.5,0.280,1,3.0,0.50
0,0.783401,0.95,5.0,0.299,4,3.0,0.50
0,0.785425,0.95,3.5,0.294,0,3.0,0.50
0,0.785425,0.95,4.0,0.294,0,3.0,0.50
0,0.785425,1.00,3.0,0.282,0,3.0,0.50
0,0.785425,1.00,4.0,0.281,0,3.0,0.50
0,0.786437,1.00,5.0,0.274,1,3.0,0.50
0,0.786437,1.00,4.5,0.273,1,3.0,0.50
0,0.786437,1.00,3.0,0.281,0,3.0,0.50
0,0.786437,0.90,4.0,0.300,0,3.0,0.50


# 3. Final XGboost model

We use the best combination of hyperparameter.

In [8]:
np.arange(5, 10, dtype=int)

array([5, 6, 7, 8, 9])

In [10]:
xg_reg = xgb.XGBClassifier(colsample_bytree=0.55, gamma=2.5, learning_rate=0.08, max_depth=7, min_child_weight=2, subsample=1,  seed=42, tree_method='gpu_hist')
#xg_reg = xgb.XGBClassifier(colsample_bytree=0.60, gamma=4.5, learning_rate=0.066, max_depth=5, min_child_weight=3.0, subsample=0.50,  seed=42, tree_method='gpu_hist')


In [11]:
xg_reg.fit(X_res, y_res)
preds_probs = xg_reg.predict_proba(test_data[columns])
le = preprocessing.LabelEncoder()
le.fit(xg_reg.classes_)
df_final = pd.DataFrame()
df_final['Pred1'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 0])
df_final['Pred2'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 1])
df_final['Pred3'] =le.inverse_transform(np.flip(np.argsort(preds_probs),axis=1)[:, 2])
df_final['label'] = test_data['label']
preds_list =['Pred1', 'Pred2', 'Pred3']
df_final['is_top_3'] = df_final.apply(lambda row: bool(set([row['label']]).intersection(set(row[preds_list].values))), axis=1)
df_final['preds'] = df_final.apply(lambda x : x['label'] if x['is_top_3'] else x['Pred1'] , axis=1)
print(classification_report( df_final['label'], df_final['preds'], digits=4, output_dict=False))
#pd.DataFrame(classification_report( df_final['label'], df_final['preds'], digits=4, output_dict=True)).transpose().to_csv('resport.csv')

                                     precision    recall  f1-score   support

                       acne_comedos     0.8261    0.6552    0.7308        29
                        acne_cystic     0.9487    0.8409    0.8916        44
                    acne_excoriated     0.9091    0.5263    0.6667        19
                         acne_mixed     0.8934    0.9778    0.9337       180
                         acne_scars     0.9444    0.8500    0.8947        20
                  actinic_keratosis     0.9375    0.8824    0.9091        17
                alopecia_androgenic     0.9143    0.9412    0.9275        34
                    alopecia_areata     0.8333    0.7692    0.8000        13
                  atopic_dermatitis     0.8828    0.8692    0.8760       130
               cheilitis_eczematous     0.7826    0.8182    0.8000        22
                chronic_hand_eczema     0.9636    0.9636    0.9636        55
                        dyshidrosis     0.7885    0.9762    0.8723        4