# A machine learning decision tree approach

The iMeta algorithm is essentially a decision tree algorithm, where the variables and thresholds for the decisions at each step are manually specified based on human analysis. The simplest way to apply machine learning techniques to the problem is to use a structure similar to iMeta, namely a decision tree, but to use standard ML training techniques to learn the parameters, such as which thresholds to use and how many branches/leaves the tree should have for the best results. 

In [1]:
import os
import sys
import pathlib
import functools
import itertools

In [2]:
import pandas
import numpy

In [3]:
import matplotlib
import matplotlib.pyplot
import warnings
warnings.filterwarnings('ignore')

In [4]:
import ipywidgets
import time

In [5]:
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.tree
import sklearn.metrics

In [6]:
root_repo_dir = pathlib.Path().absolute().parent
sys.path = [os.path.join(root_repo_dir)] + sys.path

In [7]:
import xbt.dataset
from xbt.dataset import XbtDataset, UNKNOWN_STR, cat_output_formatter, check_value_found
from xbt.imeta import imeta_classification, XBT_MAX_DEPTH

In [8]:
# Set up some site specific parameters for the notebook.
# The environment name comes from XBT_ENV_NAME, falling back to 'pangeo' when it is unset.
environment = os.environ.get('XBT_ENV_NAME', 'pangeo')

In [9]:
# Root data directory for each supported environment name.
root_data_dirs = dict(
    MO_scitools='/data/users/shaddad/xbt-data/',
    pangeo='/data/misc/xbt-data/',
)
# (first year, last year) pair used for each environment.
env_date_ranges = dict(
    MO_scitools=(1966, 2015),
    pangeo=(1966, 2015),
)

In [10]:
# Set up some dataset specific parameters
root_data_dir = root_data_dirs[environment]
year_range = env_date_ranges[environment]

In [40]:
cv_metric_names = ['f1_weighted','precision_weighted','recall_weighted']
input_feature_names = ['country','max_depth', 'year', 'lat', 'lon']
target_feature_name = 'instrument'

In [12]:
input_dir_name = 'csv_with_imeta'
exp_out_dir_name = 'experiment_outputs'

In [13]:
experiment_name = 'nb_single_decisionTree_country'
classifier_class = sklearn.tree.DecisionTreeClassifier
classifier_name = 'decision_tree'
suffix='countryAndLatLon'

In [14]:
classifier_opts = {'max_depth': 20,
                   'min_samples_leaf': 1,
                   'criterion': 'gini'
                  }

In [15]:
xbt_input_dir = os.path.join(root_data_dir, input_dir_name)
xbt_output_dir = os.path.join(root_data_dir, exp_out_dir_name, experiment_name)

In [16]:
# Make sure the output directory for this experiment exists before results are written.
os.makedirs(xbt_output_dir, exist_ok=True)
print(f'outputting to {xbt_output_dir}')

outputting to /data/users/shaddad/xbt-data/experiment_outputs/nb_single_decisionTree_country


In [17]:
output_fname_template = 'xbt_output_{exp_name}_{subset}.csv'
result_fname_template = 'xbt_metrics_{classifier}_{suffix}.csv'

In [18]:
%%time
xbt_full_dataset = XbtDataset(xbt_input_dir, year_range)

CPU times: user 53.1 s, sys: 16.3 s, total: 1min 9s
Wall time: 1min 14s


## Data preparation

We are only testing on the labelled data, to be able to evaluate performance. The XbtDataset class has filtered out some bad data, including profiles with maximum depths less than 0.0 or greater than 2000.0. There were also some profiles with bad date entries, which have been excluded for now.

In [19]:
%%time
xbt_labelled = xbt_full_dataset.filter_obs({'labelled': 'labelled'})

CPU times: user 225 ms, sys: 55 ms, total: 280 ms
Wall time: 278 ms


In [20]:
_ = xbt_labelled.get_ml_dataset(return_data = False)

In [21]:
_ = xbt_labelled.filter_features(['instrument','model','manufacturer']).encode_target(return_data = False)

In [102]:
%%time
# Cruise numbers sampled with fraction=0.1; later cells flag every profile from these cruises as unseen.
unseen_cruise_numbers = xbt_labelled.sample_feature_values('cruise_number', fraction=0.1)
# Names of the boolean flag columns added to the labelled dataframe in the next cell.
validation_var_name = 'validation'
validation_part_name = 'validation_part'
validation_whole_name = 'validation_whole'
# Index labels of a 10% sample of the profiles of each instrument, flattened into one list.
validate_indices = [
    profile_index
    for selected_instrument in xbt_labelled['instrument'].unique()
    for profile_index in xbt_labelled.filter_obs({target_feature_name: selected_instrument}).xbt_df.sample(frac=0.1).index
]

CPU times: user 2.03 s, sys: 2.99 ms, total: 2.03 s
Wall time: 2.03 s


In [103]:
# Flag the held-out ("unseen") profiles on the labelled dataframe.
# Both the combined flag and the whole-cruise flag start as "cruise number is one of the sampled unseen cruises".
xbt_labelled.xbt_df[validation_var_name] = xbt_labelled.xbt_df['cruise_number'].isin(unseen_cruise_numbers)
xbt_labelled.xbt_df[validation_whole_name] = xbt_labelled.xbt_df['cruise_number'].isin(unseen_cruise_numbers)
# The combined flag additionally covers the individually sampled profiles.
xbt_labelled.xbt_df.loc[validate_indices, validation_var_name] = True
# The partial-cruise flag marks only the individually sampled profiles.
# NOTE(review): validate_indices is drawn from all labelled profiles, so a profile can carry
# both the whole-cruise and the partial-cruise flag.
xbt_labelled.xbt_df[validation_part_name] = False
xbt_labelled.xbt_df.loc[validate_indices, validation_part_name] = True


In [24]:
%%time
xbt_unseen = xbt_labelled.filter_obs({validation_var_name: True})
xbt_working = xbt_labelled.filter_obs({validation_var_name: False})

CPU times: user 107 ms, sys: 2.02 ms, total: 109 ms
Wall time: 107 ms


In [25]:
imeta_classes = xbt_labelled.xbt_df.apply(imeta_classification, axis=1)
imeta_model = imeta_classes.apply(lambda t1: t1[0])
imeta_manufacturer = imeta_classes.apply(lambda t1: t1[1])

In [26]:
imeta_instrument = imeta_classes.apply(lambda t1: f'XBT: {t1[0]} ({t1[1]})') 

We are currently training and evaluating separately for model and manufacturer. We will also need to train and evaluate together as this is ultimately what is wanted (a combined probe model and manufacturer field).

We are using the default 80/20 split in scikit-learn for now. Further work will need to do proper cross validation where several different splits are randomly selected to verify our results are not an artifact of the randomly chosen split.

In [27]:
%%time
# Number of profiles drawn (with replacement) for every instrument class.
num_resamples_per_class = 50000
# Count labelled profiles per instrument once; only classes with more than 100 are resampled.
labelled_instrument_counts = xbt_labelled.xbt_df[target_feature_name].value_counts()
instrument_sample_list = [
    instrument1
    for instrument1, count1 in labelled_instrument_counts.items()
    if count1 > 100
]
# One resampled dataframe per selected instrument, drawn from the working (non-validation) set.
resampled_profiles_list = [
    xbt_working.filter_obs({target_feature_name: ins1}).xbt_df.sample(num_resamples_per_class, replace=True)
    for ins1 in instrument_sample_list
]
# Distinct index labels of the working set that were drawn at least once.
resampled_training_indices = list(
    set(itertools.chain.from_iterable(rp1.index for rp1 in resampled_profiles_list))
)


CPU times: user 1.48 s, sys: 62.1 ms, total: 1.54 s
Wall time: 1.54 s


In [28]:
%%time
xbt_working.xbt_df['resample_train'] = xbt_working.xbt_df.index.isin(resampled_training_indices)

CPU times: user 50.1 ms, sys: 845 µs, total: 51 ms
Wall time: 48.5 ms


In [29]:
# Stack the per-instrument resamples into a single frame with a fresh 0..n-1 index.
resampled_df = pandas.concat(resampled_profiles_list, ignore_index=True)
xbt_resampled_train_all = XbtDataset(xbt_input_dir, year_range, df=resampled_df)
# Share the encoder objects held by the labelled dataset with the resampled dataset.
xbt_resampled_train_all._feature_encoders, xbt_resampled_train_all._target_encoders = (
    xbt_labelled._feature_encoders,
    xbt_labelled._target_encoders,
)

In [30]:
%%time
xbt_resampled_test_all = xbt_working.filter_obs({'resample_train': False})

CPU times: user 52.4 ms, sys: 31.2 ms, total: 83.6 ms
Wall time: 81.1 ms


In [31]:
%%time
xbt_train_all, xbt_test_all = xbt_working.train_test_split(refresh=True, features=[target_feature_name, 'year'])

CPU times: user 1min 54s, sys: 12 s, total: 2min 6s
Wall time: 2min 6s


get separate sets of 2 types of unseen data.
* data from cruises where the whole cruise is in the unseen data 
* data where some of the profiles are in the training data

In [122]:
xbt_unseen_whole = xbt_labelled.filter_obs({validation_whole_name: True})
xbt_unseen_part = xbt_labelled.filter_obs({validation_part_name: True})

In [32]:
X_train_all = xbt_train_all.filter_features(input_feature_names).get_ml_dataset()[0]
X_test_all = xbt_test_all.filter_features(input_feature_names).get_ml_dataset()[0]
y_instr_train_all = xbt_train_all.filter_features([target_feature_name]).get_ml_dataset()[0]
y_instr_test_all = xbt_test_all.filter_features([target_feature_name]).get_ml_dataset()[0]


In [33]:
X_resampled_train_all = xbt_resampled_train_all.filter_features(input_feature_names).get_ml_dataset()[0]
X_resampled_test_all = xbt_resampled_test_all.filter_features(input_feature_names).get_ml_dataset()[0]
y_resampled_instr_train_all = xbt_resampled_train_all.filter_features([target_feature_name]).get_ml_dataset()[0]
y_resampled_instr_test_all = xbt_resampled_test_all.filter_features([target_feature_name]).get_ml_dataset()[0]


In [34]:
X_unseen_all = xbt_unseen.filter_features(input_feature_names).get_ml_dataset()[0]
y_instr_unseen_all = xbt_unseen.filter_features([target_feature_name]).get_ml_dataset()[0]

## Training the classifier

We are using the scikit-learn classifier as the closest analogue to the structure of the iMeta algorithm. This tree can have many more nodes and leaves than iMeta though. It is quick to train and evaluate so it is a useful starting point for setting up the ML processing pipelines, as all the scikit-learn classifiers have a common interface. 

For the model and manufacturer, we train a Decision Tree Classifier, then use it to predict values for the train and test sets. We then calculate the accuracy metrics for each for the whole dataset. 

I am using precision, recall and F1 as fairly standard ML metrics of accuracy. Recall is what has been used in the two previous papers (Palmer et al., Leahy and Llopis et al.) so that is the focus. Support is useful to see what proportion of the profiles in the dataset belong to each of the different classes.

In [35]:
clf_dt_instr1 = classifier_class(**classifier_opts)
clf_dt_instr1.fit(X_train_all,y_instr_train_all)

DecisionTreeClassifier(max_depth=20)

In [36]:
clf_dt_instr_resampled1 = classifier_class(**classifier_opts)
clf_dt_instr_resampled1.fit(X_resampled_train_all,y_resampled_instr_train_all)

DecisionTreeClassifier(max_depth=20)

In [37]:
metrics_per_class_all = {}
metrics_avg_all = {}
metrics_per_class_all_resampled = {}
metrics_avg_all_resampled = {}

In [38]:
# Class names taken from the target feature's encoder; both per-class tables get their own list copy.
instrument_class_names = xbt_labelled._feature_encoders[target_feature_name].classes_
metrics_per_class_all[target_feature_name] = list(instrument_class_names)
metrics_per_class_all_resampled[target_feature_name] = list(instrument_class_names)

In [73]:
def calc_metric_subset(xbt_subset, clf1, filter_dict, inputs, target_feature, metric_func, metric_args_dict):
    """Evaluate one metric for a classifier on a filtered subset of a dataset.

    Args:
        xbt_subset: dataset object providing ``filter_obs``, ``filter_features``,
            ``shape`` and ``get_ml_dataset``.
        clf1: fitted classifier exposing ``predict``.
        filter_dict: constraints passed to ``filter_obs`` to select observations.
        inputs: names of the input features fed to the classifier.
        target_feature: name of the feature holding the true labels.
        metric_func: callable invoked as ``metric_func(y_true, y_pred, **metric_args_dict)``.
        metric_args_dict: extra keyword arguments for ``metric_func``.

    Returns:
        The value returned by ``metric_func``, or 0.0 when no observation
        matches ``filter_dict``.
    """
    xbt_selected = xbt_subset.filter_obs(filter_dict)
    if xbt_selected.shape[0] == 0:
        return 0.0
    y_pred = clf1.predict(xbt_selected.filter_features(inputs).get_ml_dataset()[0])
    y_true = xbt_selected.filter_features([target_feature]).get_ml_dataset()[0]
    # scikit-learn metrics expect the true labels first and the predictions second;
    # swapping them silently exchanges precision and recall for non-micro averages.
    return metric_func(y_true, y_pred, **metric_args_dict)

In [116]:
# Keyword arguments shared by the averaged metrics: integer labels 0..n-1, one per entry of
# metrics_per_class_all['instrument'], combined with micro averaging.
# NOTE(review): these metrics are evaluated on subsets filtered to a single true class; the
# per-class tables in this notebook show recall, precision, f1 and accuracy coming out identical.
do_avg_args_dict = dict(
    labels=list(range(len(metrics_per_class_all['instrument']))),
    average='micro',
)
# Metric name -> keyword arguments accepted by calc_metric_subset.
metrics_defs_dict = {
    metric_name: {'metric_func': metric_func, 'metric_args_dict': metric_args}
    for metric_name, metric_func, metric_args in [
        ('recall', sklearn.metrics.recall_score, do_avg_args_dict),
        ('precision', sklearn.metrics.precision_score, do_avg_args_dict),
        ('accuracy', sklearn.metrics.accuracy_score, {}),
        ('f1', sklearn.metrics.f1_score, do_avg_args_dict),
    ]
}
# Split name -> dataset the classifier is scored on.
data_splits = dict(train=xbt_train_all, test=xbt_test_all, unseen=xbt_unseen)

In [74]:
# One column per (metric, split) pair and one row per instrument class.
metrics_dict = pandas.DataFrame({
    f'{metric_name}_instr_{split_name}': [
        calc_metric_subset(
            xbt_subset=split1,
            clf1=clf_dt_instr1,
            filter_dict={target_feature_name: fn1},
            inputs=input_feature_names,
            target_feature=target_feature_name,
            **metric1,
        )
        for fn1 in metrics_per_class_all[target_feature_name]
    ]
    for metric_name, metric1 in metrics_defs_dict.items()
    for split_name, split1 in data_splits.items()
})
metrics_dict

Unnamed: 0,recall_instr_train,recall_instr_test,recall_instr_unseen,precision_instr_train,precision_instr_test,precision_instr_unseen,accuracy_instr_train,accuracy_instr_test,accuracy_instr_unseen,f1_instr_train,f1_instr_test,f1_instr_unseen
0,0.9375,0.8125,0.76,0.9375,0.8125,0.76,0.9375,0.8125,0.76,0.9375,0.8125,0.76
1,0.780435,0.798246,0.810811,0.780435,0.798246,0.810811,0.780435,0.798246,0.810811,0.780435,0.798246,0.810811
2,0.990881,0.983282,0.980154,0.990881,0.983282,0.980154,0.990881,0.983282,0.980154,0.990881,0.983282,0.980154
3,0.921918,0.862637,0.803738,0.921918,0.862637,0.803738,0.921918,0.862637,0.803738,0.921918,0.862637,0.803738
4,0.899532,0.86875,0.855046,0.899532,0.86875,0.855046,0.899532,0.86875,0.855046,0.899532,0.86875,0.855046
5,0.071429,0.0,0.0,0.071429,0.0,0.0,0.071429,0.0,0.0,0.071429,0.0,0.0
6,0.945766,0.904282,0.883448,0.945766,0.904282,0.883448,0.945766,0.904282,0.883448,0.945766,0.904282,0.883448
7,0.939394,0.625,0.75,0.939394,0.625,0.75,0.939394,0.625,0.75,0.939394,0.625,0.75
8,0.66,0.364706,0.322222,0.66,0.364706,0.322222,0.66,0.364706,0.322222,0.66,0.364706,0.322222
9,0.988736,0.977492,0.972819,0.988736,0.977492,0.972819,0.988736,0.977492,0.972819,0.988736,0.977492,0.972819


In [75]:
y_res_train_instr_all = clf_dt_instr1.predict(X_train_all)
# Per-class precision, recall, F1 and support on the training split.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_instr_train_all,
    y_res_train_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all.update(zip(
    ['precision_instr_train', 'recall_instr_train', 'f1_instr_train', 'support_instr_train'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all.update({
    f'{score_name}_instr_train': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [76]:
# Accuracy on the training profiles of each instrument class in turn.
metrics_per_class_all['accuracy_instr_train'] = [
    sklearn.metrics.accuracy_score(
        train_subset.filter_features(['instrument']).get_ml_dataset()[0],
        clf_dt_instr1.predict(train_subset.filter_features(input_feature_names).get_ml_dataset()[0]),
    )
    for train_subset in (
        xbt_train_all.filter_obs({'instrument': instr1})
        for instr1 in metrics_per_class_all['instrument']
    )
]
# Overall accuracy on the whole training split.
metrics_avg_all['accuracy_instr_train'] = sklearn.metrics.accuracy_score(y_instr_train_all, y_res_train_instr_all)

In [77]:
y_res_resampled_train_instr_all = clf_dt_instr_resampled1.predict(X_resampled_train_all)
# Per-class precision, recall, F1 and support of the resampled classifier on its training data.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_resampled_instr_train_all,
    y_res_resampled_train_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all_resampled.update(zip(
    ['precision_instr_train', 'recall_instr_train', 'f1_instr_train', 'support_instr_train'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all_resampled.update({
    f'{score_name}_instr_train': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [78]:
def calc_accuracy_by_class(xbt_data, class_constraint, clf1, input_features, target_feature='instrument'):
    """Return the accuracy of a classifier on the observations matching a constraint.

    Args:
        xbt_data: dataset object providing ``filter_obs``, ``filter_features``,
            ``shape`` and ``get_ml_dataset``.
        class_constraint: constraints passed to ``filter_obs`` to select observations.
        clf1: fitted classifier exposing ``predict``.
        input_features: names of the input features fed to the classifier.
        target_feature: name of the feature holding the true labels. Defaults to
            ``'instrument'``, the value that used to be hard-coded.

    Returns:
        The ``sklearn.metrics.accuracy_score`` of the predictions, or 0.0 when no
        observation matches ``class_constraint``.
    """
    xbt_subset = xbt_data.filter_obs(class_constraint)
    if xbt_subset.shape[0] == 0:
        return 0.0
    y_true = xbt_subset.filter_features([target_feature]).get_ml_dataset()[0]
    y_pred = clf1.predict(xbt_subset.filter_features(input_features).get_ml_dataset()[0])
    return sklearn.metrics.accuracy_score(y_true, y_pred)
    

In [79]:
# Per-class accuracy of the resampled classifier on its own training data.
metrics_per_class_all_resampled['accuracy_instr_train'] = [
    calc_accuracy_by_class(
        xbt_data=xbt_resampled_train_all,
        class_constraint={'instrument': instr1},
        clf1=clf_dt_instr_resampled1,
        input_features=input_feature_names,
    )
    for instr1 in metrics_per_class_all_resampled['instrument']
]
# Overall accuracy on the resampled training data.
metrics_avg_all_resampled['accuracy_instr_train'] = sklearn.metrics.accuracy_score(y_resampled_instr_train_all, y_res_resampled_train_instr_all)

In [80]:
y_res_test_instr_all = clf_dt_instr1.predict(X_test_all)
# Per-class precision, recall, F1 and support on the test split.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_instr_test_all,
    y_res_test_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all.update(zip(
    ['precision_instr_test', 'recall_instr_test', 'f1_instr_test', 'support_instr_test'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all.update({
    f'{score_name}_instr_test': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [81]:
# Per-class accuracy on the test split.
metrics_per_class_all['accuracy_instr_test'] = [
    calc_accuracy_by_class(
        xbt_data=xbt_test_all,
        class_constraint={'instrument': instr1},
        clf1=clf_dt_instr1,
        input_features=input_feature_names,
    )
    for instr1 in metrics_per_class_all['instrument']
]
# Overall accuracy on the whole test split.
metrics_avg_all['accuracy_instr_test'] = sklearn.metrics.accuracy_score(y_instr_test_all, y_res_test_instr_all)

In [82]:
y_res_resampled_test_instr_all = clf_dt_instr_resampled1.predict(X_resampled_test_all)
# Per-class precision, recall, F1 and support of the resampled classifier on its test data.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_resampled_instr_test_all,
    y_res_resampled_test_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all_resampled.update(zip(
    ['precision_instr_test', 'recall_instr_test', 'f1_instr_test', 'support_instr_test'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all_resampled.update({
    f'{score_name}_instr_test': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [83]:
# Per-class accuracy of the resampled classifier on its test data.
metrics_per_class_all_resampled['accuracy_instr_test'] = [
    calc_accuracy_by_class(
        xbt_data=xbt_resampled_test_all,
        class_constraint={'instrument': instr1},
        clf1=clf_dt_instr_resampled1,
        input_features=input_feature_names,
    )
    for instr1 in metrics_per_class_all_resampled['instrument']
]
# Overall accuracy on the resampled test data.
metrics_avg_all_resampled['accuracy_instr_test'] = sklearn.metrics.accuracy_score(y_resampled_instr_test_all, y_res_resampled_test_instr_all)

In [84]:
y_res_unseen_instr_all = clf_dt_instr1.predict(X_unseen_all)
# Per-class precision, recall, F1 and support on the unseen (validation) data.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_instr_unseen_all,
    y_res_unseen_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all.update(zip(
    ['precision_instr_unseen', 'recall_instr_unseen', 'f1_instr_unseen', 'support_instr_unseen'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all.update({
    f'{score_name}_instr_unseen': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [85]:
# Per-class accuracy on the unseen (validation) data.
metrics_per_class_all['accuracy_instr_unseen'] = [
    calc_accuracy_by_class(
        xbt_data=xbt_unseen,
        class_constraint={'instrument': instr1},
        clf1=clf_dt_instr1,
        input_features=input_feature_names,
    )
    for instr1 in metrics_per_class_all['instrument']
]
# Overall accuracy on all unseen data.
metrics_avg_all['accuracy_instr_unseen'] = sklearn.metrics.accuracy_score(y_instr_unseen_all, y_res_unseen_instr_all)

In [86]:
y_res_resampled_unseen_instr_all = clf_dt_instr_resampled1.predict(X_unseen_all)
# Per-class precision, recall, F1 and support of the resampled classifier on the unseen data.
metrics1 = sklearn.metrics.precision_recall_fscore_support(
    y_instr_unseen_all,
    y_res_resampled_unseen_instr_all,
    labels=list(range(len(metrics_per_class_all['instrument']))),
)
metrics_per_class_all_resampled.update(zip(
    ['precision_instr_unseen', 'recall_instr_unseen', 'f1_instr_unseen', 'support_instr_unseen'],
    metrics1,
))
# Support-weighted mean of each per-class score.
metrics_avg_all_resampled.update({
    f'{score_name}_instr_unseen': sum(per_class * metrics1[3]) / sum(metrics1[3])
    for score_name, per_class in zip(['precision', 'recall', 'f1'], metrics1)
})

In [87]:
# Per-class accuracy of the resampled classifier on the unseen data.
metrics_per_class_all_resampled['accuracy_instr_unseen'] = [
    calc_accuracy_by_class(
        xbt_data=xbt_unseen,
        class_constraint={'instrument': instr1},
        clf1=clf_dt_instr_resampled1,
        input_features=input_feature_names,
    )
    for instr1 in metrics_per_class_all_resampled['instrument']
]
# Overall accuracy of the resampled classifier on all unseen data.
metrics_avg_all_resampled['accuracy_instr_unseen'] = sklearn.metrics.accuracy_score(y_instr_unseen_all, y_res_resampled_unseen_instr_all)

In [88]:
df_metrics_per_class_instr = pandas.DataFrame.from_dict({k1:v1 for k1,v1 in metrics_per_class_all.items() if 'instr' in k1})

In [89]:
df_metrics_per_class_instr_resampled = pandas.DataFrame.from_dict({k1:v1 for k1,v1 in metrics_per_class_all_resampled.items() if 'instr' in k1})

In [90]:
# Number of labelled profiles per instrument class, counted with a vectorised sum rather
# than a Python-level loop over every row; assumes xbt_labelled['instrument'] is a
# pandas/numpy column (TODO confirm).
df_metrics_per_class_instr['num_labelled_samples'] = [
    (xbt_labelled['instrument'] == i1).sum()
    for i1 in df_metrics_per_class_instr['instrument']
]
# Show the best-populated classes first.
df_metrics_per_class_instr = df_metrics_per_class_instr.sort_values(axis='rows', by='num_labelled_samples', ascending=False)

In [91]:
# Number of labelled profiles per instrument class, counted with a vectorised sum rather
# than a Python-level loop over every row; assumes xbt_labelled['instrument'] is a
# pandas/numpy column (TODO confirm).
df_metrics_per_class_instr_resampled['num_labelled_samples'] = [
    (xbt_labelled['instrument'] == i1).sum()
    for i1 in df_metrics_per_class_instr_resampled['instrument']
]
# Show the best-populated classes first.
df_metrics_per_class_instr_resampled = df_metrics_per_class_instr_resampled.sort_values(axis='rows', by='num_labelled_samples', ascending=False)

In [92]:
# Average metrics per split. Values are looked up by their explicit keys so each row is
# guaranteed to match its 'target' label, whatever order the metric cells were run in.
df_metrics_avg = pandas.DataFrame.from_dict({
    'target': ['instrument_train', 'instrument_test', 'instrument_unseen'],
    **{
        metric_name: [
            metrics_avg_all[f'{metric_name}_instr_{split1}']
            for split1 in ('train', 'test', 'unseen')
        ]
        for metric_name in ('precision', 'recall', 'f1', 'accuracy')
    },
})

In [93]:
# Average metrics per split for the resampled classifier. Values are looked up by their
# explicit keys so each row is guaranteed to match its 'target' label, whatever order the
# metric cells were run in.
df_metrics_avg_resampled = pandas.DataFrame.from_dict({
    'target': ['instrument_train', 'instrument_test', 'instrument_unseen'],
    **{
        metric_name: [
            metrics_avg_all_resampled[f'{metric_name}_instr_{split1}']
            for split1 in ('train', 'test', 'unseen')
        ]
        for metric_name in ('precision', 'recall', 'f1', 'accuracy')
    },
})

In [94]:
df_metrics_avg

Unnamed: 0,target,precision,recall,f1,accuracy
0,instrument_train,0.972756,0.972676,0.972387,0.972676
1,instrument_test,0.954394,0.954786,0.954303,0.954786
2,instrument_unseen,0.94478,0.945181,0.944537,0.945181


In [95]:
df_metrics_avg_resampled

Unnamed: 0,target,precision,recall,f1,accuracy
0,instrument_train,0.986032,0.986103,0.986029,0.986103
1,instrument_test,0.950546,0.915653,0.931327,0.915653
2,instrument_unseen,0.924134,0.90735,0.913391,0.90735


In [107]:
help(calc_metric_subset)

Help on function calc_metric_subset in module __main__:

calc_metric_subset(xbt_subset, clf1, filter_dict, inputs, target_feature, metric_func, metric_args_dict)



In [115]:
metrics_dict.keys()

Index(['recall_instr_train', 'recall_instr_test', 'recall_instr_unseen',
       'precision_instr_train', 'precision_instr_test',
       'precision_instr_unseen', 'accuracy_instr_train', 'accuracy_instr_test',
       'accuracy_instr_unseen', 'f1_instr_train', 'f1_instr_test',
       'f1_instr_unseen'],
      dtype='object')

In [117]:
calc_metric_subset(
    xbt_subset=xbt_unseen_part, 
    clf1=clf_dt_instr1, 
    filter_dict={target_feature_name: metrics_per_class_all[target_feature_name][2]}, 
    inputs=input_feature_names,
    target_feature=target_feature_name,
    **metrics_defs_dict['recall'])

0.9887157287157288

In [119]:
# Per-class metrics on the unseen profiles that were sampled individually (partial cruises),
# followed by a column holding the class names.
df1 = pandas.DataFrame({
    **{
        metric_name: [
            calc_metric_subset(
                xbt_unseen_part,
                clf_dt_instr1,
                {target_feature_name: fn1},
                input_feature_names,
                target_feature_name,
                **metric1,
            )
            for fn1 in metrics_per_class_all[target_feature_name]
        ]
        for metric_name, metric1 in metrics_defs_dict.items()
    },
    target_feature_name: list(metrics_per_class_all[target_feature_name]),
})
df1

Unnamed: 0,recall,precision,accuracy,f1,instrument
0,0.9,0.9,0.9,0.9,XBT: AXBT (TSK - TSURUMI SEIKI Co.)
1,0.815385,0.815385,0.815385,0.815385,XBT: AXBT 536 (SPARTON)
2,0.988716,0.988716,0.988716,0.988716,XBT: DEEP BLUE (SIPPICAN)
3,0.862745,0.862745,0.862745,0.862745,XBT: DEEP BLUE (TSK - TSURUMI SEIKI Co.)
4,0.872549,0.872549,0.872549,0.872549,XBT: FAST DEEP (SIPPICAN)
5,0.0,0.0,0.0,0.0,XBT: SUBMARINE-LAUNCHED EXPENDABLE BATHYTHERMO...
6,0.920711,0.920711,0.920711,0.920711,XBT: T10 (SIPPICAN)
7,0.75,0.75,0.75,0.75,XBT: T10 (TSK - TSURUMI SEIKI Co.)
8,0.576923,0.576923,0.576923,0.576923,XBT: T11 (SIPPICAN)
9,0.984358,0.984358,0.984358,0.984358,XBT: T4 (SIPPICAN)


In [123]:
# Per-class metrics on the unseen profiles that belong to wholly held-out cruises,
# followed by a column holding the class names.
df2 = pandas.DataFrame({
    **{
        metric_name: [
            calc_metric_subset(
                xbt_unseen_whole,
                clf_dt_instr1,
                {target_feature_name: fn1},
                input_feature_names,
                target_feature_name,
                **metric1,
            )
            for fn1 in metrics_per_class_all[target_feature_name]
        ]
        for metric_name, metric1 in metrics_defs_dict.items()
    },
    target_feature_name: list(metrics_per_class_all[target_feature_name]),
})
df2

Unnamed: 0,recall,precision,accuracy,f1,instrument
0,0.0,0.0,0.0,0.0,XBT: AXBT (TSK - TSURUMI SEIKI Co.)
1,0.896907,0.896907,0.896907,0.896907,XBT: AXBT 536 (SPARTON)
2,0.987921,0.987921,0.987921,0.987921,XBT: DEEP BLUE (SIPPICAN)
3,0.952663,0.952663,0.952663,0.952663,XBT: DEEP BLUE (TSK - TSURUMI SEIKI Co.)
4,0.948454,0.948454,0.948454,0.948454,XBT: FAST DEEP (SIPPICAN)
5,0.0,0.0,0.0,0.0,XBT: SUBMARINE-LAUNCHED EXPENDABLE BATHYTHERMO...
6,0.939779,0.939779,0.939779,0.939779,XBT: T10 (SIPPICAN)
7,1.0,1.0,1.0,1.0,XBT: T10 (TSK - TSURUMI SEIKI Co.)
8,0.255639,0.255639,0.255639,0.255639,XBT: T11 (SIPPICAN)
9,0.983483,0.983483,0.983483,0.983483,XBT: T4 (SIPPICAN)


# Classification result plots

The plots below show the results for the whole XBT dataset. We see that the DT classifier performs well on the training data, but does not seem to generalise well. This is especially true, as one would expect, for classes with very little support in the training dataset.

In [None]:
# Four stacked bar charts for the classifier trained on the imbalanced data, one bar group
# per instrument with train/test/unseen side by side: recall, precision, accuracy, support.
fig_results_all_dt = matplotlib.pyplot.figure('xbt_results_all_dt', figsize=(16,40))
fig_results_all_dt.set_tight_layout(tight=True)
# Panel 1: per-class recall.
axis_instr_recall = fig_results_all_dt.add_subplot(4,1,1)
_ = df_metrics_per_class_instr.plot.bar(x='instrument', y=['recall_instr_train','recall_instr_test','recall_instr_unseen'],ax=axis_instr_recall)
# Panel 2: per-class precision.
axis_instr_precision = fig_results_all_dt.add_subplot(4,1,2)
_ = df_metrics_per_class_instr.plot.bar(x='instrument', y=['precision_instr_train','precision_instr_test','precision_instr_unseen'], ax=axis_instr_precision)
# Panel 3: per-class accuracy.
axis_instr_accuracy = fig_results_all_dt.add_subplot(4,1,3)
_ = df_metrics_per_class_instr.plot.bar(x='instrument', y=['accuracy_instr_train','accuracy_instr_test','accuracy_instr_unseen'], ax=axis_instr_accuracy)
# Panel 4: per-class support (number of profiles).
axis_instr_support = fig_results_all_dt.add_subplot(4,1,4)
_ = df_metrics_per_class_instr.plot.bar(x='instrument',y=['support_instr_train', 'support_instr_test', 'support_instr_unseen'], ax=axis_instr_support)

In [None]:
# Three stacked bar charts for the classifier trained on the resampled data: recall,
# precision and support per instrument (no accuracy panel here).
# NOTE(review): this rebinds fig_results_all_dt, so the handle to the figure named
# 'xbt_results_all_dt' is no longer reachable through that variable.
fig_results_all_dt = matplotlib.pyplot.figure('xbt_results_all_dt_resampled', figsize=(16,30))
# fig_results_all_dt.set_tight_layout(tight=True)
# Panel 1: per-class recall.
axis_instr_metrics = fig_results_all_dt.add_subplot(3,1,1)
_ = df_metrics_per_class_instr_resampled.plot.bar(x='instrument', y=['recall_instr_train','recall_instr_test','recall_instr_unseen'],ax=axis_instr_metrics)
# Panel 2: per-class precision.
axis_instr_precision = fig_results_all_dt.add_subplot(3,1,2)
_ = df_metrics_per_class_instr_resampled.plot.bar(x='instrument', y=['precision_instr_train','precision_instr_test','precision_instr_unseen'],ax=axis_instr_precision)
# Panel 3: per-class support (number of profiles).
axis_instr_support = fig_results_all_dt.add_subplot(3,1,3)
_ = df_metrics_per_class_instr_resampled.plot.bar(x='instrument',y=['support_instr_train', 'support_instr_test', 'support_instr_unseen'], ax=axis_instr_support)

In [None]:
# Side-by-side comparison, per instrument class, of the classifier trained on the imbalanced
# data and the one trained on the resampled data. The two test columns hold recall.
# The accuracy column names were previously misspelled as "acuracy".
pandas.DataFrame({'instrument': df_metrics_per_class_instr['instrument'],
                  'support_train': df_metrics_per_class_instr['support_instr_train'],
                  'imbalanced_test': df_metrics_per_class_instr['recall_instr_test'],
                  'resampled_test': df_metrics_per_class_instr_resampled['recall_instr_test'],
                  'imbalanced_unseen_recall': df_metrics_per_class_instr['recall_instr_unseen'],
                  'resampled_unseen_recall': df_metrics_per_class_instr_resampled['recall_instr_unseen'],
                  'imbalanced_unseen_precision': df_metrics_per_class_instr['precision_instr_unseen'],
                  'resampled_unseen_precision': df_metrics_per_class_instr_resampled['precision_instr_unseen'],
                  'imbalanced_unseen_accuracy': df_metrics_per_class_instr['accuracy_instr_unseen'],
                  'resampled_unseen_accuracy': df_metrics_per_class_instr_resampled['accuracy_instr_unseen'],
                 })

In [None]:
df_metrics_avg.plot.bar(figsize=(18,12), x='target', y='recall')

## Classification results

The contents of the XBT dataset vary over the time period, so previous papers have looked at classification accuracy (recall) year by year to evaluate how performance varies with different distributions of probe types.

To do this we apply the classifier to the train and test data for each year separately and calculate the metrics year by year.

In [None]:
def score_year(xbt_df, year, clf, input_features, target_feature):
    """Score a classifier on the observations belonging to a single year.

    Args:
        xbt_df: dataset object providing ``filter_obs``; the filtered result
            must expose ``shape``, ``filter_features`` and ``get_ml_dataset``
            (presumably an ``XbtDataset`` -- confirm against callers).
        year: value matched against the ``'year'`` key when filtering.
        clf: fitted estimator exposing ``predict``.
        input_features: names of the features fed to ``clf``.
        target_feature: name of the feature holding the true labels.

    Returns:
        A five-element list ``[precision, recall, fscore, support, accuracy]``.
        The first four come from
        ``sklearn.metrics.precision_recall_fscore_support`` with
        ``average='micro'`` and the last from
        ``sklearn.metrics.accuracy_score``. When the year has no
        observations, a list of zeros with the same five slots is returned.
    """
    xbt_year = xbt_df.filter_obs({'year': year})
    if xbt_year.shape[0] == 0:
        # Keep the same five slots as the populated case so that callers
        # reading index 4 (accuracy) do not fail for years without data.
        return [0.0, 0.0, 0.0, 0, 0.0]
    X_year = xbt_year.filter_features(input_features).get_ml_dataset()[0]
    y_year = xbt_year.filter_features([target_feature]).get_ml_dataset()[0]
    y_res_year = clf.predict(X_year)
    metric_year = list(sklearn.metrics.precision_recall_fscore_support(
        y_year, y_res_year, average='micro'))
    # Accuracy is appended after the four precision/recall/fscore/support slots.
    metric_year.append(sklearn.metrics.accuracy_score(y_year, y_res_year))
    return metric_year

In [None]:
# Containers for the per-year scores, keyed by year: one for the original
# classifier and one for the classifier trained on the resampled data.
results_by_year, results_by_year_resampled = {}, {}

In [None]:
# Score both classifiers on the train, test and unseen splits for each year in
# the configured range (the end of the range is exclusive).
for year in range(*env_date_ranges[environment]):
    results_by_year[year] = {
        metric_key: score_year(split_data, year, clf_dt_instr1, input_feature_names, 'instrument')
        for metric_key, split_data in (
            ('metric_train_instr', xbt_train_all),
            ('metric_test_instr', xbt_test_all),
            ('metric_unseen_instr', xbt_unseen),
        )
    }
    # The resampled classifier uses its own train/test splits but the same
    # unseen data.
    results_by_year_resampled[year] = {
        metric_key: score_year(split_data, year, clf_dt_instr_resampled1, input_feature_names, 'instrument')
        for metric_key, split_data in (
            ('metric_train_instr', xbt_resampled_train_all),
            ('metric_test_instr', xbt_resampled_test_all),
            ('metric_unseen_instr', xbt_unseen),
        )
    }


In [None]:
# Element 1 of every score_year result is the recall; collect it per split
# into one row per year.
recall_by_year = pandas.DataFrame.from_dict({
    'year': list(results_by_year),
    'recall_train_instr': [scores['metric_train_instr'][1] for scores in results_by_year.values()],
    'recall_test_instr': [scores['metric_test_instr'][1] for scores in results_by_year.values()],
    'recall_unseen_instr': [scores['metric_unseen_instr'][1] for scores in results_by_year.values()],
})
# Same table for the classifier trained on the resampled data.
recall_by_year_resampled = pandas.DataFrame.from_dict({
    'year': list(results_by_year_resampled),
    'recall_train_instr': [scores['metric_train_instr'][1] for scores in results_by_year_resampled.values()],
    'recall_test_instr': [scores['metric_test_instr'][1] for scores in results_by_year_resampled.values()],
    'recall_unseen_instr': [scores['metric_unseen_instr'][1] for scores in results_by_year_resampled.values()],
})

In [None]:
# Element 4 of every score_year result is the accuracy; collect it per split
# into one row per year.
accuracy_by_year = pandas.DataFrame.from_dict({
    'year': list(results_by_year),
    'accuracy_train_instr': [scores['metric_train_instr'][4] for scores in results_by_year.values()],
    'accuracy_test_instr': [scores['metric_test_instr'][4] for scores in results_by_year.values()],
    'accuracy_unseen_instr': [scores['metric_unseen_instr'][4] for scores in results_by_year.values()],
})
# Same table for the classifier trained on the resampled data.
accuracy_by_year_resampled = pandas.DataFrame.from_dict({
    'year': list(results_by_year_resampled),
    'accuracy_train_instr': [scores['metric_train_instr'][4] for scores in results_by_year_resampled.values()],
    'accuracy_test_instr': [scores['metric_test_instr'][4] for scores in results_by_year_resampled.values()],
    'accuracy_unseen_instr': [scores['metric_unseen_instr'][4] for scores in results_by_year_resampled.values()],
})

In [None]:
# Encoder registered for the 'instrument' feature of the labelled dataset.
# NOTE(review): this reads the private _feature_encoders attribute of
# xbt_labelled -- confirm there is no public accessor to use instead.
instr_encoder = xbt_labelled._feature_encoders['instrument']

In [None]:
# Score the iMeta instrument labels against the labelled data year by year,
# over the same (end-exclusive) year range.
imeta_results = []
for year in range(*env_date_ranges[environment]):
    # Boolean mask selecting this year's rows of the labelled data frame.
    in_year = xbt_labelled.xbt_df.year == year
    y_imeta_instr = instr_encoder.transform(
        pandas.DataFrame(imeta_instrument[in_year]))
    xbt_instr1 = instr_encoder.transform(
        pandas.DataFrame(xbt_labelled.xbt_df[in_year].instrument))
    im_pr_instr, im_rec_instr, im_f1_instr, im_sup_instr = (
        sklearn.metrics.precision_recall_fscore_support(
            xbt_instr1, y_imeta_instr, average='micro'))
    im_acc_instr = sklearn.metrics.accuracy_score(xbt_instr1, y_imeta_instr)
    imeta_results.append({
        'year': year,
        'imeta_instr_recall': im_rec_instr,
        'imeta_instr_precision': im_pr_instr,
        'imeta_instr_f1': im_f1_instr,
        'imeta_instr_accuracy': im_acc_instr,
    })

In [None]:
# Turn the list of per-year iMeta score dicts into a data frame with one row
# per year.
imeta_res_df = pandas.DataFrame.from_records(imeta_results)

In [None]:
# Number of labelled samples per year, computed once and joined in below.
samples_per_year = xbt_labelled['year'].value_counts(sort=False)
# Join the recall, accuracy and iMeta tables and the sample counts on their
# shared columns.
results_df = (
    recall_by_year
    .merge(accuracy_by_year)
    .merge(imeta_res_df)
    .merge(pandas.DataFrame.from_dict({
        'year': samples_per_year.index,
        'num_samples': samples_per_year.values,
    }))
)

In [None]:
# Number of labelled samples per year, computed once and joined in below.
samples_per_year = xbt_labelled['year'].value_counts(sort=False)
# Join the resampled recall and accuracy tables with the iMeta table and the
# sample counts on their shared columns.
results_df_resampled = (
    recall_by_year_resampled
    .merge(accuracy_by_year_resampled)
    .merge(imeta_res_df)
    .merge(pandas.DataFrame.from_dict({
        'year': samples_per_year.index,
        'num_samples': samples_per_year.values,
    }))
)

In [None]:
# Recall per year as two side-by-side panels: the original classifier on the
# left and the one trained on the resampled data on the right. Each panel also
# shows the iMeta recall.
recall_columns = ['recall_train_instr', 'recall_test_instr', 'recall_unseen_instr', 'imeta_instr_recall']
recall_panels = [
    (results_df, 'XBT instrument recall results'),
    (results_df_resampled, 'XBT instrument recall results - resampled train set'),
]
fig_model_recall_results = matplotlib.pyplot.figure('xbt_model_recall', figsize=(20, 10))
for panel_number, (panel_df, panel_title) in enumerate(recall_panels, start=1):
    ax_instr_recall_results = fig_model_recall_results.add_subplot(1, 2, panel_number, title=panel_title)
    _ = panel_df.plot.line(x='year', y=recall_columns, ax=ax_instr_recall_results)

In [None]:
    fig_model_acc_results = matplotlib.pyplot.figure('xbt_model_accuracy', figsize=(20,10))
ax_instr_acc_results = fig_model_acc_results.add_subplot(1,2,1, title='XBT instrument accuracy results')
_ = results_df.plot.line(x='year',y=['accuracy_train_instr','accuracy_test_instr', 'accuracy_unseen_instr', 'imeta_instr_accuracy'], ax=ax_instr_acc_results)
ax_instr_acc_results = fig_model_acc_results.add_subplot(1,2,2, title='XBT instrument accuracy results - resampled train set')
_ = results_df_resampled.plot.line(x='year',y=['accuracy_train_instr','accuracy_test_instr', 'accuracy_unseen_instr', 'imeta_instr_accuracy'], ax=ax_instr_acc_results)

In [None]:
# Per-instrument recall on the unseen data, comparing the classifier trained
# on the imbalanced training set with the one trained on the resampled set.
fig1 = matplotlib.pyplot.figure('recall_imb_vs_res', figsize=(16,10))
ax1 = fig1.add_subplot(1,1,1, title='comparison of recall for imbalanced and resampled training set.')
df1 = pandas.DataFrame({
    'instrument': df_metrics_per_class_instr['instrument'],
    # The column labels say "unseen", so read the unseen-data recall columns.
    'recall_unseen_imbalanced': df_metrics_per_class_instr['recall_instr_unseen'],
    'recall_unseen_resampled': df_metrics_per_class_instr_resampled['recall_instr_unseen'],
})
_ = df1.plot.bar(x='instrument', ax=ax1)
# NOTE(review): the output path is hard-coded and user-specific; it must
# already exist on the machine running the notebook.
fig1.savefig('/data/users/shaddad/xbt-data/plots/recall_dt_imbalance_vs_resampled.png')

In [None]:
# Per-instrument precision on the unseen data, comparing the classifier
# trained on the imbalanced training set with the one trained on the
# resampled set.
fig1 = matplotlib.pyplot.figure('precision_imb_vs_res', figsize=(16,10))
ax1 = fig1.add_subplot(1,1,1, title='comparison of precision for imbalanced and resampled training set.')
df1 = pandas.DataFrame({
    'instrument': df_metrics_per_class_instr['instrument'],
    # Precision columns: the labels and the output file both refer to
    # precision on the unseen data, not recall on the training data.
    'precision_unseen_imbalanced': df_metrics_per_class_instr['precision_instr_unseen'],
    'precision_unseen_resampled': df_metrics_per_class_instr_resampled['precision_instr_unseen'],
})
_ = df1.plot.bar(x='instrument', ax=ax1)
# NOTE(review): the output path is hard-coded and user-specific; it must
# already exist on the machine running the notebook.
fig1.savefig('/data/users/shaddad/xbt-data/plots/precision_dt_imbalance_vs_resampled.png')