In [None]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

Helper functions and imports

In [None]:
from copy import deepcopy
from datetime import datetime
import json
from pathlib import Path
from shutil import copy as shutil_copy

import numpy as np

In [None]:
from multimodal_molecules.models import Results, validate

In [None]:
def write_jobs(jobs):
    """Write one self-contained job directory per configuration.

    A directory ``jobs/<yy_mm_dd_HH_MM_SS>_jobs`` is created relative to the
    current working directory. Every entry of ``jobs`` gets its own
    zero-padded subdirectory (``000``, ``001``, ...) holding:

    * ``config.json`` -- the job dictionary, dumped with sorted keys;
    * ``run.py`` and ``submit.sbatch`` -- copied from the current working
      directory.

    Parameters
    ----------
    jobs : iterable of dict
        JSON-serializable job configurations.

    Raises
    ------
    FileExistsError
        If the timestamped directory or one of its job subdirectories
        already exists (nothing is ever overwritten).
    FileNotFoundError
        If ``run.py`` or ``submit.sbatch`` is missing from the current
        working directory.
    """
    # The original line called the undefined name `datetimeResultsw()`, which
    # raised NameError on every call; the intended call is `datetime.now()`.
    now = datetime.now().strftime("%y_%m_%d_%H_%M_%S_jobs")
    target_directory = Path("jobs") / now
    print(f"Writing jobs to {target_directory}")

    # exist_ok=False: refuse to mix a new batch into an existing directory.
    target_directory.mkdir(exist_ok=False, parents=True)
    for ii, job in enumerate(jobs):
        path = target_directory / f"{ii:03}"
        path.mkdir(exist_ok=False, parents=False)
        with open(path / "config.json", 'w') as f:
            json.dump(job, f, indent=4, sort_keys=True)
        # Each job directory carries its own copy of the runner and the
        # submission script.
        shutil_copy("run.py", path / "run.py")
        shutil_copy("submit.sbatch", path / "submit.sbatch")

# Pass 1

We'll just mess around with `Secondary_alcohol` for a bit to start.

`Results` keyword args

In [None]:
# Grid scanned in this pass. Both quantities end up inside `rf_kwargs`.
ccp_alpha_values = [1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
max_features_values = [0.3, 0.4, 0.5, 0.6, 0.7]

# Base configuration shared by every job of this pass; the grid values are
# added to a copy of it when the job list is built.
d = dict(
    # Local alternative: "/Users/mc/GitHub/AIMM/multimodal-molecules"
    module_path="/sdcc/u/mcarbone/Github/multimodal-molecules",
    results_kwargs=dict(
        conditions="C-XANES,O-XANES",
        # Local alternatives for the two data files:
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_xanes.pkl"
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_index.csv"
        xanes_data_name="221205_xanes.pkl",
        index_data_name="221205_index.csv",
        specific_functional_groups=["Secondary_alcohol"],
        val_size=0.1,
        test_size=0.1,
        rf_kwargs=dict(max_samples=0.9),
        random_state=1234,
    ),
)


`Results.run_experiment` keyword args

In [None]:
# "results" is interpreted relative to the location the script is run from,
# and that location is unique.
d["run_experiments_kwargs"] = dict(
    input_data_directory="/hpcgpfs01/work/cfn/mcarbone/multimodal_molecules_data/data/221205",
    output_data_directory="results",
    n_jobs=12,
    compute_feature_importance=False,
)

In [None]:
# Expand the grid: one independent deep copy of the base configuration per
# (ccp_alpha, max_features) pair, so jobs never share nested dictionaries.
jobs = []
for ccp_alpha in ccp_alpha_values:
    for max_features in max_features_values:
        job = deepcopy(d)
        rf_kwargs = job["results_kwargs"]["rf_kwargs"]
        rf_kwargs["ccp_alpha"] = ccp_alpha
        rf_kwargs["max_features"] = max_features
        jobs.append(job)

In [None]:
# Create one job directory on disk per (ccp_alpha, max_features) combination.
write_jobs(jobs)

## Analysis

Results are in `results/231412-hp-tuning-Secondary_alcohol`.

In [None]:
# Load the result file of every job directory of this pass.
# The iteration variable is deliberately not called `d`: that name holds the
# job-configuration dict, and rebinding it to a Path breaks re-running the
# job-generation cells. Entries that are not directories are skipped.
results = [
    Results.from_file(job_dir / "results" / "C-XANES_O-XANES.json")
    for job_dir in Path("results/231412-hp-tuning-Secondary_alcohol").iterdir()
    if job_dir.is_dir()
]

In [None]:
# Map each grid value to its row / column position in the result arrays.
ccp_alpha_dict = dict(zip(ccp_alpha_values, range(len(ccp_alpha_values))))
max_features_dict = dict(zip(max_features_values, range(len(max_features_values))))

In [None]:
# One (ccp_alpha x max_features) grid per feature set, zero-initialised.
grid_shape = (len(ccp_alpha_values), len(max_features_values))
validation_losses_C_only = np.zeros(shape=grid_shape)
validation_losses_O_only = np.zeros(shape=grid_shape)
validation_losses_CO = np.zeros(shape=grid_shape)

In [None]:
# Place each run's validation score at its (ccp_alpha, max_features) grid cell.
# NOTE: despite the `validation_losses_*` names, the stored values are
# accuracies.
# NOTE(review): this pass reads "val_accuracy" while the later passes read
# "val_balanced_accuracy" -- confirm that the difference is intended.
for result in results:
    rf_kwargs = result._rf_kwargs
    ii = ccp_alpha_dict[rf_kwargs["ccp_alpha"]]
    jj = max_features_dict[rf_kwargs["max_features"]]

    report = result.report
    validation_losses_C_only[ii, jj] = report['C-XANES-Secondary_alcohol']["val_accuracy"]
    validation_losses_O_only[ii, jj] = report['O-XANES-Secondary_alcohol']["val_accuracy"]
    validation_losses_CO[ii, jj] = report['C-XANES_O-XANES-Secondary_alcohol']["val_accuracy"]

In [None]:
validation_losses_CO

In [None]:
validation_losses_O_only

In [None]:
validation_losses_C_only

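It looks like setting `ccp_alpha = 0` (i.e., no cost-complexity pruning) is the way to go, and that setting `max_features = 0.3` seems to be just fine. Note that `0` itself was not part of the scanned grid; the smallest value tried was `1e-7`.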

# Pass 2

Let's also quickly check the split criterion against the number of estimators.

In [None]:
# Grid scanned in this pass. Both quantities end up inside `rf_kwargs`.
criterion_values = ["gini", "entropy", "log_loss"]
n_estimators_values = [50, 100, 200, 300]

# Base configuration shared by every job of this pass; the grid values are
# added to a copy of it when the job list is built.
d = dict(
    # Local alternative: "/Users/mc/GitHub/AIMM/multimodal-molecules"
    module_path="/sdcc/u/mcarbone/Github/multimodal-molecules",
    results_kwargs=dict(
        conditions="C-XANES,O-XANES",
        # Local alternatives for the two data files:
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_xanes.pkl"
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_index.csv"
        xanes_data_name="221205_xanes.pkl",
        index_data_name="221205_index.csv",
        specific_functional_groups=["Secondary_alcohol"],
        val_size=0.1,
        test_size=0.1,
        rf_kwargs=dict(max_samples=0.9),
        random_state=1234,
    ),
)


`Results.run_experiment` keyword args

In [None]:
# "results" is interpreted relative to the location the script is run from,
# and that location is unique.
d["run_experiments_kwargs"] = dict(
    input_data_directory="/hpcgpfs01/work/cfn/mcarbone/multimodal_molecules_data/data/221205",
    output_data_directory="results",
    n_jobs=12,
    compute_feature_importance=False,
)

In [None]:
# Expand the grid: one independent deep copy of the base configuration per
# (criterion, n_estimators) pair, so jobs never share nested dictionaries.
jobs = []
for criterion in criterion_values:
    for n_estimators in n_estimators_values:
        job = deepcopy(d)
        rf_kwargs = job["results_kwargs"]["rf_kwargs"]
        rf_kwargs["criterion"] = criterion
        rf_kwargs["n_estimators"] = n_estimators
        jobs.append(job)

In [None]:
# Create one job directory on disk per (criterion, n_estimators) combination.
write_jobs(jobs)

## Analysis

Results are in `results/231413-hp-tuning-Secondary_alcohol`.

In [None]:
# Load the result file of every job directory of this pass.
# The iteration variable is deliberately not called `d`: that name holds the
# job-configuration dict, and rebinding it to a Path breaks re-running the
# job-generation cells. Entries that are not directories are skipped.
results = [
    Results.from_file(job_dir / "results" / "C-XANES_O-XANES.json")
    for job_dir in Path("results/231413-hp-tuning-Secondary_alcohol").iterdir()
    if job_dir.is_dir()
]

In [None]:
# Map each grid value to its row / column position in the result arrays.
criterion_dict = dict(zip(criterion_values, range(len(criterion_values))))
n_estimators_dict = dict(zip(n_estimators_values, range(len(n_estimators_values))))

In [None]:
# One (criterion x n_estimators) grid per feature set, zero-initialised.
grid_shape = (len(criterion_values), len(n_estimators_values))
validation_losses_C_only = np.zeros(shape=grid_shape)
validation_losses_O_only = np.zeros(shape=grid_shape)
validation_losses_CO = np.zeros(shape=grid_shape)

In [None]:
# Place each run's validation score at its (criterion, n_estimators) grid cell.
# NOTE: despite the `validation_losses_*` names, the stored values are
# balanced accuracies.
for result in results:
    rf_kwargs = result._rf_kwargs
    ii = criterion_dict[rf_kwargs["criterion"]]
    jj = n_estimators_dict[rf_kwargs["n_estimators"]]

    report = result.report
    validation_losses_C_only[ii, jj] = report['C-XANES-Secondary_alcohol']["val_balanced_accuracy"]
    validation_losses_O_only[ii, jj] = report['O-XANES-Secondary_alcohol']["val_balanced_accuracy"]
    validation_losses_CO[ii, jj] = report['C-XANES_O-XANES-Secondary_alcohol']["val_balanced_accuracy"]

In [None]:
validation_losses_CO

In [None]:
validation_losses_O_only

In [None]:
validation_losses_C_only

Looks like the more estimators we have, the better overall. We can try a few more.

# Pass 3

Let's try a few more values for the number of estimators, keeping the criterion fixed to `gini`.

In [None]:
# Grid scanned in this pass. Both quantities end up inside `rf_kwargs`.
criterion_values = ["gini"]
n_estimators_values = [200, 300, 400, 500, 600, 700, 800, 900, 1000]

# Base configuration shared by every job of this pass; the grid values are
# added to a copy of it when the job list is built.
d = dict(
    # Local alternative: "/Users/mc/GitHub/AIMM/multimodal-molecules"
    module_path="/sdcc/u/mcarbone/Github/multimodal-molecules",
    results_kwargs=dict(
        conditions="C-XANES,O-XANES",
        # Local alternatives for the two data files:
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_xanes.pkl"
        #   "/Users/mc/GitHub/AIMM/multimodal-molecules/data/221205/221205_index.csv"
        xanes_data_name="221205_xanes.pkl",
        index_data_name="221205_index.csv",
        specific_functional_groups=["Secondary_alcohol"],
        val_size=0.1,
        test_size=0.1,
        rf_kwargs=dict(max_samples=0.9),
        random_state=1234,
    ),
)


`Results.run_experiment` keyword args

In [None]:
# "results" is interpreted relative to the location the script is run from,
# and that location is unique.
d["run_experiments_kwargs"] = dict(
    input_data_directory="/hpcgpfs01/work/cfn/mcarbone/multimodal_molecules_data/data/221205",
    output_data_directory="results",
    n_jobs=12,
    compute_feature_importance=False,
)

In [None]:
# Expand the grid: one independent deep copy of the base configuration per
# (criterion, n_estimators) pair, so jobs never share nested dictionaries.
jobs = []
for criterion in criterion_values:
    for n_estimators in n_estimators_values:
        job = deepcopy(d)
        rf_kwargs = job["results_kwargs"]["rf_kwargs"]
        rf_kwargs["criterion"] = criterion
        rf_kwargs["n_estimators"] = n_estimators
        jobs.append(job)

In [None]:
# Create one job directory on disk per (criterion, n_estimators) combination.
write_jobs(jobs)

## Analysis

Results are in `results/231413-hp-tuning-Secondary_alcohol-2`.

In [None]:
# Load the result file of every job directory of this pass, in sorted
# directory order, so the lists built from `results` have a reproducible order.
# The iteration variable is deliberately not called `d`: that name holds the
# job-configuration dict, and rebinding it to a Path breaks re-running the
# job-generation cells. Entries that are not directories are skipped.
results = [
    Results.from_file(job_dir / "results" / "C-XANES_O-XANES.json")
    for job_dir in sorted(Path("results/231413-hp-tuning-Secondary_alcohol-2").iterdir())
    if job_dir.is_dir()
]

In [None]:
# Start from three separate, empty score lists (one per feature set).
validation_losses_C_only, validation_losses_O_only, validation_losses_CO = [], [], []

In [None]:
# Collect the validation balanced accuracy of every run, in the order of
# `results`. The lists are rebuilt from scratch rather than appended to, so
# re-running this cell cannot duplicate entries.
# NOTE: despite the `validation_losses_*` names, the stored values are
# balanced accuracies.
validation_losses_C_only = [
    res.report['C-XANES-Secondary_alcohol']["val_balanced_accuracy"]
    for res in results
]
validation_losses_O_only = [
    res.report['O-XANES-Secondary_alcohol']["val_balanced_accuracy"]
    for res in results
]
validation_losses_CO = [
    res.report['C-XANES_O-XANES-Secondary_alcohol']["val_balanced_accuracy"]
    for res in results
]

In [None]:
validation_losses_CO

In [None]:
validation_losses_O_only

In [None]:
validation_losses_C_only

Sticking with 300 estimators seems to be reasonable here.

## Tree-weighted

In [None]:
# Use the first loaded result as the working example for this section.
result = results[0]

In [None]:
# Model stored under the C-XANES / Secondary_alcohol key; it is used below
# through `.predict` and `.estimators_`.
rf = result.models["C-XANES-Secondary_alcohol"]

In [None]:
# Load the data through the result object; the path is relative to the
# notebook's working directory.
data = result.get_data("data/221205/")

In [None]:
result.models

In [None]:
# Unpack the three index sets stored on the result (train / validation / test).
train_idx, valid_idx, test_idx = result.train_val_test_indexes

In [None]:
# Validation rows of the C-XANES array.
X_val = data["C-XANES"][valid_idx, :]

In [None]:
# Matching validation targets for the Secondary_alcohol functional group.
y_val = data["FG"]["Secondary_alcohol"][valid_idx]

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Baseline: balanced accuracy of the model's own prediction on the validation set.
balanced_accuracy_score(y_val, rf.predict(X_val))

In [None]:
def class_balanced_weighted_accuracy(rf, X_val, y_val):
    """Average the individual estimators' predictions, weighted by accuracy.

    Every estimator in ``rf.estimators_`` predicts on ``X_val`` and is scored
    with ``balanced_accuracy_score`` against ``y_val``. The returned value is,
    for each sample, the sum of the estimators' predictions weighted by each
    estimator's share of the total balanced accuracy.

    Despite the name, this returns per-sample weighted predictions, not an
    accuracy.

    NOTE(review): the weights are derived from the same ``X_val`` / ``y_val``
    that are passed in, so scoring the output against ``y_val`` is optimistic.
    """
    # One row of predictions per estimator.
    tree_votes = np.array([tree.predict(X_val) for tree in rf.estimators_])

    # Column vector of per-estimator balanced accuracies, so that it
    # broadcasts across the samples of each row.
    tree_scores = np.array(
        [balanced_accuracy_score(y_val, votes) for votes in tree_votes]
    )[:, np.newaxis]
    score_sum = tree_scores.sum()

    # Weight by the accuracy share, then collapse over the estimators.
    return np.sum(tree_votes * tree_scores / score_sum, axis=0)

In [None]:
# Accuracy-weighted average of the individual estimators' predictions, one
# value per validation sample.
preds = class_balanced_weighted_accuracy(rf, X_val, y_val)

In [None]:
# Round the weighted averages to the nearest integer label and score them the
# same way as the baseline above.
balanced_accuracy_score(y_val, np.round(preds))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# `scores` is not defined anywhere in this notebook, so the original call
# raised NameError on a fresh kernel. The accuracy-weighted per-sample values
# computed above are stored in `preds`; plot their distribution instead.
plt.hist(preds)
plt.xlabel("Accuracy-weighted prediction")
plt.ylabel("Number of validation samples")
plt.show()