In [None]:
import enum
import getpass
import hashlib
import json
import os
import pickle
import random
import uuid
from datetime import datetime
from statistics import mean
from urllib.parse import quote

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.metrics import plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import (create_engine, MetaData, Table, Column, insert, func,
                        Integer, String, Numeric, DateTime, Enum)
from sqlalchemy.dialects.postgresql import UUID, JSONB


# Render DataFrames without truncation: no limit on cell width, column count or row count.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)


import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Registry of pre-configured estimator instances, keyed by a short model code.
# Seeded estimators all share the same random state.
_SEED = 42
clf_fcns = dict(
    DC=DummyClassifier(),
    RF=RandomForestClassifier(n_jobs=-1, random_state=_SEED),
    ET=ExtraTreesClassifier(n_jobs=-1, criterion='entropy', random_state=_SEED),
    # AdaBoost over depth-1 decision trees (stumps)
    AB=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", random_state=_SEED),
    LR=LogisticRegression(random_state=_SEED, solver='liblinear'),
    SVM=svm.SVC(kernel='linear', probability=True, random_state=_SEED),
    GB=GradientBoostingClassifier(random_state=_SEED),
    NB=GaussianNB(),
    DT=DecisionTreeClassifier(random_state=_SEED),
    SGD=SGDClassifier(loss="hinge", random_state=_SEED),
    KNN=KNeighborsClassifier(),
    # Ridge and Lasso are linear regressors registered alongside the classifiers
    LRR=Ridge(random_state=_SEED),
    LRL=Lasso(random_state=_SEED),
    XGB=xgb.XGBClassifier(n_jobs=-1, random_state=_SEED),
)

In [None]:
class Trainer:
    """Fit every configured model on every requested split and record metric scores.

    For each model code in ``clfs`` and each hyper-parameter combination of its
    grid, the estimator is fitted once per (split seed, split method, fold) and
    scored by a ``ModelEvaluator`` on the whole test split and on each sub-group.

    Scores are appended to ``results_df`` when ``db`` is falsy; otherwise they are
    inserted into the module-level ``experiment_outputs_table`` through the
    module-level ``engine`` (both must exist before results are saved).

    Parameters
    ----------
    experiment_group_id, experiment_config_name, dataset_name
        Stored verbatim on every result row.
    X
        Feature table. It is sliced with ``.iloc`` and by column name, so a
        pandas DataFrame is expected.
    y
        Target values, positionally aligned with ``X``.
    clfs
        Mapping of model code -> estimator, or the preset name ``"all"`` /
        ``"test"`` (resolved to the globals ``clf_list_all`` / ``clf_list_test``).
    hyperparameters
        Mapping of model code -> ``ParameterGrid`` specification, or the preset
        name ``"small"`` / ``"large"`` / ``"test"`` (resolved to the globals
        ``small_grid`` / ``large_grid`` / ``test_grid``).
    split_methods
        Iterable of ``"<function>@<number>"`` strings, where the function is one
        of ``SUPPORTED_SPLIT_FUNCTIONS`` and the number is the test percentage
        (``train_test_split``) or the fold count (``KFold``/``StratifiedKFold``).
    split_random_states
        Iterable of seeds; every split method is run once per seed.
    metrics
        Metric specification handed to ``ModelEvaluator``.
    db
        Truthy to write to the database, falsy to collect rows in ``results_df``.
    sub_groups
        Optional iterable of ``X`` column names; metrics are additionally computed
        for every distinct value of each column. ``None`` means no sub-groups.
    """

    # Column order of ``results_df``; every appended row is laid out in this order.
    RESULT_COLUMNS = (
        "experiment_output_id",
        "experiment_group_id",
        "experiment_config_name",
        "dataset_name",
        "model_type",
        "model_parameters",
        "sub_group",
        "metric_name",
        "metric_k",
        "metric_score",
        "split_method",
        "split_seed",
        "split_percentage",
        "split_num_fold",
        "split_num",
        "created_at",
    )

    # Split function names understood by ``train_models``.
    SUPPORTED_SPLIT_FUNCTIONS = ("train_test_split", "StratifiedKFold", "KFold")

    def __init__(
        self,
        experiment_group_id,
        experiment_config_name,
        dataset_name,
        X,
        y,
        clfs,
        hyperparameters,
        split_methods,
        split_random_states,
        metrics,
        db=False,
        sub_groups=None,
    ):
        self.experiment_group_id = experiment_group_id
        self.experiment_config_name = experiment_config_name
        self.dataset_name = dataset_name
        self.X = X
        self.y = y
        self.split_random_states = split_random_states
        self.split_methods = split_methods
        self.set_clfs(clfs)
        self.set_hyperparemeters(hyperparameters)
        self.db = db
        self.metrics = metrics
        self.sub_groups = sub_groups
        self.results_df = pd.DataFrame(columns=self.RESULT_COLUMNS)

    # Check for presets
    def set_hyperparemeters(self, hyperparemeters):
        """Store the hyper-parameter grids, expanding a preset name if one is given.

        The presets refer to the globals ``small_grid``, ``large_grid`` and
        ``test_grid``; any other value is stored unchanged.
        """
        if hyperparemeters == "small":
            self.hyperparemeters = small_grid
        elif hyperparemeters == "large":
            self.hyperparemeters = large_grid
        elif hyperparemeters == "test":
            self.hyperparemeters = test_grid
        else:
            self.hyperparemeters = hyperparemeters

    # Check for presets
    def set_clfs(self, clfs):
        """Store the estimators to train, expanding a preset name if one is given.

        The presets refer to the globals ``clf_list_all`` and ``clf_list_test``;
        any other value is stored unchanged.
        """
        if clfs == "all":
            self.clfs = clf_list_all
        elif clfs == "test":
            self.clfs = clf_list_test
        else:
            self.clfs = clfs

    def save_ouput(
        self,
        scores_obj,
        model_type,
        model_parameters,
        sub_group_metadata,
        split_method,
        split_seed,
        split_percentage,
        split_num_fold,
        split_num,
    ):
        """Persist one result row per metric in ``scores_obj``.

        Parameters
        ----------
        scores_obj
            Mapping of metric key -> score. A key of the form ``"name@k"`` is
            stored as ``metric_name="name"`` and ``metric_k="k"`` (a string);
            other keys are stored with ``metric_k=None``.
        model_type, split_method, split_seed, split_percentage, split_num_fold, split_num
            Stored verbatim on every row.
        model_parameters
            JSON-serialisable hyper-parameters; stored as a JSON string.
        sub_group_metadata
            Mapping describing the evaluated sub-group (stored as a JSON string),
            or a falsy value for the whole test split (stored as ``None``).
        """
        sub_group_json = json.dumps(dict(sub_group_metadata)) if sub_group_metadata else None
        # NOTE(review): both JSON fields are serialised to strings here; if the
        # target table columns are JSON-typed the driver may encode them a second
        # time -- confirm how they end up stored.
        model_parameters_json = json.dumps(model_parameters)

        # Fields shared by every metric row produced by this call.
        shared_fields = {
            "experiment_group_id": self.experiment_group_id,
            "experiment_config_name": self.experiment_config_name,
            "dataset_name": self.dataset_name,
            "model_type": model_type,
            "model_parameters": model_parameters_json,
            "sub_group": sub_group_json,
            "split_method": split_method,
            "split_seed": split_seed,
            "split_percentage": split_percentage,
            "split_num_fold": split_num_fold,
            "split_num": split_num,
        }

        records = []
        for metric_key in scores_obj:
            metric_name, separator, metric_k = metric_key.partition("@")
            records.append(
                {
                    **shared_fields,
                    "metric_name": metric_name,
                    "metric_k": metric_k if separator else None,
                    "metric_score": scores_obj[metric_key],
                }
            )

        if not records:
            return

        if self.db:
            # One transaction per call: all metric rows of an evaluation are
            # committed together. ``Engine.begin()`` replaces the connectionless
            # ``engine.execute`` so this also runs on SQLAlchemy 2.0.
            with engine.begin() as connection:
                for record in records:
                    connection.execute(experiment_outputs_table.insert().values(**record))
        else:
            for record in records:
                row_id = len(self.results_df)
                record["experiment_output_id"] = row_id
                record["created_at"] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
                # Build the row from the column names so values can never be
                # written under the wrong header.
                self.results_df.loc[row_id] = [record[column] for column in self.RESULT_COLUMNS]

    def train_models(self):
        """Run the full grid: model x hyper-parameters x split seed x split method.

        Raises
        ------
        ValueError
            If a split method has no ``"@<number>"`` suffix or names an
            unsupported split function. All split methods are checked before the
            first model is fitted.
        """
        # Validate every split specification up front so a typo cannot surface
        # only after hours of training.
        split_plans = [self._parse_split_method(spec) for spec in self.split_methods]

        # Initialize the evaluator
        evaluator = ModelEvaluator(metrics=self.metrics)

        print("DEBUG: Starting model training")

        grid = self.hyperparemeters
        run_number = 0
        for model_type, clf in self.clfs.items():
            for params in ParameterGrid(grid[model_type]):
                clf.set_params(**params)
                run_number += 1
                print(f"DEBUG: {self.dataset_name}: Training ( # {run_number} ) {clf} | {params}")

                for split_seed in self.split_random_states:
                    for split_function, split_value in split_plans:
                        is_holdout = split_function == "train_test_split"
                        splits = self._iter_splits(split_function, split_value, split_seed)
                        for split_num, X_train, X_test, y_train, y_test in splits:
                            clf.fit(X_train, y_train)
                            self._evaluate_and_save(
                                evaluator=evaluator,
                                clf=clf,
                                X_test=X_test,
                                y_test=y_test,
                                model_type=model_type,
                                model_parameters=params,
                                split_method=split_function,
                                split_seed=split_seed,
                                # The number after "@" is a percentage for a
                                # hold-out split and a fold count otherwise.
                                split_percentage=split_value if is_holdout else None,
                                split_num_fold=None if is_holdout else split_value,
                                split_num=split_num,
                            )
        print("DEBUG: Model training complete")

    @classmethod
    def _parse_split_method(cls, split_method):
        """Split ``"<function>@<number>"`` into ``(function, int(number))``, validating both."""
        split_function, separator, raw_value = split_method.partition("@")
        if not separator:
            raise ValueError("You must define a number of folds or percentage for the split functions.")
        if split_function not in cls.SUPPORTED_SPLIT_FUNCTIONS:
            raise ValueError(
                f"Unsupported split function {split_function!r}; "
                f"expected one of {cls.SUPPORTED_SPLIT_FUNCTIONS}."
            )
        return split_function, int(raw_value)

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by position, for pandas objects and plain arrays alike."""
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    def _iter_splits(self, split_function, split_value, split_seed):
        """Yield ``(split_num, X_train, X_test, y_train, y_test)`` for one split method.

        A hold-out split yields a single tuple numbered 1; K-fold splitters yield
        one tuple per fold, numbered from 1.
        """
        if split_function == "train_test_split":
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=split_value / 100, random_state=split_seed
            )
            yield 1, X_train, X_test, y_train, y_test
            return

        splitter_cls = StratifiedKFold if split_function == "StratifiedKFold" else KFold
        splitter = splitter_cls(n_splits=split_value, random_state=split_seed, shuffle=True)
        for split_num, (train_index, test_index) in enumerate(splitter.split(self.X, self.y), start=1):
            # The splitter returns positions, so the target must be sliced by
            # position too; label-based ``y[index]`` is wrong for a non-default index.
            yield (
                split_num,
                self.X.iloc[train_index],
                self.X.iloc[test_index],
                self._take_rows(self.y, train_index),
                self._take_rows(self.y, test_index),
            )

    def _evaluate_and_save(self, evaluator, clf, X_test, y_test, **row_fields):
        """Score ``clf`` on the test split and on each sub-group, saving every result."""
        evaluator.load(X_test=X_test, y_true=y_test, clf=clf)
        self.save_ouput(scores_obj=evaluator.get_metrics(), sub_group_metadata=None, **row_fields)

        # Subgroup metrics (``sub_groups`` may legitimately be None)
        for sub_group_var in self.sub_groups or ():
            membership = X_test[sub_group_var]
            for sub_group_value in list(membership.unique()):
                mask = membership == sub_group_value
                # X_test and y_test are positionally aligned, so a positional
                # boolean mask selects matching targets whatever index y carries.
                evaluator.load(X_test=X_test[mask], y_true=y_test[mask.to_numpy()], clf=clf)
                self.save_ouput(
                    scores_obj=evaluator.get_metrics(),
                    sub_group_metadata={sub_group_var: int(sub_group_value)},
                    **row_fields,
                )


class ModelEvaluator:
    """Compute a list of metrics for a fitted estimator on a test set.

    Parameters
    ----------
    clf, X_test, y_true
        Optional fitted estimator, test features and true labels. When all three
        are supplied the predictions are computed immediately; otherwise call
        ``load`` before ``get_metrics``.
    metrics
        Iterable of metric specifications. Each is the name of a metric function
        visible in this module's global namespace (a dotted path such as
        ``module.func`` is also accepted), optionally followed by ``"@k"`` to
        score only the top ``k`` percent of samples as positive.

    Attributes
    ----------
    y_pred
        Hard predictions for ``X_test`` (``None`` until ``load`` has run).
    y_score
        Continuous score per sample used for the ``@k`` metrics.
    """

    def __init__(self, clf=None, X_test=None, y_true=None, metrics=None):
        self.clf = clf
        self.X_test = X_test
        self.y_true = y_true
        self.metrics = metrics
        self.y_pred = None
        self.y_score = None
        # Make constructor-supplied data usable without a separate load() call.
        if clf is not None and X_test is not None and y_true is not None:
            self.load(X_test=X_test, y_true=y_true, clf=clf)

    def load(self, X_test, y_true, clf):
        """Store the evaluation data and compute ``y_pred`` / ``y_score`` for it.

        The score source is chosen by what the estimator offers:

        * ``predict_proba``: score is the second probability column.
        * else ``decision_function`` (e.g. a hinge-loss ``SGDClassifier``, which
          has no ``predict_proba``): score is the decision value.
        * else (regressors such as Ridge/Lasso): score is the raw prediction and
          the hard prediction is ``score >= 0.5``.
        """
        self.X_test = X_test
        self.y_true = y_true
        self.clf = clf
        if hasattr(clf, "predict_proba"):
            self.y_pred = clf.predict(X_test)
            self.y_score = clf.predict_proba(X_test)[:, 1]
        elif hasattr(clf, "decision_function"):
            self.y_pred = clf.predict(X_test)
            self.y_score = clf.decision_function(X_test)
        else:
            self.y_score = clf.predict(X_test)
            self.y_pred = np.where(self.y_score >= 0.5, 1, 0)

    # Check for presets
    def set_metrics(self, metrics):
        """Store the metric list, expanding a preset name if one is given.

        The presets refer to the globals ``metric_list_small`` and
        ``metric_list_test``; any other value is stored unchanged.
        """
        if metrics == "small":
            self.metrics = metric_list_small
        elif metrics == "test":
            self.metrics = metric_list_test
        else:
            self.metrics = metrics

    @staticmethod
    def _resolve_metric(name):
        """Return the metric callable called ``name`` from this module's globals.

        This replaces ``eval`` on the configured string: only a name lookup
        (plus attribute access for dotted paths) is performed, so a metric
        specification can no longer execute arbitrary code.

        Raises
        ------
        NameError
            If the name is not defined or does not refer to a callable.
        """
        head, *attributes = name.strip().split(".")
        try:
            target = globals()[head]
            for attribute in attributes:
                target = getattr(target, attribute)
        except (KeyError, AttributeError):
            raise NameError(f"metric function {name!r} is not defined") from None
        if not callable(target):
            raise NameError(f"metric {name!r} does not refer to a callable")
        return target

    def metric_at_k(self, metric, k):
        """Score ``metric`` when samples scoring above the ``(100 - k)``-th
        percentile of ``y_score`` are predicted positive and the rest negative."""
        threshold = np.percentile(self.y_score, (100 - k))
        y_pred = np.where(self.y_score > threshold, 1, 0)
        return self._resolve_metric(metric)(self.y_true, y_pred)

    def get_metrics(self):
        """Return ``{metric specification: score}`` for every configured metric."""
        results = {}
        for metric in self.metrics:
            metric_name, separator, raw_k = metric.partition("@")
            if separator:
                results[metric] = self.metric_at_k(metric=metric_name, k=int(raw_k))
            else:
                results[metric] = self._resolve_metric(metric)(self.y_true, self.y_pred)
        return results

In [None]:
# Read json
# Project root: the "ml-explainability" directory under the user's home directory.
# Resolved in pure Python rather than by shelling out with IPython's `!echo`.
root_path = os.path.join(os.path.expanduser("~"), "ml-explainability")

config_name = 'baseline_experiment_drugs_all_models'
config_path = f'{root_path}/configs/{config_name}.json'

with open(config_path, 'r') as f:
    config = json.load(f)

# Select the estimators named by the config's hyperparameter grids, failing early
# with an explicit message when a model code has no registered estimator.
unknown_models = sorted(set(config['hyperparameters']) - set(clf_fcns))
if unknown_models:
    raise KeyError(f"No estimator registered in clf_fcns for model code(s): {unknown_models}")
clfs = {model_code: clf_fcns[model_code] for model_code in config['hyperparameters']}

In [None]:
# Read Postgres node address
postgres_node_file = '/scratch/isk273/postgres_node.txt'
with open(postgres_node_file, 'r') as node_file:
    # Remove every newline character from the file's content
    postgres_address = ''.join(node_file.read().split('\n'))

In [None]:
POSTGRES_ADDRESS = postgres_address
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'isk273'
# The password is deliberately not stored in the notebook: it is taken from the
# POSTGRES_PASSWORD environment variable, or prompted for when that is unset.
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD') or getpass.getpass('Postgres password: ')
POSTGRES_DBNAME = 'explainability_db'

# Percent-encode the password so characters such as "@" or "/" cannot corrupt the URL.
postgres_str = (
    f'postgresql://{POSTGRES_USERNAME}:{quote(POSTGRES_PASSWORD, safe="")}'
    f'@{POSTGRES_ADDRESS}:{POSTGRES_PORT}/{POSTGRES_DBNAME}'
)

# Create the connection
engine = create_engine(postgres_str, echo = True)

In [None]:
class DatasetNameEnum(enum.Enum):
    """Dataset names, numbered 1-3 in declaration order."""

    education = enum.auto()
    healthcare = enum.auto()
    housing = enum.auto()

In [None]:
class SplitMethodEnum(enum.Enum):
    """Names of the data-splitting strategies, numbered 1-3 in declaration order."""

    StratifiedKFold = enum.auto()
    KFold = enum.auto()
    train_test_split = enum.auto()

In [None]:
# Schema of the table that receives one row per (model, split, sub-group, metric).
# The MetaData is left unbound and the engine is passed to create_all() explicitly:
# binding an engine to MetaData is deprecated in SQLAlchemy 1.4 and removed in 2.0.
metadata = MetaData()
experiment_outputs_table = Table(
    "experiment_outputs",
    metadata,
    # Primary key generated client-side as a random UUID
    Column("experiment_output_id", UUID(as_uuid=True), primary_key=True, default=uuid.uuid4),
    Column("experiment_group_id", String, nullable=False),
    Column("experiment_config_name", String, nullable=False),
    Column("dataset_name", Enum(DatasetNameEnum), nullable=False),
    Column("model_type", String(3), nullable=False),
    Column("model_parameters", JSONB, nullable=False),
    Column("sub_group", JSONB),
    Column("metric_name", String, nullable=False),
    Column("metric_k", Integer),
    Column("metric_score", Numeric, nullable=False),
    Column("split_method", Enum(SplitMethodEnum), nullable=False),
    Column("split_seed", Integer, nullable=False),
    Column("split_num_fold", Integer),
    Column("split_percentage", Integer),
    Column("split_num", Integer, nullable=False),
    # Timestamp assigned by the database server at insert time
    Column("created_at", DateTime(timezone=True), server_default=func.now(), nullable=False),
)

# Create the table if it does not exist yet
metadata.create_all(engine)


In [None]:
# Identifier shared by every result of this run: the MD5 hex digest of the
# config's string form followed by the current timestamp (second resolution).
_run_timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
_group_id_source = f"{config}{_run_timestamp}"
experiment_group_id = hashlib.md5(_group_id_source.encode('utf-8')).hexdigest()

In [None]:
import warnings

warnings.filterwarnings("ignore")

# Result frames per dataset, kept so in-memory results survive the loop when the
# config does not save to the database (each Trainer holds its own results_df).
results_by_dataset = {}

for dataset_name in config["datasets"]:
    X_path = f"{root_path}/{config['X'][dataset_name]}"
    y_path = f"{root_path}/{config['y'][dataset_name]}"

    X = pd.read_csv(X_path, index_col=0)
    print(f"DEBUG: {dataset_name}: {X.shape[0]} rows | {X.shape[1]} cols")
    # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the single
    # remaining column afterwards yields the same Series.
    y = pd.read_csv(y_path, index_col=0).squeeze("columns")

    # Sub-groups are optional, both as a config section and per dataset.
    sub_groups = (config.get("sub_groups") or {}).get(dataset_name)

    trainer = Trainer(
        experiment_config_name=config_name,
        experiment_group_id=experiment_group_id,
        dataset_name=dataset_name,
        X=X,
        y=y,
        clfs=clfs,
        hyperparameters=config["hyperparameters"],
        split_methods=config["split_methods"],
        split_random_states=config["split_random_states"],
        metrics=config["metrics"],
        sub_groups=sub_groups,
        db=config["save_to_db"],
    )
    trainer.train_models()
    results_by_dataset[dataset_name] = trainer.results_df