In [None]:
import pandas as pd
import numpy as np

from hydra.utils import instantiate
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra

from model_forge.model.model_orchastrator import ModelOrchestrator
from model_forge.data.dataset import Dataset
from model_forge.model.model_evaluator import ModelEvaluator

from data_pipeline.pipelinesteps import data_splitter

In [None]:
# Create config
# Clear Hydra's global singleton before initialising again.
# presumably this is what allows the cell to be re-run in a live kernel — confirm
GlobalHydra.instance().clear()
# Point Hydra at the "../../conf" directory, then compose the top-level
# "config" into `cfg`, which the later cells read.
initialize(config_path="../../conf", version_base=None)
cfg = compose(config_name="config")

In [None]:
# Build the Dataset: load the stored training frame, push it through the
# pipeline instantiated from cfg.data_pipeline, then wrap the result together
# with the splitter, target column and split names.
data_pipeline = instantiate(cfg.data_pipeline)
df = data_pipeline.apply(
    pd.read_feather("../../data/parquet_files/train/processed_train.feather")
)

dataset = Dataset(
    data=df,
    data_splitter=data_splitter,
    target_column='target',
    splits_columns=['train', 'test', 'OOT'],
)

In [None]:
# Walk every entry returned by dataset.items() and print the second element
# of each value pair (bound to `x` here).
# NOTE(review): the pair is unpacked as (y, x) — presumably (target, features);
# confirm the order against Dataset.items().
# NOTE(review): print() dumps the whole object as plain text; if `x` is a
# DataFrame, showing only a few rows would keep the output readable.
for name,(y, x) in dataset.items():
    print(x)

    

In [None]:
# Build the modelling pipeline from the composed Hydra config.
model_orchestrator = ModelOrchestrator(cfg)

model = model_orchestrator.create_pipeline()
# The evaluator receives only the pipeline's last step (model[-1]) plus the
# metric names to report.
# presumably the last step is the final estimator — confirm
ME = ModelEvaluator(model[-1], ['roc_auc_ovr', 'f1'])

In [None]:
# Display the feature importances exposed by the pipeline's last step.
# NOTE(review): this cell sits above the model.fit(...) cell, so on a fresh
# kernel (Restart & Run All) the step is presumably still unfitted here and
# the attribute access may raise — confirm and consider moving it below fit.
model[-1].feature_importances_

In [None]:
# Inspect the entry at index 1 of the configured model steps.
cfg.model.model_steps[1]

In [None]:
from omegaconf import DictConfig, OmegaConf
# Take the model step at index 1 and show the value of its first
# key/value pair.
# NOTE(review): the DictConfig / OmegaConf names imported in this cell are
# not used by it.
t = cfg.model.model_steps[1]
next(iter(t.items()))[1]

In [None]:
# Peek at one column of the frame returned by the data pipeline.
df['education_927M_last']

In [None]:
# Fit the pipeline on the training split.
model.fit(dataset.X_train, dataset.y_train)

In [None]:
# Show what transform_without_predictor returns for the training features.
# presumably it applies every pipeline step except the final estimator — confirm
model.transform_without_predictor(dataset.X_train)

In [None]:
# Score with the evaluator: the training features are first passed through
# transform_without_predictor, then evaluated against y_train.
ME.evaluate(model.transform_without_predictor(dataset.X_train), dataset.y_train)

In [None]:
from sklearn.metrics import get_scorer_names
# Collect and display the scorer names returned by scikit-learn's
# get_scorer_names().
all_scorers = get_scorer_names()
all_scorers

In [None]:
from sklearn.metrics import roc_auc_score

def gini(y_true, y_pred):
    """Return the Gini coefficient derived from ROC AUC.

    Computed as ``2 * AUC - 1``, where AUC is
    ``roc_auc_score(y_true, y_pred)``.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to ``roc_auc_score``.
        y_pred: Predicted scores, forwarded unchanged to ``roc_auc_score``.

    Returns:
        The rescaled AUC value.
    """
    auc = roc_auc_score(y_true, y_pred)
    return 2 * auc - 1

In [None]:
# Gini on the training split, scored on entry 1 of the transposed
# predict_proba output.
# presumably that entry is the positive-class probability — confirm
gini(dataset.y_train, model.predict_proba(dataset.X_train).T[1])



In [None]:
import numpy as np

# Spread of the training-split scores (entry 1 of the transposed
# predict_proba output).
train_scores = model.predict_proba(dataset.X_train).T[1]
std_dev = np.std(train_scores)
print(std_dev)


In [None]:
# ROC AUC on the training split — the same quantity gini() rescales to 2*AUC - 1.
roc_auc_score(dataset.y_train, model.predict_proba(dataset.X_train).T[1])

In [None]:
# ROC AUC on the OOT split, scored the same way as the training split.
# NOTE(review): the split was declared as 'OOT' when the Dataset was built,
# but the attributes read here are lower-case X_oot / y_oot — confirm that
# Dataset exposes the split under these names.
roc_auc_score(dataset.y_oot, model.predict_proba(dataset.X_oot).T[1])