In [None]:
import pandas as pd
import numpy as np

from hydra.utils import instantiate
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra

from data_pipeline.pipelinesteps import data_splitter
from model.modelorchastrator import ModelOrchestrator
from data_pipeline.dataset import Dataset
from evaluate.metric_eval import ModelEvaluator

In [None]:
# Create config
# Reset the global Hydra instance first — presumably so this cell can be
# re-run in the same kernel before initialize() is called again; TODO confirm.
GlobalHydra.instance().clear()
# Point Hydra at the config directory (given as a relative path), then
# compose the config named "config" into `cfg`, which later cells read from.
initialize(config_path="../../", version_base=None)
cfg = compose(config_name="config")

In [None]:
# Create Dataset
# Instantiate the data pipeline from the `data_pipeline` section of the config,
# load the pre-processed training table once, and run it through the pipeline.
data_pipeline = instantiate(cfg.data_pipeline)
df = pd.read_feather("../../data/parquet_files/train/processed_train.feather")
df = data_pipeline.apply(df)

# Wrap the processed frame in the project Dataset; later cells read the
# X_train / y_train / X_test / X_oot / y_oot attributes from it.
dataset = Dataset(data=df, data_splitter=data_splitter, target_column='target')

In [None]:
# Build the orchestrator from the `model` section of the config and take its
# pipeline object as the model used by the following cells.
model_orchestrator = ModelOrchestrator(cfg.model)

model = model_orchestrator.modelpipeline
# The evaluator receives the last element of the pipeline (presumably the final
# predictor step — TODO confirm) and the metric names 'roc_auc_ovr' and 'f1'.
ME = ModelEvaluator(model.pipeline[-1], ['roc_auc_ovr', 'f1'])

In [None]:
# Fit the pipeline on the training split of the dataset.
model.fit(dataset.X_train, dataset.y_train)

In [None]:
# Display the training features after the pipeline's transform; judging by the
# method name this skips the final predictor — TODO confirm.
model.transform_without_predictor(dataset.X_train)

In [None]:
# In-sample evaluation: the evaluator is given the transformed training
# features (not the raw X_train) together with the training targets.
ME.evaluate(model.transform_without_predictor(dataset.X_train), dataset.y_train)

In [None]:
from sklearn.metrics import get_scorer_names
# List the scorer names scikit-learn makes available, for reference.
all_scorers = get_scorer_names()
all_scorers

In [None]:
from sklearn.metrics import roc_auc_score

def gini(y_true, y_pred):
    """Return the Gini coefficient derived from the ROC AUC.

    The value is ``2 * AUC - 1``, where AUC is ``roc_auc_score(y_true, y_pred)``.

    Args:
        y_true: Ground-truth labels, passed straight to ``roc_auc_score``.
        y_pred: Predicted scores, passed straight to ``roc_auc_score``.

    Returns:
        The rescaled AUC, ``2 * AUC - 1``.
    """
    auc = roc_auc_score(y_true, y_pred)
    return 2 * auc - 1

In [None]:
# In-sample Gini. `.T[1]` selects index 1 along the second axis of the
# predict_proba output — presumably the positive-class probability; TODO confirm.
gini(dataset.y_train, model.predict_proba(dataset.X_train).T[1])

In [None]:
# Extra static features: read both train_static_0 partitions, stack them, and
# keep only the join key (case_id) plus the selected columns.
df0 = pd.read_parquet("../../data/parquet_files/train/train_static_0_0.parquet")
df1 = pd.read_parquet("../../data/parquet_files/train/train_static_0_1.parquet")
df = pd.concat([df0, df1])[
    [
        "case_id",
        "annuitynextmonth_57A",
        "amtinstpaidbefduel24m_4187115A",
        "annuity_780A",
        "applicationscnt_867L",
        "avglnamtstart24m_4525187A",
        "credamount_770A",
        "eir_270L",
        "inittransactionamount_650A",
        "homephncnt_628L",
        "lastrejectcredamount_222A",
    ]
]

In [None]:
# Preview the first rows of the selected static columns.
df.head()

In [None]:
# Attach the extra static columns to each split by case_id. The left join keeps
# every row of the split. validate="many_to_one" makes pandas raise MergeError
# if case_id is duplicated in `df`; without it such duplicates would silently
# add rows and leave the features out of step with the matching targets.
X = pd.merge(dataset.X_train, df, on="case_id", how="left", validate="many_to_one")
X_test = pd.merge(dataset.X_test, df, on="case_id", how="left", validate="many_to_one")
X_oot = pd.merge(dataset.X_oot, df, on="case_id", how="left", validate="many_to_one")

In [None]:
# Re-create the orchestrator, pipeline and evaluator from the same config
# (rebinding `model_orchestrator`, `model` and `ME`), then fit on X — the
# training split with the extra static columns merged in.
model_orchestrator = ModelOrchestrator(cfg.model)

model = model_orchestrator.modelpipeline
ME = ModelEvaluator(model.pipeline[-1], ['roc_auc_ovr', 'f1'])
model.fit(X, dataset.y_train)
# NOTE(review): the evaluation call below is disabled, so this cell only fits
# the model and `ME` is not used here.
# ME.evaluate(model.transform_without_predictor(X), dataset.y_train)

In [None]:
# Predict on the out-of-time split once and reuse the scores for both
# statistics. `.T[1]` selects index 1 along the second axis of the
# predict_proba output — presumably the positive-class probability; TODO confirm.
oot_scores = model.predict_proba(X_oot).T[1]
gini_value = gini(dataset.y_oot, oot_scores)
std_dev = np.std(oot_scores)
# Displayed value: Gini penalised by half the standard deviation.
# NOTE(review): std_dev is the spread of the predicted scores themselves, not
# of Gini values across periods — confirm this is the intended penalty term.
gini_value - 0.5 * std_dev

In [None]:
# Show the out-of-time Gini on its own, without the standard-deviation penalty.
gini_value


In [None]:
# ROC AUC on the out-of-time split, scored with index 1 of predict_proba's
# second axis (taken via `.T[1]`).
roc_auc_score(dataset.y_oot, model.predict_proba(X_oot).T[1])

In [None]:
# ROC AUC on the training split with the extra static columns (X), scored the
# same way via `.T[1]` of predict_proba.
roc_auc_score(dataset.y_train, model.predict_proba(X).T[1])