In [None]:
from model.modelorchastrator import ModelOrchestrator
import pandas as pd
from data_pipeline.dataset import Dataset

from hydra.utils import instantiate
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from data_pipeline.pipelinesteps import data_splitter

In [None]:
# Quick look at the processed training file exactly as it is stored on disk.
train_feather = "../../data/parquet_files/train/processed_train.feather"
df = pd.read_feather(train_feather)
df.head()

In [None]:

# Drop any Hydra state left over from a previous run of this cell before
# initialising again, then compose the `config` configuration under ../../.
GlobalHydra.instance().clear()
initialize(config_path="../../", version_base=None)
cfg = compose(config_name="config")

# Build the data pipeline described by the `data_pipeline` config node.
data_pipeline = instantiate(cfg.data_pipeline)

# Read from disk every time so that re-running this cell always starts from
# the file contents, then run the pipeline over the frame.
df = data_pipeline.apply(
    pd.read_feather("../../data/parquet_files/train/processed_train.feather")
)

# Wrap the transformed frame; `target` is the label column handed to Dataset.
dataset = Dataset(
    data=df,
    data_splitter=data_splitter,
    target_column='target',
)

In [None]:
# Build the orchestrator from the `model` node of the composed config.
model_cfg = cfg.model
model_orchestrator = ModelOrchestrator(model_cfg)

In [None]:
# Take the orchestrator's `modelpipeline` attribute; the cells below call
# `fit` and `predict_proba` on this object.
pipeline = model_orchestrator.modelpipeline

In [None]:
# Preview the first rows of the `X_train` partition exposed by the Dataset wrapper.
dataset.X_train.head()

In [None]:


# Unpack the partitions exposed by the Dataset wrapper into notebook-level
# names (no splitting happens here), then fit the model pipeline on the
# training partition.
X_train = dataset.X_train
X_test = dataset.X_test
y_train = dataset.y_train
y_test = dataset.y_test
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Score the held-out partition. Entry 1 of the transposed output is used as
# the positive-class probability.
class_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = class_probabilities.T[1]

# Area under the ROC curve for those scores against the held-out labels.
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC score: {auc_score}")

In [None]:
# Distribution of the predicted positive-class probabilities on the test
# partition. The figure gets a title and axis labels so it can be read on its
# own, and the trailing `;` keeps the return value of `ax.set` out of the
# cell output.
ax = pd.Series(y_pred_proba).hist(bins=50, figsize=(10, 6))
ax.set(
    title="Predicted positive-class probability",
    xlabel="Predicted probability",
    ylabel="Count",
);

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC curve for the held-out predictions; the thresholds are not needed here.
fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)
roc_label = 'AUC = {:.2f}'.format(auc_score)

fig, ax = plt.subplots()
# Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr, label=roc_label)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Quick look at the processed test file exactly as it is stored on disk.
test_feather = "../../data/parquet_files/test/processed_test.feather"
test_data = pd.read_feather(test_feather)
test_data.head()


In [None]:

# Read from disk every time so that re-running this cell always starts from
# the file contents, then apply the same data pipeline used for training.
test_data = data_pipeline.apply(
    pd.read_feather("../../data/parquet_files/test/processed_test.feather")
)

# Constant placeholder column, named as `target_column` below.
# NOTE(review): presumably the test file carries no labels and Dataset
# requires the column to exist — confirm.
test_data['target'] = 0

test_dataset = Dataset(
    data=test_data,
    data_splitter=data_splitter,
    target_column='target',
)


In [None]:
# Score the test features with the fitted pipeline.
# NOTE(review): `reset_index()` without `drop=True` moves the current index
# into a column, whereas `fit` received `dataset.X_train` without this step —
# confirm the pipeline expects that extra column.
test_features = test_dataset.X.reset_index()
pipeline.predict_proba(test_features)