In [32]:
from typing import Dict, Tuple, List, Any
import datetime as dt
import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn import metrics
import xgboost as xgb
import mlflow
from category_encoders import CountEncoder
from kedro.framework.session import KedroSession
from kedro.framework.session import get_current_session
from kedro.framework.startup import bootstrap_project
import sys
sys.path.append("../../src/hellow_kedro/pipelines/data_processing")
from dataset import fetch_data
from preprocesser import (_prepare_int_features, _prepare_binary_features, _prepare_categorical_features, _drop_unused_columns)

In [6]:
# Bootstrap the Kedro project rooted two directories above the working
# directory, then load its context. env=None means no environment is
# selected explicitly here.
metadata = bootstrap_project(Path.cwd().parent.parent)
with KedroSession.create(
    metadata.package_name,
    project_path=metadata.project_path,
    env=None,
) as session:
    # `context` is reused by the cells below after this block exits.
    context = session.load_context()


fatal: Needed a single revision


2021-12-19 06:01:54,341 - kedro.framework.session.store - INFO - `save()` not implemented for `BaseSessionStore`. Skipping the step.


In [2]:
# Display the parameters exposed by the loaded Kedro context.
context.params

NameError: name 'context' is not defined

In [28]:
def download_data(date_hour: dt.datetime, params: Dict[str, Any]):
    """Fetch the raw data for a single date-hour.

    Thin wrapper: both arguments are forwarded unchanged to ``fetch_data``
    and its result is returned as-is.

    Args:
        date_hour: Timestamp passed through to ``fetch_data``.
        params: Dataset parameters passed through to ``fetch_data``.

    Returns:
        Whatever ``fetch_data`` returns for the given arguments.
    """
    return fetch_data(date_hour, params)

In [1]:
# Download the data for 2021-12-17 10:00 and store it in the catalog as "raw_data".
df = download_data(
    dt.datetime(2021, 12, 17, 10),
    context.params["dataset_params"],
)
context.catalog.save("raw_data", df)

NameError: name 'download_data' is not defined

In [13]:
def preprocess(
    raw_X: pd.DataFrame,
    feat_parameters: Dict[str, Any],
    encoder: CountEncoder=None,
) -> Tuple[pd.DataFrame, np.ndarray, CountEncoder]:
    """Split raw data into model features and labels, encoding categoricals.

    Args:
        raw_X: Raw frame containing the label, auxiliary and feature columns.
        feat_parameters: Feature configuration. Keys read here: ``label``,
            ``sample_weight``, ``postback_type``, ``bias_adjust_weight``,
            ``int_features``, ``binary_features``, ``drop_optional``,
            ``categorical_features`` and ``all_features``. The first four are
            concatenated with ``+`` and used as column labels, so they are
            presumably lists of column names (verify against the config).
        encoder: Encoder handed to ``_prepare_categorical_features``; defaults
            to ``None``.

    Returns:
        A 3-tuple ``(X, y, encoder)``: the frame restricted to
        ``all_features``, the flattened label values, and the encoder returned
        by ``_prepare_categorical_features``.
    """
    y = raw_X[feat_parameters["label"]].values.flatten()

    # Columns that must not reach the model as features.
    non_feature_columns = (
        feat_parameters["label"]
        + feat_parameters["sample_weight"]
        + feat_parameters["postback_type"]
        + feat_parameters["bias_adjust_weight"]
    )
    X = raw_X.drop(columns=non_feature_columns)

    # These helpers are called for their effect on X; their return values are
    # not used.
    _prepare_int_features(X, feat_parameters["int_features"])
    _prepare_binary_features(X, feat_parameters["binary_features"])
    _drop_unused_columns(X, feat_parameters["drop_optional"])

    encoder = _prepare_categorical_features(
        X,
        raw_X,
        feat_parameters["categorical_features"],
        feat_parameters["all_features"],
        encoder,
    )

    return (
        X[feat_parameters["all_features"]],
        y,
        encoder,
    )

In [14]:
# Preprocess the stored raw data, then persist the encoder, features and
# labels as separate catalog entries.
df_X, y, encoder = preprocess(
    context.catalog.load("raw_data"),
    context.params["features_params"],
)
context.catalog.save("encoder", encoder)
context.catalog.save("features", df_X)
context.catalog.save("labels", y)


2021-12-19 05:31:31,380 - kedro.io.data_catalog - INFO - Loading data from `raw_data` (PickleDataSet)...
2021-12-19 05:31:32,937 - kedro.io.data_catalog - INFO - Saving data to `encoder` (PickleDataSet)...
2021-12-19 05:31:32,962 - kedro.io.data_catalog - INFO - Saving data to `features` (PickleDataSet)...
2021-12-19 05:31:32,985 - kedro.io.data_catalog - INFO - Saving data to `labels` (PickleDataSet)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c] = np.where(df_raw[c].isnull(), float("nan"), X[c])


In [3]:
# Preview the first rows only instead of rendering the whole feature frame.
df_X.head()

NameError: name 'df_X' is not defined

In [19]:
def fit(
    X: pd.DataFrame,
    y: np.ndarray,
    params: Dict,
) -> xgb.sklearn.XGBClassifier:
    """Train an XGBoost classifier on the given features and labels.

    Args:
        X: Feature frame; only its underlying array (``X.values``) is passed
            to the model.
        y: Target values passed to the classifier's ``fit``.
        params: Keyword arguments forwarded unchanged to
            ``xgb.XGBClassifier``.

    Returns:
        The fitted classifier instance.
    """
    classifier = xgb.XGBClassifier(**params)
    feature_matrix = X.values
    classifier.fit(feature_matrix, y)
    return classifier

In [20]:
# Train on the stored features/labels with the configured hyper-parameters
# and persist the fitted model.
model = fit(
    context.catalog.load("features"),
    context.catalog.load("labels"),
    context.params["model_params"]["hyper_parameters"],
)
context.catalog.save("model", model)

2021-12-19 05:34:31,385 - kedro.io.data_catalog - INFO - Loading data from `features` (PickleDataSet)...
2021-12-19 05:34:31,407 - kedro.io.data_catalog - INFO - Loading data from `labels` (PickleDataSet)...
2021-12-19 05:34:31,814 - kedro.io.data_catalog - INFO - Saving data to `model` (PickleDataSet)...


In [3]:
def predict(model: xgb.sklearn.XGBClassifier, X_test: pd.DataFrame):
    """Return the model's predicted probability from column 1 of ``predict_proba``.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Features to score.

    Returns:
        The slice ``[:, 1]`` of the ``predict_proba`` output.
    """
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]

In [8]:
# Score the stored features with the stored model and persist the scores.
# NOTE(review): this loads the same "features" entry that the model was fit
# on, so downstream metrics describe training data, not a held-out set.
y_hat = predict(
    context.catalog.load("model"),
    context.catalog.load("features"),
)
context.catalog.save("y_hat", y_hat)

2021-12-19 06:02:10,439 - kedro.io.data_catalog - INFO - Loading data from `model` (PickleDataSet)...
2021-12-19 06:02:10,473 - kedro.io.data_catalog - INFO - Loading data from `features` (PickleDataSet)...
2021-12-19 06:02:10,542 - kedro.io.data_catalog - INFO - Saving data to `y_hat` (PickleDataSet)...


In [40]:
def report_accuracy(predictions: np.ndarray, test_y: np.ndarray, parameters: Dict[str, Any]) -> str:
    """Compute ROC AUC, log it to MLflow, and return it as a JSON string.

    Args:
        predictions: Predicted scores, passed to ``metrics.roc_curve`` as the
            score argument.
        test_y: True labels, passed to ``metrics.roc_curve``.
        parameters: Model parameters; logged to MLflow as the single param
            ``"model_params"``.

    Returns:
        A JSON-encoded string of the form ``{"auc": <value>}``.

    Side effects:
        Opens an MLflow run and logs the ``auc`` metric plus the
        ``"time of prediction"`` and ``"model_params"`` params.
    """
    fpr, tpr, _ = metrics.roc_curve(test_y, predictions)
    _auc = metrics.auc(fpr, tpr)

    with mlflow.start_run():
        mlflow.log_metric("auc", _auc)
        mlflow.log_param("time of prediction", str(dt.datetime.now()))
        mlflow.log_param("model_params", parameters)

    return json.dumps({"auc": _auc})

In [42]:
# Evaluate the stored predictions against the stored labels and persist the
# resulting report.
report = report_accuracy(
    context.catalog.load("y_hat"),
    context.catalog.load("labels"),
    context.params["model_params"]["hyper_parameters"],
)
context.catalog.save("report", report)

2021-12-19 06:18:14,923 - kedro.io.data_catalog - INFO - Loading data from `y_hat` (PickleDataSet)...
2021-12-19 06:18:14,943 - kedro.io.data_catalog - INFO - Loading data from `labels` (PickleDataSet)...
2021-12-19 06:18:14,993 - kedro.io.data_catalog - INFO - Saving data to `report` (JSONDataSet)...
