In [96]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from collections import defaultdict

In [136]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import json
from google.cloud import storage


input_bucket_path = "gs://berkabank/production/data/"

# Dataset name -> full CSV location under the bucket prefix.
# Insertion order is the order in which the files are read.
csv_locations = {
    "training_drivers": f"{input_bucket_path}05_features/training_drivers.csv",
    "core_training": f"{input_bucket_path}04_processing/core_training.csv",
    "training_features": f"{input_bucket_path}05_features/training_features.csv",
    "eod_balance_training": f"{input_bucket_path}04_processing/eod_balance_training.csv",
}

# Load every CSV into a DataFrame, keyed by dataset name.
data = {
    dataset_name: pd.read_csv(location)
    for dataset_name, location in csv_locations.items()
}

In [138]:
# Display the frame loaded from 05_features/training_drivers.csv.
data["training_drivers"]

Unnamed: 0,account_id,n_transactions,days_since_account_creation
0,1,4858.0,44717
1,2,39309.0,200270
2,4,2582.0,37413
3,6,23927.0,177388
4,9,8810.0,69553
...,...,...,...
2144,11320,62533.0,362963
2145,11333,32853.0,163370
2146,11349,1.0,91
2147,11362,3956.0,31216


In [139]:
def custom_f(series):
    """Return the sum of all values in ``series``, plus one.

    Args:
        series: Any array-like accepted by ``np.sum``.

    Returns:
        ``np.sum(series) + 1``.
    """
    total = np.sum(series)
    return total + 1


# Feature matrix and target vector, both indexed by account_id so that any
# index-aware operation pairs each account's features with its own label
# (this matches how the later cells build X and y).
X = data["training_drivers"].set_index("account_id")

y = data["core_training"].set_index("account_id")["target"]

In [140]:
from dataclasses import dataclass
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit
import pandas as pd


@dataclass
class HyperparameterTuning:
    """Randomized hyperparameter search evaluated on stratified shuffle splits.

    Attributes:
        estimator: Estimator instance handed to ``RandomizedSearchCV``.
        params: Parameter distributions sampled by the search.
        scoring: Scoring metric name passed to the search.
        n_iter: Number of parameter settings sampled.
        n_splits: Number of stratified shuffle splits used for CV.
        random_state: Seed used for BOTH the CV splitter and the parameter
            sampler, so a run is reproducible from this single value.
        test_size: Fraction of rows held out in each CV split.
    """

    # `object` rather than the builtin function `any`, which is not a type.
    estimator: object
    params: dict
    scoring: str
    n_iter: int
    n_splits: int
    random_state: int
    test_size: float = 0.1

    def perform_search(self, X, y):
        """Fit the randomized search on ``X``/``y`` and return the fitted search.

        Args:
            X: Feature matrix.
            y: Target labels (used for stratification and fitting).

        Returns:
            The fitted ``RandomizedSearchCV`` object.
        """
        cv = StratifiedShuffleSplit(
            n_splits=self.n_splits,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        model = RandomizedSearchCV(
            estimator=self.estimator,
            param_distributions=self.params,
            scoring=self.scoring,
            cv=cv,
            n_iter=self.n_iter,
            random_state=self.random_state,
            n_jobs=-1,
        )
        return model.fit(X, y)

    def run(self, X, y):
        """Run the search and summarise its outcome.

        Args:
            X: Feature matrix.
            y: Target labels.

        Returns:
            dict with two keys:
              - ``"best"``: dict holding ``best_estimator``, ``best_params``
                and ``best_score`` taken from the fitted search.
              - ``"report"``: DataFrame of mean/std test score and params for
                every candidate, indexed and sorted by ``rank_test_score``.
        """
        grid = self.perform_search(X, y)
        report = (
            pd.DataFrame(grid.cv_results_)
            .sort_values("rank_test_score")
            .loc[:, ["rank_test_score", "mean_test_score", "std_test_score", "params"]]
            .set_index("rank_test_score")
        )

        best = {
            "best_estimator": grid.best_estimator_,
            "best_params": grid.best_params_,
            "best_score": grid.best_score_,
        }
        return {"best": best, "report": report}


In [141]:
# Display the ranked hyperparameter-search report.
# NOTE(review): in the visible notebook `hp_processor_output` is only assigned
# in a later cell (the tuning/validation cell), so this cell depends on
# out-of-order execution — confirm and move it below that cell.
hp_processor_output["report"]

Unnamed: 0_level_0,mean_test_score,std_test_score,params
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.358852,0.155082,"{'n_estimators': 50, 'min_samples_split': 5, '..."
2,0.35784,0.147657,"{'n_estimators': 50, 'min_samples_split': 10, ..."
3,0.357472,0.147363,"{'n_estimators': 200, 'min_samples_split': 2, ..."
4,0.353338,0.144283,"{'n_estimators': 100, 'min_samples_split': 10,..."
5,0.352429,0.143363,"{'n_estimators': 200, 'min_samples_split': 10,..."
6,0.347818,0.137334,"{'n_estimators': 50, 'min_samples_split': 2, '..."
7,0.347511,0.140145,"{'n_estimators': 200, 'min_samples_split': 10,..."
8,0.347437,0.149459,"{'n_estimators': 50, 'min_samples_split': 5, '..."
9,0.34656,0.140075,"{'n_estimators': 50, 'min_samples_split': 2, '..."
10,0.340651,0.132272,"{'n_estimators': 100, 'min_samples_split': 5, ..."


In [None]:
from google.cloud import storage
import joblib

# Persist the tuned estimator to a local joblib file and upload that file to
# the "berkabank" bucket.
# NOTE(review): `best` is assigned by the tuning/validation cell, which appears
# further down in the visible notebook — run that cell first.
model_name = "model"
local_model_path = f'{model_name}.joblib'
remote_model_path = f"production/artifacts/model/{model_name}.joblib"

storage_client = storage.Client()
bucket = storage_client.get_bucket("berkabank")

# Serialize the estimator to disk before uploading.
joblib.dump(best["best_estimator"], local_model_path)

blob = bucket.blob(remote_model_path)

# Stream the local file's bytes into the blob.
with open(local_model_path, 'rb') as model_file:
    blob.upload_from_file(model_file)

In [142]:
# Split train and validation
# Target vector and feature matrix, each indexed by account_id.
y = data["core_training"].set_index("account_id").loc[:, "target"]
X = data["training_drivers"].set_index("account_id")
X

Unnamed: 0_level_0,n_transactions,days_since_account_creation
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4858.0,44717
2,39309.0,200270
4,2582.0,37413
6,23927.0,177388
9,8810.0,69553
...,...,...
11320,62533.0,362963
11333,32853.0,163370
11349,1.0,91
11362,3956.0,31216


In [147]:
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import pandas as pd
import json
import numpy as np
from google.cloud import storage
import joblib

# ----------------   Hyperparameter Tuning and Validation ------------------
SEED = 42  # single seed shared by the hold-out split, the estimator and the search

# Features and labels, both indexed by account_id.
X = data["training_drivers"].set_index("account_id")
y = data["core_training"].set_index("account_id")["target"]

# The splitter below selects rows by POSITION (iloc), so make sure row i of X
# and row i of y describe the same account before splitting.
if not X.index.equals(y.index):
    X, y = X.align(y, join="inner", axis=0)

# Stratified 90/10 hold-out: tuning only sees the train part, the test part is
# kept aside for the final validation below.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED)
train_index, test_index = next(split.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Pick the Model Estimator #TODO: serialize in input
estimator = RandomForestClassifier(random_state=SEED)

# Run HPT
# NOTE(review): `params` (the search space) is not defined in any visible cell;
# it must be defined before this cell for a fresh-kernel run — confirm.
hp_processor = HyperparameterTuning(
    estimator=estimator,
    params=params,
    n_splits=10,
    scoring="roc_auc",
    n_iter=10,
    random_state=SEED,
)
hp_processor_output = hp_processor.run(X_train, y_train)
best = hp_processor_output["best"]

# Assess Validation and Optimal CutOff
# RandomizedSearchCV (refit=True by default) has already refit the winning
# configuration on all of X_train. Score that exact model so the threshold
# stored below belongs to the estimator that is persisted from `best`.
best_estimator = best["best_estimator"]
y_scores = best_estimator.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

# thresholds[0] is a sentinel meaning "predict nothing positive", not a real
# score, so exclude it when maximising Youden's J (tpr - fpr).
optimal_idx = int(np.argmax((tpr - fpr)[1:])) + 1
optimal_threshold = float(thresholds[optimal_idx])
print(f"TEST ROC AUC: {roc_auc}")
print(f"Optimal threshold: {optimal_threshold}")
best["optimal_threshold"] = optimal_threshold
# --------------------------------------------------------------------------

TEST ROC AUC: 0.5078703703703704
Optimal threshold: 0.41579500370742006


In [148]:
# Display the search report (mean/std test score and params per candidate,
# indexed by rank) produced by the tuning run above.
hp_processor_output["report"]

Unnamed: 0_level_0,mean_test_score,std_test_score,params
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.496858,0.033049,"{'n_estimators': 200, 'min_samples_split': 10,..."
2,0.494092,0.033586,"{'n_estimators': 50, 'min_samples_split': 2, '..."
3,0.485906,0.041223,"{'n_estimators': 50, 'min_samples_split': 2, '..."
4,0.485758,0.033434,"{'n_estimators': 100, 'min_samples_split': 10,..."
5,0.484808,0.034748,"{'n_estimators': 50, 'min_samples_split': 10, ..."
6,0.482309,0.035975,"{'n_estimators': 200, 'min_samples_split': 10,..."
7,0.478916,0.034322,"{'n_estimators': 50, 'min_samples_split': 5, '..."
8,0.478876,0.033949,"{'n_estimators': 100, 'min_samples_split': 5, ..."
9,0.478324,0.033001,"{'n_estimators': 200, 'min_samples_split': 2, ..."
10,0.476178,0.037257,"{'n_estimators': 50, 'min_samples_split': 5, '..."


In [149]:
# Read hpt_report_latest.csv back from the 08_reporting folder under the input bucket.
# NOTE(review): no visible cell writes this file — presumably it is produced by
# a separate pipeline run; confirm its provenance.
pd.read_csv(f"{input_bucket_path}08_reporting/hpt_report_latest.csv")

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
0,1,0.507536,0.04607,"{'n_estimators': 50, 'min_samples_split': 2, '..."
1,2,0.496255,0.054345,"{'n_estimators': 200, 'min_samples_split': 10,..."
2,3,0.495378,0.044765,"{'n_estimators': 50, 'min_samples_split': 2, '..."
3,4,0.492133,0.037279,"{'n_estimators': 200, 'min_samples_split': 2, ..."
4,5,0.489902,0.031421,"{'n_estimators': 50, 'min_samples_split': 5, '..."
5,6,0.484062,0.032807,"{'n_estimators': 50, 'min_samples_split': 10, ..."
6,7,0.483561,0.040112,"{'n_estimators': 50, 'min_samples_split': 5, '..."
7,8,0.482946,0.039422,"{'n_estimators': 100, 'min_samples_split': 5, ..."
8,9,0.481637,0.038767,"{'n_estimators': 200, 'min_samples_split': 10,..."
9,10,0.48051,0.034346,"{'n_estimators': 100, 'min_samples_split': 10,..."
