In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

#import xgboost as xgb
#import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import mlflow
import mlflow.sklearn
#import mlflow.lightgbm

import sys
import os
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction')
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction/src')
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction/utils')

from src.lightgbm_model import *
from src.xgboost_model import *
from src.random_forest_model import *

from utils.metrics import *
from utils.plots import *
from utils.model_io import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
sys.path

['/Library/Frameworks/Python.framework/Versions/3.9/lib/python39.zip',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/venv1/lib/python3.9/site-packages',
 '/Users/ben/Desktop/py_proj/account_funds_prediction',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/src',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/utils']

In [3]:
os.getcwd()

'/Users/ben/Desktop/py_proj/account_funds_prediction/notebook'

## 1. Data prep

In [4]:
df = pd.read_csv('/Users/ben/Desktop/py_proj/account_funds_prediction/data/df.csv')

In [5]:
df.head()

Unnamed: 0,id,funds_now,funds_after_6months,age,tenure,credit_score,annual_income,mtg_balance,credit_card_balance,loan_balance,...,fea41,fea42,fea43,fea44,fea45,fea46,fea47,fea48,fea49,fea50
0,1,580888.6,411571.0,69,44,443,158191,211023.6,13320.2,11516.5,...,10232.7,26721.0,39983.1,45532.4,11885.7,1,1,1,0,1
1,2,372438.7,157944.4,32,43,496,208050,19841.3,45983.3,119858.9,...,-26726.7,-19164.4,22911.3,-9997.3,3621.1,1,0,0,0,0
2,3,645639.9,22235.5,89,55,527,148731,854416.8,41016.9,31300.6,...,-15244.7,4683.6,32100.5,-32710.4,39201.2,1,1,1,1,0
3,4,1191515.2,316512.4,78,1,628,263843,850936.3,2612.9,73886.1,...,18567.4,28572.1,-44307.5,-43039.7,156.0,1,1,1,1,0
4,5,348260.8,0.0,38,16,764,264430,979270.8,5583.3,252329.3,...,-15909.1,-33315.5,-32351.8,15200.4,-7349.3,0,0,1,1,0


In [6]:
df.shape

(200000, 52)

In [7]:
# train test split
X = df.drop(columns=['id', 'funds_after_6months'])
y = df['funds_after_6months']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

## 2. Baseline Model

In [8]:
baseline_params = {
    'max_depth': 3,
    'n_estimators': 50,
    'max_features': 'sqrt'

}

In [9]:
rf_dev1 = random_forest_dev_v1(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=1,
                   max_evals=2,
                   search_space={})

In [10]:
%%time
rf1 = rf_dev1.run_model()



MLflow Run ID: 660b64727bb8424e8cc34f62a700727a
CPU times: user 7.39 s, sys: 163 ms, total: 7.55 s
Wall time: 9.07 s


In [11]:
rf1

(RandomForestRegressor(max_depth=3, max_features='sqrt', n_estimators=50),
 '660b64727bb8424e8cc34f62a700727a')

In [12]:
# Fetch the run details using MLflow API
rf1_id = rf1[1]
run = mlflow.get_run(rf1_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 172825.77862312406
RMSE Test: 172385.63985754168
R2 Train: 0.3537495181240784
R2 Test: 0.3541382666119016
Power Ratio Train: 0.9047367125219207
Power Ratio Test: 0.9027616293593101


## 3. HyperOpt (32 fits)

In [13]:
# Define the parameter space
search_space = {

    'max_depth': scope.int( hp.quniform('max_depth', 3,8,1) ),  # Max depth
    'n_estimators': scope.int( hp.quniform('n_estimators', 50,300,50) ),  # Number of tree estimators
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'oob_score': hp.choice('oob_score', [True, False])

}

In [14]:
rf_dev2 = random_forest_dev_v1(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=0,
                   max_evals=32,
                   search_space=search_space)

In [15]:
%%time
rf2 = rf_dev2.run_model()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [1:10:42<00:00, 132.57s/trial, best loss: 6979333496.120527]




MLflow Run ID: 5e88e0e7c5674615a1378e026281c93d
best param: {'max_depth': 8, 'n_estimators': 100, 'min_samples_split': 5, 'max_features': 3, 'oob_score': 1}
CPU times: user 1h 10min 22s, sys: 18.9 s, total: 1h 10min 41s
Wall time: 1h 11min


In [16]:
rf2

(RandomForestRegressor(max_depth=8, max_features=3, min_samples_split=5,
                       oob_score=1),
 '5e88e0e7c5674615a1378e026281c93d')

In [17]:
# Fetch the run details using MLflow API
run_id = rf2[1]
run = mlflow.get_run(run_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 172067.8623500112
RMSE Test: 173132.65554867193
R2 Train: 0.3594052687796587
R2 Test: 0.34852858573059664
Power Ratio Train: 0.885160673344262
Power Ratio Test: 0.8784209797566007


## 4. Save the best HyperOpt model

In [18]:
rf_model = rf2[0]
rf_model

RandomForestRegressor(max_depth=8, max_features=3, min_samples_split=5,
                      oob_score=1)

In [19]:
save_model(rf_model, "rf_model.pkl", "/Users/ben/Desktop/py_proj/account_funds_prediction/models")

Model saved to /Users/ben/Desktop/py_proj/account_funds_prediction/models/rf_model.pkl


In [20]:
# test reading the pkl file
model_test = load_model("rf_model.pkl", "/Users/ben/Desktop/py_proj/account_funds_prediction/models")
model_test

Model loaded from /Users/ben/Desktop/py_proj/account_funds_prediction/models/rf_model.pkl


RandomForestRegressor(max_depth=8, max_features=3, min_samples_split=5,
                      oob_score=1)