In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import mlflow
import mlflow.lightgbm

import sys
import os
# Make the project packages importable without hard-coding a user-specific
# location. The notebook runs from <project>/notebook, so the project root is
# the parent of the kernel's working directory.
_project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
for _path in (
    _project_root,
    os.path.join(_project_root, 'src'),
    os.path.join(_project_root, 'utils'),
):
    # Skip entries that are already present so re-running the cell does not
    # keep growing sys.path with duplicates.
    if _path not in sys.path:
        sys.path.append(_path)

#from src.lightgbm_model_v3 import lightgbm_dev
from src.lightgbm_model import *
from utils.metrics import *
from utils.plots import *
from utils.model_io import *

import warnings
# Suppress every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides potentially useful warnings (e.g. deprecations);
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [2]:
# Display the module search path to confirm the project directories were appended.
sys.path

['/Library/Frameworks/Python.framework/Versions/3.9/lib/python39.zip',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/venv1/lib/python3.9/site-packages',
 '/Users/ben/Desktop/py_proj/account_funds_prediction',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/src',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/utils']

In [3]:
# Display the kernel's current working directory.
os.getcwd()

'/Users/ben/Desktop/py_proj/account_funds_prediction/notebook'

## 1. Data prep

In [4]:
# Load the modelling dataset. The path is resolved relative to the notebook's
# working directory (<project>/notebook) rather than a user-specific absolute
# path, so the notebook runs from any checkout of the project.
df = pd.read_csv(os.path.join(os.pardir, 'data', 'df.csv'))

In [5]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,id,funds_now,funds_after_6months,age,tenure,credit_score,annual_income,mtg_balance,credit_card_balance,loan_balance,...,fea41,fea42,fea43,fea44,fea45,fea46,fea47,fea48,fea49,fea50
0,1,580888.6,411571.0,69,44,443,158191,211023.6,13320.2,11516.5,...,10232.7,26721.0,39983.1,45532.4,11885.7,1,1,1,0,1
1,2,372438.7,157944.4,32,43,496,208050,19841.3,45983.3,119858.9,...,-26726.7,-19164.4,22911.3,-9997.3,3621.1,1,0,0,0,0
2,3,645639.9,22235.5,89,55,527,148731,854416.8,41016.9,31300.6,...,-15244.7,4683.6,32100.5,-32710.4,39201.2,1,1,1,1,0
3,4,1191515.2,316512.4,78,1,628,263843,850936.3,2612.9,73886.1,...,18567.4,28572.1,-44307.5,-43039.7,156.0,1,1,1,1,0
4,5,348260.8,0.0,38,16,764,264430,979270.8,5583.3,252329.3,...,-15909.1,-33315.5,-32351.8,15200.4,-7349.3,0,0,1,1,0


In [6]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(200000, 52)

In [7]:
# Separate the target from the predictors; the 'id' column is excluded from
# the feature matrix along with the target itself.
target_col = 'funds_after_6months'
y = df[target_col]
X = df.drop(columns=[target_col, 'id'])

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 2. Baseline Model

In [8]:
# Fixed hyperparameters used for the baseline model (no tuning).
baseline_params = {
    'max_depth': 3,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}

# Hyperopt search space passed to lightgbm_dev_v3 for tuning.
# hp.quniform(label, low, high, q) draws round(uniform(low, high) / q) * q as a
# float; scope.int casts the draw to an integer.
search_space = {
    # Integer parameters
    'max_depth': scope.int(hp.quniform('max_depth', 3,10,1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50,300,50)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 5)),  
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 5, 100, 5)),  

    # Decimal parameters (each drawn uniformly from the given interval)
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),  # NOTE(review): linear-scale sampling on [0.01, 1] favours large rates; a log-uniform prior may cover this range better
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),  # share of features sampled for each tree
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1.0),  # NOTE(review): LightGBM only bags rows when bagging_freq > 0 and none is set here - confirm lightgbm_dev_v3 sets it, otherwise this dimension has no effect
    'lambda_l1': hp.uniform('lambda_l1', 0, 1),  # L1 regularization strength
}


In [9]:
# Configure the baseline experiment. Judging by the recorded results,
# baseline_ind=1 fits the model with baseline_params instead of searching.
lgb_dev1 = lightgbm_dev_v3(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    baseline_params=baseline_params,
    baseline_ind=1,
    max_evals=2,
    search_space=search_space,
)

In [10]:
# Display the configured baseline experiment object.
lgb_dev1

<src.lightgbm_model.lightgbm_dev_v3 at 0x17efd33d0>

In [11]:
# Fit the baseline model. The result is kept whole here; later cells read the
# MLflow run ID from position 1 of the returned tuple.
lgb1 = lgb_dev1.run_model()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003358 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10335
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 50
[LightGBM] [Info] Start training from score 177372.881641




MLflow Run ID: ce3d0937fc8e47b6abcb93a645208a6b


In [12]:
# Display the value returned by run_model() for the baseline experiment.
lgb1

(LGBMRegressor(feature_fraction=0.5, max_depth=3, n_estimators=50),
 'ce3d0937fc8e47b6abcb93a645208a6b')

In [13]:
# Second element of the run_model() result: the MLflow run ID of the baseline fit.
lgb1_id = lgb1[1]
lgb1_id

'ce3d0937fc8e47b6abcb93a645208a6b'

In [14]:
# Look up the baseline run in MLflow by its run ID and grab its logged metrics.
run = mlflow.get_run(lgb1_id)
logged_metrics = run.data.metrics

# Keep each metric in its own variable; a metric that was never logged is None.
rmse_train, rmse_test = logged_metrics.get("rmse_train"), logged_metrics.get("rmse_test")
r2_train, r2_test = logged_metrics.get("r2_train"), logged_metrics.get("r2_test")
pr_train, pr_test = logged_metrics.get("powerratio_train"), logged_metrics.get("powerratio_test")

# Report train/test pairs, one metric per line.
report_lines = [
    f"RMSE Train: {rmse_train}",
    f"RMSE Test: {rmse_test}",
    f"R2 Train: {r2_train}",
    f"R2 Test: {r2_test}",
    f"Power Ratio Train: {pr_train}",
    f"Power Ratio Test: {pr_test}",
]
print("\n".join(report_lines))

RMSE Train: 89797.37195508531
RMSE Test: 88598.92600274534
R2 Train: 0.8255338855969169
R2 Test: 0.8293940009814278
Power Ratio Train: 0.9398507959736816
Power Ratio Test: 0.9390895402733672


## 3. HyperOpt (32 fits)

In [15]:
# Configure the tuning experiment: same data as the baseline, but with
# baseline_ind=0 and a budget of 32 hyperopt evaluations over search_space.
lgb_dev2 = lightgbm_dev_v3(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    baseline_params=baseline_params,
    baseline_ind=0,
    max_evals=32,
    search_space=search_space,
)

In [16]:
%%time
# Run the hyperparameter search configured above and time the whole cell.
# NOTE(review): the recorded outputs contain two different "best param" sets,
# which suggests the search is not seeded - confirm whether lightgbm_dev_v3
# can take a seed so the selected model is reproducible.
lgb2 = lgb_dev2.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016976 seconds.                                                                                                                            
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10335                                                                                                                                                                                                 
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 50                                                                                                                                      
[LightGBM] [Info] Start training from score 177372.881641                                                                                                                                                                          
[LightGBM] [Info] Auto-choosin



MLflow Run ID: 66e92958f33f473dbe15bed75fa2ea87
best param: {'num_leaves': 65, 'max_depth': 10, 'n_estimators': 50, 'min_data_in_leaf': 95, 'learning_rate': 0.09075215255464458, 'feature_fraction': 0.8833215074339716, 'bagging_fraction': 0.9858297680175077, 'lambda_l1': 0.04629805689722438}
CPU times: user 5min 38s, sys: 1min 41s, total: 7min 19s
Wall time: 1min 7s




MLflow Run ID: 7acde082c46148dbb0f568f9e9350173
best param: {'num_leaves': 45, 'max_depth': 5, 'n_estimators': 300, 'min_data_in_leaf': 35, 'learning_rate': np.float64(0.730236549985059), 'feature_fraction': np.float64(0.8425270791814766), 'bagging_fraction': np.float64(0.5135028962808227), 'lambda_l1': np.float64(0.9721243016476777)}
CPU times: user 28 s, sys: 13.1 s, total: 41.1 s
Wall time: 11.8 s


In [17]:
# Display the value returned by run_model() for the tuning experiment.
lgb2

(LGBMRegressor(bagging_fraction=0.9858297680175077,
               feature_fraction=0.8833215074339716,
               lambda_l1=0.04629805689722438, learning_rate=0.09075215255464458,
               max_depth=10, min_data_in_leaf=95, n_estimators=50,
               num_leaves=65),
 '66e92958f33f473dbe15bed75fa2ea87')

In [18]:
# Look up the tuned run in MLflow (its ID is the second element of lgb2)
# and grab its logged metrics.
run_id = lgb2[1]
run = mlflow.get_run(run_id)
logged_metrics = run.data.metrics

# Keep each metric in its own variable; a metric that was never logged is None.
rmse_train, rmse_test = logged_metrics.get("rmse_train"), logged_metrics.get("rmse_test")
r2_train, r2_test = logged_metrics.get("r2_train"), logged_metrics.get("r2_test")
pr_train, pr_test = logged_metrics.get("powerratio_train"), logged_metrics.get("powerratio_test")

# Report train/test pairs, one metric per line.
report_lines = [
    f"RMSE Train: {rmse_train}",
    f"RMSE Test: {rmse_test}",
    f"R2 Train: {r2_train}",
    f"R2 Test: {r2_test}",
    f"Power Ratio Train: {pr_train}",
    f"Power Ratio Test: {pr_test}",
]
print("\n".join(report_lines))

RMSE Train: 84079.87378104505
RMSE Test: 85510.41028269759
R2 Train: 0.8470434998729751
R2 Test: 0.8410811682436314
Power Ratio Train: 0.9428457451925448
Power Ratio Test: 0.9404949647746761


## 4. Save the best HyperOpt model

In [19]:
# First element of the run_model() result: the fitted estimator from the tuning run.
lgb_model = lgb2[0]
lgb_model

LGBMRegressor(bagging_fraction=0.9858297680175077,
              feature_fraction=0.8833215074339716,
              lambda_l1=0.04629805689722438, learning_rate=0.09075215255464458,
              max_depth=10, min_data_in_leaf=95, n_estimators=50,
              num_leaves=65)

In [20]:
# Persist the tuned model under <project>/models. The directory is derived
# from the notebook's working directory (<project>/notebook) instead of a
# user-specific absolute path; abspath keeps the path absolute.
save_model(lgb_model, "lgb_model_v2.pkl", os.path.abspath(os.path.join(os.pardir, "models")))

Model saved to /Users/ben/Desktop/py_proj/account_funds_prediction/models/lgb_model_v2.pkl


In [21]:
# Round-trip check: read the saved pkl file back from <project>/models. The
# directory is derived from the notebook's working directory instead of a
# user-specific absolute path.
model_test = load_model("lgb_model_v2.pkl", os.path.abspath(os.path.join(os.pardir, "models")))

Model loaded from /Users/ben/Desktop/py_proj/account_funds_prediction/models/lgb_model_v2.pkl


In [22]:
# Display the reloaded model to compare its parameters with the one that was saved.
model_test

LGBMRegressor(bagging_fraction=0.9858297680175077,
              feature_fraction=0.8833215074339716,
              lambda_l1=0.04629805689722438, learning_rate=0.09075215255464458,
              max_depth=10, min_data_in_leaf=95, n_estimators=50,
              num_leaves=65)