In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import mlflow
import mlflow.lightgbm

import sys
import os
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction')
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction/src')

#from src.lightgbm_model_v3 import lightgbm_dev
from src.lightgbm_model import lightgbm_dev_v2
from utils.metrics import PowerRatio

import warnings
warnings.filterwarnings("ignore")

In [2]:
sys.path

['/Library/Frameworks/Python.framework/Versions/3.9/lib/python39.zip',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/venv1/lib/python3.9/site-packages',
 '/Users/ben/Desktop/py_proj/account_funds_prediction',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/src']

In [3]:
os.getcwd()

'/Users/ben/Desktop/py_proj/account_funds_prediction/notebook'

## 1. Data prep

In [4]:
df = pd.read_csv('/Users/ben/Desktop/py_proj/account_funds_prediction/data/df_v1')

In [5]:
df.head()

Unnamed: 0,bal_future,bal,age,tenure,credit_score,annual_income,mtg_balance,credit_card_balance,col8,col9,...,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20
0,121958,245279,87,25,781,172012,960679,3446,777,586,...,211,442,62,345,498,517,89,360,902,78
1,671155,178356,65,31,773,143327,153601,11009,248,194,...,53,946,387,843,221,424,26,940,249,383
2,131932,752233,58,13,302,161961,441341,41415,272,334,...,954,37,376,843,583,89,450,337,709,404
3,365838,895983,38,34,497,152383,554472,16728,174,722,...,96,151,402,808,198,354,452,16,648,49
4,259178,63724,82,32,786,106959,748662,1960,8,559,...,276,793,994,592,36,272,89,503,925,999


In [6]:
X = df.drop(columns=['bal'])
y = df['bal_future']

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

## 2. Model Training

### 2.1 Baseline model

In [8]:
baseline_params = {
    'max_depth': 3,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}

search_space = {
    # Integer parameters
    'max_depth': scope.int(hp.quniform('max_depth', 3,10,1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50,300,50)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 5)),  
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 5, 100, 5)),  

    # Decimal parameters
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),  # Typical range for learning rate
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),  # Typical range for feature_fraction
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1.0),  # Typical range for bagging_fraction
    'lambda_l1': hp.uniform('lambda_l1', 0, 1),  # Typical range for lambda_l1 (L1 regularization)
}


In [9]:
lgb_dev1 = lightgbm_dev_v2(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=1,
                   max_evals=2,
                   search_space=search_space)

In [10]:
lgb_dev1

<src.lightgbm_model.lightgbm_dev_v2 at 0x28281e850>

In [11]:
lgb1 = lgb_dev1.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20
[LightGBM] [Info] Start training from score 500775.555514




MLflow Run ID: 7d775ab8e22747d58f24ae06db4803da


In [12]:
lgb1

(LGBMRegressor(feature_fraction=0.5, max_depth=3, n_estimators=50),
 '7d775ab8e22747d58f24ae06db4803da')

In [13]:
lgb1_id = lgb1[1]
lgb1_id

'7d775ab8e22747d58f24ae06db4803da'

In [14]:
# Fetch the run details using MLflow API
run = mlflow.get_run(lgb1_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 30129.585462629297
RMSE Test: 30185.957005566383
R2 Train: 0.989155310883988
R2 Test: 0.9890146038296852
Power Ratio Train: 0.9999343661175427
Power Ratio Test: 0.9999347809017975


### 2.2 HyperOpt (50 fits)

In [15]:
lgb_dev2 = lightgbm_dev_v2(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=0,
                   max_evals=50,
                   search_space=search_space)

In [16]:
%%time
lgb2 = lgb_dev2.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002402 seconds.                                         
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                               
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20                                                    
[LightGBM] [Info] Start training from score 500775.555514                                                                                       
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002803 seconds.                                         
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                               
[LightGBM] [In



MLflow Run ID: 3009e3e254784189bc5f6a38b71a7976
best param: {'num_leaves': 90, 'max_depth': 8, 'n_estimators': 100, 'min_data_in_leaf': 60, 'learning_rate': 0.32103084889981287, 'feature_fraction': 0.9292651688999145, 'bagging_fraction': 0.8360736970562084, 'lambda_l1': 0.15264476604938765}
CPU times: user 3min 44s, sys: 1min 38s, total: 5min 22s
Wall time: 1min 3s








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002122 seconds.                                                             
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                                                   
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20                                                                        
[LightGBM] [Info] Start training from score 500775.555514                                                                                                           








 50%|████████████████████████████████████████████████                                                | 1/2 [00:05<00:04,  4.21s/trial, best loss: 3078426.992595184]




100%|████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.18s/trial, best loss: 3078426.992595184]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20
[LightGBM] [Info] Start training from score 500775.555514




MLflow Run ID: 7acde082c46148dbb0f568f9e9350173
best param: {'num_leaves': 45, 'max_depth': 5, 'n_estimators': 300, 'min_data_in_leaf': 35, 'learning_rate': np.float64(0.730236549985059), 'feature_fraction': np.float64(0.8425270791814766), 'bagging_fraction': np.float64(0.5135028962808227), 'lambda_l1': np.float64(0.9721243016476777)}
CPU times: user 28 s, sys: 13.1 s, total: 41.1 s
Wall time: 11.8 s


In [17]:
lgb2

(LGBMRegressor(bagging_fraction=0.8360736970562084,
               feature_fraction=0.9292651688999145,
               lambda_l1=0.15264476604938765, learning_rate=0.32103084889981287,
               max_depth=8, min_data_in_leaf=60, num_leaves=90),
 '3009e3e254784189bc5f6a38b71a7976')

In [18]:
# Fetch the run details using MLflow API
run_id = lgb2[1]
run = mlflow.get_run(run_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 954.1652535102803
RMSE Test: 1184.8975819961552
R2 Train: 0.9999891237798805
R2 Test: 0.9999830734888431
Power Ratio Train: 0.9999973807248577
Power Ratio Test: 0.9999922032231844
