In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import mlflow
import mlflow.lightgbm

import sys
import os
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction')
sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction/src')

#from src.lightgbm_model_v3 import lightgbm_dev
from src.lightgbm_model import lightgbm_dev_v2
from utils.metrics import PowerRatio

import warnings
warnings.filterwarnings("ignore")

In [2]:
sys.path

['/Library/Frameworks/Python.framework/Versions/3.9/lib/python39.zip',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9',
 '/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/lib-dynload',
 '',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/venv1/lib/python3.9/site-packages',
 '/Users/ben/Desktop/py_proj/account_funds_prediction',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/src']

In [3]:
os.getcwd()

'/Users/ben/Desktop/py_proj/account_funds_prediction/notebook'

## 1. Data prep

In [4]:
df = pd.read_csv('/Users/ben/Desktop/py_proj/account_funds_prediction/data/df_v2')

In [5]:
df.head()

Unnamed: 0,bal_now,age,tenure,credit_score,annual_income,inflow,outflow,mtg_balance,credit_card_balance,loan_balance,...,fea27,fea28,fea29,fea30,fea31,fea32,fea33,fea34,fea35,bal_after_6mon
0,6423388,27,24,837,191218,19428,1685,758311,15470,68750,...,48203,40848,17735,92914,7025,38486,43083,14451,68160,6320044.0
1,6550634,22,43,585,239710,5914,7998,813964,7092,458988,...,13815,4393,90459,8199,21066,29089,46831,4332,24124,6418258.0
2,4304572,47,25,616,127094,9698,8166,825144,33306,168257,...,22362,12895,48617,64084,20536,61102,10177,88745,53813,4173464.0
3,2234489,65,2,497,79674,17023,2248,74580,4987,64865,...,18514,79026,51272,6730,71238,74501,57498,45620,55350,2262496.0
4,9958614,38,34,623,132976,810,4576,7523,1726,252167,...,55122,98179,49028,55157,51188,52524,75274,18425,83985,9996288.0


In [7]:
X = df.drop(columns=['bal_after_6mon'])
y = df['bal_after_6mon']

In [8]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

## 2. Model Training

### 2.1 Baseline model

In [9]:
baseline_params = {
    'max_depth': 3,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}

search_space = {
    # Integer parameters
    'max_depth': scope.int(hp.quniform('max_depth', 3,10,1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50,300,50)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 5)),  
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 5, 100, 5)),  

    # Decimal parameters
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),  # Typical range for learning rate
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),  # Typical range for feature_fraction
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1.0),  # Typical range for bagging_fraction
    'lambda_l1': hp.uniform('lambda_l1', 0, 1),  # Typical range for lambda_l1 (L1 regularization)
}


In [10]:
lgb_dev1 = lightgbm_dev_v2(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=1,
                   max_evals=2,
                   search_space=search_space)

In [11]:
lgb_dev1

<src.lightgbm_model.lightgbm_dev_v2 at 0x289a49d30>

In [12]:
lgb1 = lgb_dev1.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11592
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 50
[LightGBM] [Info] Start training from score 4906522.606439




MLflow Run ID: a59da3af62d547a89f67c9554ea72175


In [13]:
lgb1

(LGBMRegressor(feature_fraction=0.5, max_depth=3, n_estimators=50),
 'a59da3af62d547a89f67c9554ea72175')

In [14]:
lgb1_id = lgb1[1]
lgb1_id

'a59da3af62d547a89f67c9554ea72175'

In [15]:
# Fetch the run details using MLflow API
run = mlflow.get_run(lgb1_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 270882.6128560889
RMSE Test: 272238.4425419598
R2 Train: 0.9911632315521512
R2 Test: 0.9910722366311477
Power Ratio Train: 0.9999533628349826
Power Ratio Test: 0.9999527159450317


### 2.2 HyperOpt (50 fits)

In [16]:
lgb_dev2 = lightgbm_dev_v2(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=0,
                   max_evals=50,
                   search_space=search_space)

In [17]:
%%time
lgb2 = lgb_dev2.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011929 seconds.                                         
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11592                                                                                                              
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 50                                                   
[LightGBM] [Info] Start training from score 4906522.606439                                                                                      
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013511 seconds.                                         
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11592                                                                                                              
[LightGBM] [In



MLflow Run ID: 829458b387ef42b4888a51822580f459
best param: {'num_leaves': 20, 'max_depth': 10, 'n_estimators': 300, 'min_data_in_leaf': 100, 'learning_rate': 0.23343967842553467, 'feature_fraction': 0.8830460359876731, 'bagging_fraction': 0.43060883678575035, 'lambda_l1': 0.7435503912310693}
CPU times: user 11min 55s, sys: 2min 43s, total: 14min 38s
Wall time: 2min 10s




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002122 seconds.                                                             
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                                                   
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20                                                                        
[LightGBM] [Info] Start training from score 500775.555514                                                                                                           








 50%|████████████████████████████████████████████████                                                | 1/2 [00:05<00:04,  4.21s/trial, best loss: 3078426.992595184]




100%|████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.18s/trial, best loss: 3078426.992595184]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20
[LightGBM] [Info] Start training from score 500775.555514




MLflow Run ID: 7acde082c46148dbb0f568f9e9350173
best param: {'num_leaves': 45, 'max_depth': 5, 'n_estimators': 300, 'min_data_in_leaf': 35, 'learning_rate': np.float64(0.730236549985059), 'feature_fraction': np.float64(0.8425270791814766), 'bagging_fraction': np.float64(0.5135028962808227), 'lambda_l1': np.float64(0.9721243016476777)}
CPU times: user 28 s, sys: 13.1 s, total: 41.1 s
Wall time: 11.8 s


In [18]:
lgb2

(LGBMRegressor(bagging_fraction=0.43060883678575035,
               feature_fraction=0.8830460359876731, lambda_l1=0.7435503912310693,
               learning_rate=0.23343967842553467, max_depth=10,
               min_data_in_leaf=100, n_estimators=300, num_leaves=20),
 '829458b387ef42b4888a51822580f459')

In [20]:
# Fetch the run details using MLflow API
run_id = lgb2[1]
run = mlflow.get_run(run_id)

# Retrieve the metrics from the run
rmse_train = run.data.metrics.get("rmse_train", None)
rmse_test = run.data.metrics.get("rmse_test", None)
r2_train = run.data.metrics.get("r2_train", None)
r2_test = run.data.metrics.get("r2_test", None)
pr_train = run.data.metrics.get("powerratio_train", None)
pr_test = run.data.metrics.get("powerratio_test", None)

# Print the retrieved metrics
print(f"RMSE Train: {rmse_train}")
print(f"RMSE Test: {rmse_test}")
print(f"R2 Train: {r2_train}")
print(f"R2 Test: {r2_test}")
print(f"Power Ratio Train: {pr_train}")
print(f"Power Ratio Test: {pr_test}")

RMSE Train: 12113.402091569065
RMSE Test: 12812.746794690169
R2 Train: 0.9999823289041604
R2 Test: 0.9999802244643838
Power Ratio Train: 0.9999956238937182
Power Ratio Test: 0.9999951161887085
