In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import os
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append('/Users/ben/Desktop/py_proj/account_funds_prediction/src')
from lightgbm_model_v2 import lightgbm_dev

In [2]:
sys.path

['/opt/anaconda3/envs/myenv1/lib/python310.zip',
 '/opt/anaconda3/envs/myenv1/lib/python3.10',
 '/opt/anaconda3/envs/myenv1/lib/python3.10/lib-dynload',
 '',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/venv1/lib/python3.10/site-packages',
 '/Users/ben/Desktop/py_proj/account_funds_prediction/src']

In [3]:
os.getcwd()

'/Users/ben/Desktop/py_proj/account_funds_prediction/notebook'

## 1. Data prep

In [4]:
df = pd.read_csv('/Users/ben/Desktop/py_proj/account_funds_prediction/data/df_v1')

In [5]:
df.head()

Unnamed: 0,bal_future,bal,age,tenure,credit_score,annual_income,mtg_balance,credit_card_balance,col8,col9,...,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20
0,121958,245279,87,25,781,172012,960679,3446,777,586,...,211,442,62,345,498,517,89,360,902,78
1,671155,178356,65,31,773,143327,153601,11009,248,194,...,53,946,387,843,221,424,26,940,249,383
2,131932,752233,58,13,302,161961,441341,41415,272,334,...,954,37,376,843,583,89,450,337,709,404
3,365838,895983,38,34,497,152383,554472,16728,174,722,...,96,151,402,808,198,354,452,16,648,49
4,259178,63724,82,32,786,106959,748662,1960,8,559,...,276,793,994,592,36,272,89,503,925,999


In [6]:
X = df.drop(columns=['bal'])
y = df['bal_future']

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

## 2. Model Training

### 2.1 Baseline model

In [8]:
baseline_params = {
    'max_depth': 3,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}

search_space = {
    # Integer parameters
    'max_depth': scope.int(hp.quniform('max_depth', 3,10,1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50,300,50)),
        'num_leaves': scope.int(hp.quniform('num_leaves', 20, 100, 5)),  
    'min_data_in_leaf': scope.int(hp.quniform('min_data_in_leaf', 5, 100, 5)),  

    # Decimal parameters
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),  # Typical range for learning rate
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),  # Typical range for feature_fraction
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 1.0),  # Typical range for bagging_fraction
    'lambda_l1': hp.uniform('lambda_l1', 0, 1),  # Typical range for lambda_l1 (L1 regularization)
}


In [9]:
lgb_dev = lightgbm_dev(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=1,
                   max_evals=2,
                   search_space=search_space)

In [10]:
lgb1 = lgb_dev.run_model()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4713
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20
[LightGBM] [Info] Start training from score 500775.555514


In [11]:
# Predict on the train and test set
y_train_pred = lgb1.predict(X_train)
y_test_pred = lgb1.predict(X_test)

# Calculate RMSE for training and test sets
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Calculate R^2 for training and test sets
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Output the results
print(f"Training RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Test R^2: {test_r2}")

Training RMSE: 30129.585462629297
Test RMSE: 30185.957005566383
Training R^2: 0.989155310883988
Test R^2: 0.9890146038296852


### 2.2 HyperOpt (2 fits)

In [12]:
lgb_dev2 = lightgbm_dev(X_train=X_train, y_train=y_train, 
                   X_test=X_test, y_test=y_test,
                   baseline_params = baseline_params,
                   baseline_ind=0,
                   max_evals=2,
                   search_space=search_space)

In [13]:
lgb2 = lgb_dev2.run_model()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002882 seconds.                                       
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                             
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20                                                  
[LightGBM] [Info] Start training from score 500775.555514                                                                                     


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003055 seconds.                                       
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713                                                                                                             
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20                                                  
[LightGBM] [Info] Start training from score 500775.555514                                                                                     




100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.21trial/s, best loss: 1575018.6466337298]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4713
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 20
[LightGBM] [Info] Start training from score 500775.555514
best param: {'num_leaves': 80, 'max_depth': 7, 'n_estimators': 50, 'min_data_in_leaf': 50, 'learning_rate': np.float64(0.8134418050419463), 'feature_fraction': np.float64(0.9206799392620909), 'bagging_fraction': np.float64(0.8093672549102126), 'lambda_l1': np.float64(0.20792170743318383)}


In [14]:
# Predict on the train and test set
y_train_pred = lgb2.predict(X_train)
y_test_pred = lgb2.predict(X_test)

# Calculate RMSE for training and test sets
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Calculate R^2 for training and test sets
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Output the results
print(f"Training RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Test R^2: {test_r2}")

Training RMSE: 1044.8653889576362
Test RMSE: 1254.9974687758258
Training R^2: 0.9999869577812355
Test R^2: 0.999981011462126
