In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

#import xgboost as xgb
#import lightgbm as lgb

from hyperopt import hp, tpe, fmin, Trials
from hyperopt.pyll.base import scope

import mlflow
#import mlflow.xgboost
#import mlflow.lightgbm

import sys
import os

# Root of the project checkout.
# NOTE(review): machine-specific absolute path; it has to be edited before the
# notebook can run on another machine.
PROJECT_ROOT = '/Users/ben/Desktop/py_proj/account_funds_prediction'

# Make the project root and its `src/` and `utils/` folders importable so the
# `from src...` / `from utils...` imports that follow can resolve.
# Entries are appended in this order and only when missing, so re-running the
# cell does not pile up duplicate entries on sys.path.
for _import_dir in (PROJECT_ROOT, f'{PROJECT_ROOT}/src', f'{PROJECT_ROOT}/utils'):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

from src.lightgbm_model import *
from src.xgboost_model import *
from src.linear_regression_model import *

from utils.metrics import *
from utils.plots import *
from utils.model_io import *

import warnings
# Install a blanket "ignore" filter: no category or message restriction is given,
# so warnings of every kind are suppressed from this point on.
# NOTE(review): this also hides warnings raised by the modelling libraries used
# below; consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

import statsmodels.api as sm

## 1. Data prep

In [2]:
# Load the modelling dataset from the project's data folder.
# NOTE(review): machine-specific absolute path.
raw_data_csv = '/Users/ben/Desktop/py_proj/account_funds_prediction/data/df.csv'
df = pd.read_csv(raw_data_csv)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,funds_now,funds_after_6months,age,tenure,credit_score,annual_income,mtg_balance,credit_card_balance,loan_balance,...,fea41,fea42,fea43,fea44,fea45,fea46,fea47,fea48,fea49,fea50
0,1,580888.6,411571.0,69,44,443,158191,211023.6,13320.2,11516.5,...,10232.7,26721.0,39983.1,45532.4,11885.7,1,1,1,0,1
1,2,372438.7,157944.4,32,43,496,208050,19841.3,45983.3,119858.9,...,-26726.7,-19164.4,22911.3,-9997.3,3621.1,1,0,0,0,0
2,3,645639.9,22235.5,89,55,527,148731,854416.8,41016.9,31300.6,...,-15244.7,4683.6,32100.5,-32710.4,39201.2,1,1,1,1,0
3,4,1191515.2,316512.4,78,1,628,263843,850936.3,2612.9,73886.1,...,18567.4,28572.1,-44307.5,-43039.7,156.0,1,1,1,1,0
4,5,348260.8,0.0,38,16,764,264430,979270.8,5583.3,252329.3,...,-15909.1,-33315.5,-32351.8,15200.4,-7349.3,0,0,1,1,0


In [4]:
# Separate the target from the predictors. `id` is removed together with the
# target so the row identifier is not offered to the models as a feature.
non_feature_columns = ['funds_after_6months', 'id']
y = df['funds_after_6months']
X = df.drop(columns=non_feature_columns)

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 2. Baseline model

In [6]:
# Configure the project's linear-regression wrapper for the baseline run.
split_data = dict(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
lr_dev1 = linear_regression_v2(
    **split_data,
    model_type='baseline',
    significance_level=0.05,
)

In [7]:
# Run the baseline configuration. The return value is used directly as a model
# object below (`lr1.predict(...)`), whereas the 'backward' / 'forward' runs
# return something that is indexed with [0] and [1].
lr1 = lr_dev1.run_model()

In [8]:
# Add the intercept column to both design matrices before scoring.
# NOTE(review): presumably the baseline was fitted on `sm.add_constant(X_train)`
# as well - confirm in linear_regression_v2.
X_train_with_intercept, X_test_with_intercept = (
    sm.add_constant(features) for features in (X_train, X_test)
)

# In-sample and hold-out predictions of the baseline model.
y_train_pred = lr1.predict(X_train_with_intercept)
y_test_pred = lr1.predict(X_test_with_intercept)

In [9]:
## RMSE
# Square root of the mean squared error, for the training and the test
# predictions of the baseline model.
rmse_train, rmse_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training RMSE: {rmse_train:.1f}, Test RMSE: {rmse_test:.1f} ")

 Training RMSE: 85655.1, Test RMSE: 85307.0 


In [10]:
## R2
# Coefficient of determination on the training and the test split (baseline model).
r2_train, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training R2: {r2_train:.3f}, Test R2: {r2_test:.3f} ")

 Training R2: 0.841, Test R2: 0.842 


In [11]:
# power ratio
# Note the argument order: predictions first, actuals second (the reverse of the
# sklearn metric calls used elsewhere in this notebook).
# NOTE(review): PowerRatio presumably comes from the `utils.metrics` star-import.
pr_train, pr_test = (
    PowerRatio(predicted, actual)
    for predicted, actual in ((y_train_pred, y_train), (y_test_pred, y_test))
)

print(f" Training Power Ratio: {pr_train:.3f}, Test Power Ratio: {pr_test:.3f}")

 Training Power Ratio: 0.941, Test Power Ratio: 0.940


## 3. Backward selection

In [12]:
# Same wrapper and data split as the baseline, now in backward-selection mode
# with a 0.05 significance level.
split_data = dict(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
lr_dev2 = linear_regression_v2(
    **split_data,
    model_type='backward',
    significance_level=0.05,
)

In [13]:
%%time
# Run backward selection. The result is indexed in the following cells:
# [0] is the fitted statsmodels results object, [1] the selected design matrix
# (including the `const` column).
lr2 = lr_dev2.run_model()

CPU times: user 38.9 s, sys: 14 s, total: 52.8 s
Wall time: 6.83 s


In [14]:
# First element of the backward-selection result: the fitted model.
lr2_model = lr2[0]
# Bare expression so the notebook displays the object's repr.
lr2_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x107828bb0>

In [15]:
# Second element of the backward-selection result: the design matrix restricted
# to the selected columns (it includes `const`). It is scored against `y_train`
# below, so it holds the training rows.
X_selected_backward = lr2[1]
X_selected_backward.head()

Unnamed: 0,const,funds_now,annual_income,mtg_balance,credit_card_balance,loan_balance,splc_balance,inflow,outflow,fea20,fea29,fea35
21269,1.0,811385.4,115474,226504.8,27880.9,34313.1,509701.4,52375.0,17607.8,74532.4,119813.5,-42397.8
187660,1.0,193423.4,245395,408844.8,40916.5,172262.6,562633.0,117076.6,5692.0,148676.2,7825.4,-35958.7
774,1.0,634050.2,59336,362281.7,1832.4,284187.4,479036.7,43909.2,122704.5,134178.7,120643.9,32801.9
184577,1.0,1748997.0,254866,612043.4,3604.5,100824.7,776705.9,178830.4,40894.1,136640.6,144138.3,-42864.8
37127,1.0,480552.0,160373,161387.3,35483.5,9448.0,646735.7,34353.3,15958.4,156185.7,79976.8,-11909.6


In [16]:
# predict train
# `X_selected_backward` is the selected training design matrix returned by the
# backward run; it already carries the `const` column.
y_train_pred = lr2_model.predict(X_selected_backward)

# predict test
# has_constant='add' forces the intercept column to be created. With the default
# ('skip') statsmodels returns the frame unchanged when it already contains a
# constant-valued column, and the column selection below would then fail with a
# KeyError on 'const'.
X_test_with_intercept = sm.add_constant(X_test, has_constant='add')
# Pick the model's columns, in the order of the training design matrix.
y_test_pred = lr2_model.predict(X_test_with_intercept[X_selected_backward.columns])

In [17]:
## RMSE
# Square root of the mean squared error, for the training and the test
# predictions of the backward-selection model.
rmse_train, rmse_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training RMSE: {rmse_train:.1f}, Test RMSE: {rmse_test:.1f} ")

 Training RMSE: 85664.9, Test RMSE: 85301.3 


In [18]:
## R2
# Coefficient of determination on the training and the test split
# (backward-selection model).
r2_train, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training R2: {r2_train:.3f}, Test R2: {r2_test:.3f} ")

 Training R2: 0.841, Test R2: 0.842 


In [19]:
# power ratio
# Predictions are passed first, actuals second (backward-selection model).
pr_train, pr_test = (
    PowerRatio(predicted, actual)
    for predicted, actual in ((y_train_pred, y_train), (y_test_pred, y_test))
)

print(f" Training Power Ratio: {pr_train:.3f}, Test Power Ratio: {pr_test:.3f}")

 Training Power Ratio: 0.941, Test Power Ratio: 0.940


## 4. Forward selection

In [20]:
# Same wrapper and data split again, now in forward-selection mode with a 0.05
# significance level.
split_data = dict(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
lr_dev3 = linear_regression_v2(
    **split_data,
    model_type='forward',
    significance_level=0.05,
)

In [21]:
# Run forward selection. As with the backward run, the result is indexed in the
# following cells: [0] is the fitted model, [1] the selected design matrix.
lr3 = lr_dev3.run_model()

In [22]:
# First element of the forward-selection result: the fitted model.
lr3_model = lr3[0]
# Bare expression so the notebook displays the object's repr.
lr3_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x17f7501f0>

In [23]:
# Second element of the forward-selection result: the design matrix restricted
# to the selected columns (it includes `const`). It is scored against `y_train`
# below, so it holds the training rows.
X_selected_forward = lr3[1]
X_selected_forward.head()

Unnamed: 0,const,funds_now,annual_income,mtg_balance,loan_balance,splc_balance,inflow,outflow,credit_card_balance,fea20,fea29,fea35
21269,1.0,811385.4,115474,226504.8,34313.1,509701.4,52375.0,17607.8,27880.9,74532.4,119813.5,-42397.8
187660,1.0,193423.4,245395,408844.8,172262.6,562633.0,117076.6,5692.0,40916.5,148676.2,7825.4,-35958.7
774,1.0,634050.2,59336,362281.7,284187.4,479036.7,43909.2,122704.5,1832.4,134178.7,120643.9,32801.9
184577,1.0,1748997.0,254866,612043.4,100824.7,776705.9,178830.4,40894.1,3604.5,136640.6,144138.3,-42864.8
37127,1.0,480552.0,160373,161387.3,9448.0,646735.7,34353.3,15958.4,35483.5,156185.7,79976.8,-11909.6


In [24]:
# predict train
# `X_selected_forward` is the selected training design matrix returned by the
# forward run; it already carries the `const` column.
y_train_pred = lr3_model.predict(X_selected_forward)

# predict test
# has_constant='add' forces the intercept column to be created. With the default
# ('skip') statsmodels returns the frame unchanged when it already contains a
# constant-valued column, and the column selection below would then fail with a
# KeyError on 'const'.
X_test_with_intercept = sm.add_constant(X_test, has_constant='add')
# Pick the model's columns, in the order of the training design matrix.
y_test_pred = lr3_model.predict(X_test_with_intercept[X_selected_forward.columns])

In [25]:
## RMSE
# Square root of the mean squared error, for the training and the test
# predictions of the forward-selection model.
rmse_train, rmse_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training RMSE: {rmse_train:.1f}, Test RMSE: {rmse_test:.1f} ")

 Training RMSE: 85664.9, Test RMSE: 85301.3 


In [26]:
## R2
# Coefficient of determination on the training and the test split
# (forward-selection model).
r2_train, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f" Training R2: {r2_train:.3f}, Test R2: {r2_test:.3f} ")

 Training R2: 0.841, Test R2: 0.842 


In [27]:
# power ratio
# Predictions are passed first, actuals second (forward-selection model).
pr_train, pr_test = (
    PowerRatio(predicted, actual)
    for predicted, actual in ((y_train_pred, y_train), (y_test_pred, y_test))
)

print(f" Training Power Ratio: {pr_train:.3f}, Test Power Ratio: {pr_test:.3f}")

 Training Power Ratio: 0.941, Test Power Ratio: 0.940


* Backward selection and forward selection produce the same model (the same set of selected features and identical metrics), so we will save the backward-selection model.

## 5. Save the best model

In [28]:
# Keep the backward-selection model (first element of `lr2`) as the model to persist.
lr_model = lr2[0]
# Bare expression so the notebook displays the object's repr.
lr_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x107828bb0>

In [29]:
# Persist the chosen model under the project's models folder.
# NOTE(review): machine-specific absolute path.
model_filename = "lr_model.pkl"
models_dir = "/Users/ben/Desktop/py_proj/account_funds_prediction/models"
save_model(lr_model, model_filename, models_dir)

Model saved to /Users/ben/Desktop/py_proj/account_funds_prediction/models/lr_model.pkl


In [30]:
# test reading the pkl file
# NOTE(review): a .pkl file is presumably unpickled by load_model - only load
# files from a trusted source.
model_test = load_model("lr_model.pkl", "/Users/ben/Desktop/py_proj/account_funds_prediction/models")

# Actually verify the round trip: the reloaded model must carry exactly the same
# coefficients as the model that was saved.
np.testing.assert_array_equal(model_test.params, lr_model.params)

Model loaded from /Users/ben/Desktop/py_proj/account_funds_prediction/models/lr_model.pkl


In [31]:
# Display the reloaded object to confirm what load_model returned.
model_test

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x17f770d30>