# Finance Scoring task with tabular data

## 0. Import

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

from lightgbm import LGBMClassifier
from finance_scoring.utils import load_config

## 1. Configuration Setting

In [4]:
config = load_config()
DATA_DIR = config.data_dir

## 2. Load Data

In [5]:
train = pd.read_csv(f"{DATA_DIR}/{config.train}", index_col=0)
valid = pd.read_csv(f"{DATA_DIR}/{config.valid}", index_col=1).drop("Unnamed: 0", axis=1)
valid.index.name = None

In [7]:
X_train, X_dev, y_train, y_dev = train_test_split(
    train.drop("label", axis=1), train["label"], stratify=train["label"]
)

## 3. Preprocess

In [None]:
std_cols = [ # Number of trades
    "ExternalRiskEstimate",
    "NumSatisfactoryTrades",
    "NumTrades60Ever2DerogPubRec",
    "NumTrades90Ever2DerogPubRec",
]

minmax_cols = [ # Months
    "MSinceOldestTradeOpen",
    "MSinceMostRecentTradeOpen",
    "AverageMInFile"
]

In [16]:
X_train

Unnamed: 0,AverageMInFile,ExternalRiskEstimate,MSinceMostRecentDelq,MSinceMostRecentInqexcl7days,MSinceMostRecentTradeOpen,MSinceOldestTradeOpen,MaxDelq2PublicRecLast12M,MaxDelqEver,NetFractionInstallBurden,NetFractionRevolvingBurden,...,NumInstallTradesWBalance,NumRevolvingTradesWBalance,NumSatisfactoryTrades,NumTotalTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,NumTradesOpeninLast12M,PercentInstallTrades,PercentTradesNeverDelq,PercentTradesWBalance
250,96,85,-7,0,8,271,7,8,-8,4,...,1,3,26,26,0,0,1,19,100,25
7064,59,79,-7,2,11,150,7,8,77,8,...,3,2,27,27,0,0,1,33,100,50
728,15,61,-7,2,2,35,7,8,81,0,...,7,0,15,17,0,0,8,71,100,75
996,117,90,-7,-8,29,330,7,8,-8,1,...,1,1,20,20,0,0,0,20,100,22
2387,134,70,46,-7,3,396,6,6,39,59,...,2,6,33,45,0,0,2,26,91,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10281,51,62,10,7,4,123,3,5,93,97,...,4,2,13,16,2,0,2,40,80,100
9212,133,88,-7,7,11,159,7,8,-8,0,...,1,1,15,18,0,0,1,50,100,40
2499,133,82,-7,-7,17,311,7,8,-8,34,...,2,4,18,21,0,0,0,22,100,60
4322,77,71,-7,0,6,268,0,8,-8,3,...,1,1,14,15,1,1,2,50,100,50


In [22]:
X_train.describe()

Unnamed: 0,AverageMInFile,ExternalRiskEstimate,MSinceMostRecentDelq,MSinceMostRecentInqexcl7days,MSinceMostRecentTradeOpen,MSinceOldestTradeOpen,MaxDelq2PublicRecLast12M,MaxDelqEver,NetFractionInstallBurden,NetFractionRevolvingBurden,...,NumInstallTradesWBalance,NumRevolvingTradesWBalance,NumSatisfactoryTrades,NumTotalTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,NumTradesOpeninLast12M,PercentInstallTrades,PercentTradesNeverDelq,PercentTradesWBalance
count,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,...,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0,7059.0
mean,73.894319,67.491288,6.766964,-0.230628,8.61156,184.984134,4.948718,5.53492,39.274401,31.885536,...,0.973792,3.172546,19.410256,20.901686,0.059782,-0.124947,1.26151,32.266043,86.753364,62.323559
std,38.870423,20.900946,20.390755,6.149393,13.046807,109.977023,3.717752,3.929244,42.32643,30.083283,...,4.029119,4.357076,12.912768,14.47775,2.493254,2.33597,3.034496,20.115543,25.788384,27.612494
min,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,52.0,63.0,-7.0,-7.0,3.0,118.0,4.0,5.0,-8.0,6.0,...,1.0,2.0,12.0,12.0,0.0,0.0,0.0,20.0,87.0,48.0
50%,74.0,71.0,-7.0,0.0,6.0,178.0,6.0,6.0,48.0,26.0,...,2.0,3.0,19.0,20.0,0.0,0.0,1.0,32.0,96.0,67.0
75%,95.0,79.0,14.0,1.0,11.0,250.0,7.0,8.0,79.0,54.0,...,3.0,5.0,27.0,29.0,1.0,0.0,3.0,44.0,100.0,83.0
max,322.0,94.0,83.0,24.0,227.0,803.0,9.0,8.0,471.0,154.0,...,23.0,27.0,74.0,87.0,16.0,16.0,17.0,100.0,100.0,100.0


## 4. Modeling

### 4.1 Tree-based Classifiers and TPOT

In [44]:
clf = LGBMClassifier()
clf.fit(X_train, y_train)

print(classification_report(y_dev, clf.predict(X_dev)))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74      1228
           1       0.72      0.64      0.68      1126

    accuracy                           0.71      2354
   macro avg       0.71      0.71      0.71      2354
weighted avg       0.71      0.71      0.71      2354



In [45]:
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(X_train, y_train)
# export the best model
model.export('tpot_best_model.py')


Generation 1 - Current best internal CV score: 0.7264004580796816

Generation 2 - Current best internal CV score: 0.7271551510524448

Generation 3 - Current best internal CV score: 0.7284780905310108

Generation 4 - Current best internal CV score: 0.7284780905310108

Generation 5 - Current best internal CV score: 0.7284780905310108

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.5, max_depth=1, min_child_weight=20, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)


In [46]:
model.score(X_dev, y_dev)

0.7153780798640612

In [13]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.5.0-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = train
features = tpot_data.drop('label', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['label'], random_state=1)

# Average CV score on the training set was: 0.7284780905310108
exported_pipeline = XGBClassifier(learning_rate=0.5, max_depth=1, min_child_weight=20, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 1)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)



In [17]:
print(classification_report(testing_target, results))

              precision    recall  f1-score   support

           0       0.70      0.77      0.73      1227
           1       0.72      0.63      0.67      1127

    accuracy                           0.71      2354
   macro avg       0.71      0.70      0.70      2354
weighted avg       0.71      0.71      0.70      2354



In [19]:
f1_score(testing_target, results)

0.672022684310019

### 4.2 TabNet

In [1]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Using cached pytorch_tabnet-3.1.1-py3-none-any.whl (39 kB)
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [9]:
from pytorch_tabnet.tab_model import TabNetClassifier

clf = TabNetClassifier()  #TabNetRegressor()
clf.fit(
  X_train.to_numpy(), y_train.to_numpy(),
  eval_set=[(X_dev.to_numpy(), y_dev.to_numpy())]
)
# preds = clf.predict(X_test)

Device used : cuda
epoch 0  | loss: 0.78871 | val_0_auc: 0.63463 |  0:00:03s
epoch 1  | loss: 0.70004 | val_0_auc: 0.61222 |  0:00:04s
epoch 2  | loss: 0.64513 | val_0_auc: 0.57825 |  0:00:04s
epoch 3  | loss: 0.61954 | val_0_auc: 0.56277 |  0:00:05s
epoch 4  | loss: 0.59963 | val_0_auc: 0.53834 |  0:00:05s
epoch 5  | loss: 0.58899 | val_0_auc: 0.54949 |  0:00:05s
epoch 6  | loss: 0.5784  | val_0_auc: 0.5464  |  0:00:06s
epoch 7  | loss: 0.57358 | val_0_auc: 0.5428  |  0:00:06s
epoch 8  | loss: 0.56881 | val_0_auc: 0.57216 |  0:00:06s
epoch 9  | loss: 0.56724 | val_0_auc: 0.57443 |  0:00:07s
epoch 10 | loss: 0.56416 | val_0_auc: 0.58277 |  0:00:07s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_auc = 0.63463
Best weights from best epoch are automatically used!


In [12]:
preds

NameError: name 'preds' is not defined

In [15]:
preds = clf.predict(X_train.to_numpy())
f1_score(y_train.to_numpy(), preds)

0.6494349603270018