# Import

In [None]:
import sys

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score

root_dir = '../../'
if root_dir not in sys.path:
    sys.path.append(root_dir)
import modules

# Read data

In [2]:
train, test, submit = modules.utils.get_data(debug_mode=True)

# Preprocessing

In [3]:
train[['DisbursementGross',  'GrAppv', 'SBA_Appv']]

Unnamed: 0,DisbursementGross,GrAppv,SBA_Appv
0,"$80,000.00","$80,000.00","$68,000.00"
1,"$287,000.00","$287,000.00","$229,600.00"
2,"$31,983.00","$30,000.00","$15,000.00"
3,"$229,000.00","$229,000.00","$229,000.00"
4,"$525,000.00","$525,000.00","$393,750.00"
...,...,...,...
995,"$196,000.00","$196,000.00","$196,000.00"
996,"$25,000.00","$25,000.00","$21,250.00"
997,"$169,910.00","$51,500.00","$25,750.00"
998,"$275,000.00","$275,000.00","$206,250.00"


In [3]:
# ドルマークの削除
train[['DisbursementGross',  'GrAppv', 'SBA_Appv']] = train[['DisbursementGross',  'GrAppv', 'SBA_Appv']].applymap(lambda x: x.strip().replace('$', '').replace(',', ''))
test[['DisbursementGross',  'GrAppv', 'SBA_Appv']] = test[['DisbursementGross',  'GrAppv', 'SBA_Appv']].applymap(lambda x: x.strip().replace('$', '').replace(',', ''))

Unnamed: 0,DisbursementGross,GrAppv,SBA_Appv
0,"$80,000.00","$80,000.00","$68,000.00"
1,"$287,000.00","$287,000.00","$229,600.00"
2,"$31,983.00","$30,000.00","$15,000.00"
3,"$229,000.00","$229,000.00","$229,000.00"
4,"$525,000.00","$525,000.00","$393,750.00"
...,...,...,...
995,"$196,000.00","$196,000.00","$196,000.00"
996,"$25,000.00","$25,000.00","$21,250.00"
997,"$169,910.00","$51,500.00","$25,750.00"
998,"$275,000.00","$275,000.00","$206,250.00"


In [4]:
train[['DisbursementGross', 'GrAppv', 'SBA_Appv']].head()

  train[['DisbursementGross',  'GrAppv', 'SBA_Appv']] = train[['DisbursementGross',  'GrAppv', 'SBA_Appv']].applymap(lambda x: x.strip().replace('$', '').replace(',', ''))
  test[['DisbursementGross',  'GrAppv', 'SBA_Appv']] = test[['DisbursementGross',  'GrAppv', 'SBA_Appv']].applymap(lambda x: x.strip().replace('$', '').replace(',', ''))


In [5]:
train.isnull().sum()

Unnamed: 0,DisbursementGross,GrAppv,SBA_Appv
0,80000.0,80000.0,68000.0
1,287000.0,287000.0,229600.0
2,31983.0,30000.0,15000.0
3,229000.0,229000.0,229000.0
4,525000.0,525000.0,393750.0


In [6]:
train.fillna('NULL', inplace=True)

Term                  0
NoEmp                 0
NewExist              0
CreateJob             0
RetainedJob           0
FranchiseCode         0
RevLineCr            24
LowDoc               11
DisbursementDate      3
MIS_Status            0
Sector                0
ApprovalDate          0
ApprovalFY            0
City                  0
State                 0
BankState             0
DisbursementGross     0
GrAppv                0
SBA_Appv              0
UrbanRural            0
dtype: int64

# Training

In [8]:
numeric_columns = train.select_dtypes(include=['number']).columns
train_numeric = train[numeric_columns]
X = train_numeric.drop('MIS_Status', axis=1)
y = train_numeric['MIS_Status']

In [8]:
lgbm = lgb.LGBMClassifier()
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(lgbm, X, y, cv=kf, scoring='f1_macro')

In [9]:
mean_cv_f1 = np.mean(cv_scores)
print(f'Mean CV F1 Score: {mean_cv_f1}')

[LightGBM] [Info] Number of positive: 716, number of negative: 84
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.895000 -> initscore=2.142863
[LightGBM] [Info] Start training from score 2.142863
[LightGBM] [Info] Number of positive: 716, number of negative: 84
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 254
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] [binary:BoostFromSco

# Predict

In [13]:
lgbm.fit(X, y)
submit[1] = lgbm.predict(test[numeric_columns.drop('MIS_Status')])
modules.utils.make_submission(submit, "exp01", mean_cv_f1)

[LightGBM] [Info] Number of positive: 895, number of negative: 105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 280
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.895000 -> initscore=2.142863
[LightGBM] [Info] Start training from score 2.142863


In [11]:
submit.head()

[LightGBM] [Info] Number of positive: 895, number of negative: 105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 280
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.895000 -> initscore=2.142863
[LightGBM] [Info] Start training from score 2.142863


Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
42307,1
42308,1
42309,1
42310,1
42311,1
