# TODO

- 有効な特徴量とそうでない特徴量を整理する

# Setting

In [1]:
import sys

import numpy as np
import polars as pl
from typing import Final

# Add the directory two levels up to sys.path before importing the
# project-local `modules` package below (presumably that is where it lives).
# The membership check avoids appending a duplicate entry when the cell is re-run.
root_dir = "../../"
if root_dir not in sys.path:
    sys.path.append(root_dir)
from modules import config as cfg
from modules import utils, preprosess, training, predict, metrics, workflow

# Identifier of this experiment.
exp = "exp24"
# Seed taken from the shared config; presumably fixes the RNG state for
# reproducibility (see utils.set_seed for what exactly is seeded).
utils.set_seed(cfg.Params.seed)

# Read data

In [2]:
# Load the three objects returned by the project loader.
# NOTE(review): debug_mode=True is hard-coded; presumably this loads a reduced
# dataset, so confirm before relying on the scores recorded below.
train, test, submit = utils.get_data(debug_mode=True)

# Preprocessing

## Split fold

In [3]:
# Assign cross-validation folds to the training frame (stratified K-fold, going
# by the helper's name; fold count and column name are defined in `preprosess`).
train = preprosess.assign_stratified_k_fold(train)

## Convert

In [4]:
# Drop null columns from both frames (train is processed first, then test).
train, test = map(preprosess.drop_null_columns, (train, test))

# "PR" occurs in test's BankState but not in train, so it is replaced with
# "CA", the most frequent value.
bank_state_fixed = pl.col("BankState").str.replace("PR", "CA")
test = test.with_columns(bank_state_fixed)

# Convert place names to coordinates.
train, test = map(preprosess.convert_to_latlng, (train, test))

# Convert DisbursementDate and ApprovalDate to numbers: year, then month,
# then day, each applied to train before test.
cols = ["DisbursementDate", "ApprovalDate"]
for to_part in (
    preprosess.convert_date_to_year,
    preprosess.convert_date_to_month,
    preprosess.convert_date_to_day,
):
    train, test = to_part(train, cols), to_part(test, cols)

# Convert the monetary columns to numbers.
cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
train, test = (preprosess.convert_money_data(frame, cols) for frame in (train, test))

# Add a feature telling whether the paired columns match.
eqs = [["State", "BankState"]]
train, test = (preprosess.add_eq(frame, eqs) for frame in (train, test))

# Difference between DisbursementDate and ApprovalDate.
train, test = map(preprosess.add_diff_disbursement_with_approval, (train, test))

# Group Sector by occupation.
train, test = map(preprosess.unify_same_sector, (train, test))

# Replace label-like columns with statistics of SBA_Appv.
AGG_NAME: Final[list] = ["mean", "median", "max", "min"]
AGG_COL: Final[str] = "SBA_Appv"
VS_AGG_COLS: Final[list] = ["NewExist", "RevLineCr", "LowDoc", "UrbanRural", "CreateJob"]
train, test = preprosess.convert_aggregation_data(train, test, AGG_COL, VS_AGG_COLS)

# Column pairs whose difference is added as a feature.
diffs = [
    ["DisbursementGross", "GrAppv"],    # disbursed amount vs bank-approved amount
    ["DisbursementGross", "SBA_Appv"],  # disbursed amount vs SBA-approved amount
    ["GrAppv", "SBA_Appv"],             # bank-approved amount vs SBA-approved amount
]

# SBA_Appv vs each per-category SBA_Appv statistic, ordered column first and
# statistic second.
diffs += [
    [AGG_COL, f"{col_name}_{AGG_COL}_{name}"]
    for col_name in VS_AGG_COLS
    for name in AGG_NAME
]

train, test = (preprosess.add_diff(frame, diffs) for frame in (train, test))

# Column pairs for the ratio features: the amount columns and their pairwise
# differences, each paired with Term and with NoEmp, plus NoEmp paired with Term.
amount_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
amount_diff_cols = [
    "DisbursementGross_GrAppv_diff",
    "DisbursementGross_SBA_Appv_diff",
    "GrAppv_SBA_Appv_diff",
]
# The concatenation order below fixes the order of the resulting pairs.
divs = (
    [[amount, "Term"] for amount in amount_cols]
    + [["NoEmp", "Term"]]
    + [[amount, "NoEmp"] for amount in amount_cols]
    + [[amount_diff, "Term"] for amount_diff in amount_diff_cols]
    + [[amount_diff, "NoEmp"] for amount_diff in amount_diff_cols]
)
# Add the ratio features described by `divs` to train, then to test.
train, test = (preprosess.add_div(frame, divs) for frame in (train, test))

## Target encoding

In [5]:
# Columns to target-encode ("Sector" is currently disabled).
cols = ["UrbanRural", "RevLineCr", "LowDoc", "FranchiseCode"]
train, test = preprosess.target_encoding(train, test, cols)

## Label encoding
CV による target encoding では、fold ごとに同じカテゴリへ異なる値が割り当てられるため、label encoding を併用する

In [6]:
# Label-encode RevLineCr and LowDoc on both frames.
# NOTE(review): the raw RevLineCr / LowDoc columns are dropped in a later cell,
# so the encoded values presumably live in separate columns; confirm in
# preprosess.label_encoding.
cols = ["RevLineCr", "LowDoc"]
train, test = preprosess.label_encoding(train, test, cols)

# Drop columns

In [7]:
# Columns that are no longer needed.
del_cols = [
    # place names
    "City", "State", "BankState",
    # date-related ("d-b-y") columns
    "DisbursementDate", "ApprovalDate", "ApprovalFY",
    # already label-encoded
    "RevLineCr", "LowDoc",
]

# Also drop the per-category SBA_Appv statistics named in AGG_NAME.
# NOTE(review): the recorded outputs list `*_SBA_Appv_std` features; "std" is
# not in AGG_NAME, so those columns stay in the frames. Confirm this is intended.
del_cols.extend(
    f"{col_name}_{AGG_COL}_{name}"
    for col_name in VS_AGG_COLS
    for name in AGG_NAME
)

train, test = (frame.drop(del_cols) for frame in (train, test))

# Experiments

In [8]:
# Project helper; presumably silences UserWarning messages for the cells below
# (the exact filter is defined in `utils`).
utils.ignore_user_warning()

In [9]:
# LightGBM parameters; the tunable values come from the shared config.
lgb_params = {
    "objective": "binary",
    # The string "None" (not Python's None) turns off LightGBM's built-in
    # metrics; the training log reports a custom "macroF1" score instead.
    "metric": "None",
    "learning_rate": cfg.Params.lgb_learning_rate,
    "max_depth": cfg.Params.lgb_max_depth,
    "n_estimators": cfg.Params.lgb_n_estimators,
    "colsample_bytree": cfg.Params.lgb_colsample_bytree,
    "importance_type": "gain",
    "verbose": -1,
    "seed": cfg.Params.seed,
}

# Candidate zero_weight values 0.74, 0.75, ..., 0.88 (15 values, both ends
# included, matching the 0.88 that appears in the recorded results).
# np.arange with a float step derives its length from a floating-point
# division, so whether the end point is included is an accident of rounding.
# linspace states the number of points explicitly, and rounding keeps each
# candidate at exactly two decimals.
zero_weights = np.round(np.linspace(0.74, 0.88, num=15), 2)

# Run the project's feature search on the training frame; the returned frame
# holds one row per feature (columns: feature, cv_score, zero_weight, diff).
df = workflow.search_effective_feature(train, lgb_params=lgb_params, zero_weights=zero_weights)

--------------------------------------------------------------------------------
START fold 1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[285]	valid_0's macroF1: 0.686236
Time:  3.601[s]
--------------------------------------------------------------------------------
START fold 2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[227]	valid_0's macroF1: 0.615975
Time:  3.583[s]
--------------------------------------------------------------------------------
START fold 3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[141]	valid_0's macroF1: 0.671233
Time:  3.431[s]
--------------------------------------------------------------------------------
START fold 4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[197]	valid_0's macroF1: 0.711712
Time:  4.747[s]
------------------------------------

In [10]:
# Display the search results (columns: feature, cv_score, zero_weight, diff).
df

feature,cv_score,zero_weight,diff
str,f64,f64,f64
"""Term""",0.700957,0.85,0.004962
"""NoEmp""",0.709449,0.84,-0.003529
"""NewExist""",0.703619,0.86,0.0023
"""CreateJob""",0.697692,0.86,0.008228
"""RetainedJob""",0.700957,0.81,0.004962
"""FranchiseCode""",0.704379,0.78,0.00154
"""Sector""",0.707366,0.88,-0.001446
"""DisbursementGr…",0.706052,0.76,-0.000133
"""GrAppv""",0.70516,0.83,0.00076
"""SBA_Appv""",0.702646,0.77,0.003273


In [11]:
# Split the features by the sign of `diff`: non-negative counts as effective,
# negative as ineffective.
diff_col = pl.col("diff")
effective = df.filter(diff_col >= 0)["feature"].to_list()
ineffective = df.filter(diff_col < 0)["feature"].to_list()

# Print each group as a heading followed by one bullet per feature.
for heading, names in (
    ("Effective features:", effective),
    ("Ineffective features:", ineffective),
):
    print(heading)
    for feature in names:
        print(f"- {feature}")

Effective features:
- Term
- NewExist
- CreateJob
- RetainedJob
- FranchiseCode
- GrAppv
- SBA_Appv
- UrbanRural
- State_lng
- BankState_lat
- BankState_lng
- DisbursementDateYear
- ApprovalDateYear
- State_BankState_diff
- Disbursement_Approval_diff
- RevLineCr_SBA_Appv_std
- CreateJob_SBA_Appv_std
- DisbursementGross_GrAppv_diff
- GrAppv_SBA_Appv_diff
- SBA_Appv_NewExist_SBA_Appv_median_diff
- SBA_Appv_NewExist_SBA_Appv_min_diff
- SBA_Appv_LowDoc_SBA_Appv_mean_diff
- SBA_Appv_UrbanRural_SBA_Appv_median_diff
- SBA_Appv_UrbanRural_SBA_Appv_max_diff
- DisbursementGross_Term_div
- DisbursementGross_NoEmp_div
- DisbursementGross_SBA_Appv_diff_Term_div
- GrAppv_SBA_Appv_diff_NoEmp_div
- RevLineCr_MIS_Status_mean
- LowDoc_MIS_Status_mean
Ineffective features:
- NoEmp
- Sector
- DisbursementGross
- State_lat
- City_lat
- City_lng
- DisbursementDateMonth
- ApprovalDateMonth
- DisbursementDateDay
- ApprovalDateDay
- NewExist_SBA_Appv_std
- LowDoc_SBA_Appv_std
- UrbanRural_SBA_Appv_std
- Disbur

In [12]:
# Save the per-feature search results as CSV in the temp directory from the config.
# The file name is built from `exp` (set in the first cell) rather than a
# hard-coded "exp24", so a copy of this notebook with a new experiment id does
# not overwrite this experiment's file.
df.write_csv(f"{cfg.DirFile.tmp}{exp}_effective_features.csv")