# TODO

- ゆうだいさんのスコアの再現確保
- 0.6840183034914059(シード変える前)

# Setting

In [1]:
import sys

import numpy as np
import polars as pl
from typing import Final

# Add the directory two levels up (relative to the working directory) to the
# import path so the project-local `modules` package can be imported below.
# The membership guard avoids appending a duplicate entry when the cell is re-run.
root_dir = "../../"
if root_dir not in sys.path:
    sys.path.append(root_dir)
from modules import config as cfg
from modules import utils, preprosess, training, predict, metrics, workflow

# Experiment identifier; not referenced by any other cell visible in this notebook.
exp = "exp22"
# Seed through the project helper, using the same cfg.Params.seed value that is
# later passed to LightGBM as "seed".
# NOTE(review): presumably seeds the Python / NumPy RNGs — confirm in modules.utils.
utils.set_seed(cfg.Params.seed)

# Read data

In [2]:
# Load the train, test and submit objects through the project helper.
# NOTE(review): debug_mode=False presumably loads the full data rather than a
# reduced sample — confirm in modules.utils.get_data.
train, test, submit = utils.get_data(debug_mode=False)

# Preprocessing

In [3]:
# Run preprosess.drop_null_columns on each frame independently (train first, then test).
train, test = (preprosess.drop_null_columns(frame) for frame in (train, test))

## Split fold

In [4]:
# Fold assignment is applied to train only; test is left untouched.
# NOTE(review): presumably adds a fold column stratified on the target — confirm
# in modules.preprosess.assign_stratified_k_fold.
train = preprosess.assign_stratified_k_fold(train)

## Convert

In [5]:
# BankState "PR" appears in test but not in train, so map it to "CA",
# the most frequent value.
test = test.with_columns(pl.col("BankState").str.replace("PR", "CA"))

# Convert place names to coordinates.
train = preprosess.convert_to_latlng(train)
test = preprosess.convert_to_latlng(test)

# Convert DisbursementDate and ApprovalDate to numbers: year, then month, then day.
date_cols = ["DisbursementDate", "ApprovalDate"]
for convert_date in (
    preprosess.convert_date_to_year,
    preprosess.convert_date_to_month,
    preprosess.convert_date_to_day,
):
    train = convert_date(train, date_cols)
    test = convert_date(test, date_cols)

# Convert the money columns to numeric values.
money_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
train = preprosess.convert_money_data(train, money_cols)
test = preprosess.convert_money_data(test, money_cols)

# Whether the paired columns match.
eqs = [["State", "BankState"]]
train = preprosess.add_eq(train, eqs)
test = preprosess.add_eq(test, eqs)

# Difference between DisbursementDate and ApprovalDate (currently disabled).
# train = preprosess.add_diff_disbursement_with_approval(train)
# test  = preprosess.add_diff_disbursement_with_approval(test)

# Group Sector by occupation.
train = preprosess.unify_same_sector(train)
test = preprosess.unify_same_sector(test)

# Replace label data with statistics of SBA_Appv.
# AGG_NAME / AGG_COL / VS_AGG_COLS are reused by the later drop-columns cell.
AGG_NAME: Final[list] = ["mean", "median", "max", "min"]
AGG_COL: Final[str] = "SBA_Appv"
VS_AGG_COLS: Final[list] = ["NewExist", "RevLineCr", "LowDoc", "UrbanRural", "CreateJob"]
train, test = preprosess.convert_aggregation_data(train, test, AGG_COL, VS_AGG_COLS)

# Differences.
diffs = [
    ["DisbursementGross", "GrAppv"],    # disbursed amount vs bank-approved amount
    ["DisbursementGross", "SBA_Appv"],  # disbursed amount vs SBA-approved amount
    ["GrAppv", "SBA_Appv"],             # bank-approved amount vs SBA-approved amount
]
# SBA_Appv vs each SBA_Appv statistic that replaced the label data
# (outer order: VS_AGG_COLS, inner order: AGG_NAME).
diffs.extend(
    [AGG_COL, f"{col_name}_{AGG_COL}_{name}"]
    for col_name in VS_AGG_COLS
    for name in AGG_NAME
)
train = preprosess.add_diff(train, diffs)
test = preprosess.add_diff(test, diffs)

# Ratios: every active pair uses "Term" as its second column.
# Currently disabled pairs:
#   ["DisbursementGross", "Term"]
#   ["DisbursementGross", "NoEmp"], ["GrAppv", "NoEmp"], ["SBA_Appv", "NoEmp"]
#   ["DisbursementGross_GrAppv_diff", "NoEmp"]
#   ["DisbursementGross_SBA_Appv_diff", "NoEmp"]
#   ["GrAppv_SBA_Appv_diff", "NoEmp"]
divs = [
    [first_col, "Term"]
    for first_col in (
        "GrAppv",
        "SBA_Appv",
        "NoEmp",
        "DisbursementGross_GrAppv_diff",
        "DisbursementGross_SBA_Appv_diff",
        "GrAppv_SBA_Appv_diff",
    )
]
train = preprosess.add_div(train, divs)
test = preprosess.add_div(test, divs)

## Target encoding

In [6]:
# Columns handed to the project's target-encoding helper. The list order is
# kept exactly as before.
target_encoding_cols = [
    "UrbanRural", "RevLineCr", "LowDoc", "ApprovalFY", "FranchiseCode",
    "RetainedJob", "CreateJob", "NoEmp", "Term",
    "State", "BankState", "Sector", "NewExist",
]
train, test = preprosess.target_encoding(train, test, target_encoding_cols)

## Label encoding
CVによるtarget encodingでは、同じカテゴリ値でもfoldごとに異なる値が割り当てられるので、label encodingを併用する

In [7]:
# Label-encode these four columns; each of them also appears in the
# target-encoding column list of the previous cell.
# train and test are passed in together and both are returned.
cols = ['RevLineCr', 'LowDoc', 'State', 'BankState']
train, test = preprosess.label_encoding(train, test, cols)

# Drop columns

In [8]:
# Remove columns that are no longer needed.
# Not dropped (kept in the frames): "State", "BankState", "ApprovalFY",
# and the already label-encoded "RevLineCr", "LowDoc".
del_cols = [
    "City",              # place name
    "DisbursementDate",  # date column
    "ApprovalDate",      # date column
]
# Also drop every "<column>_<AGG_COL>_<statistic>" column named from
# VS_AGG_COLS x AGG_NAME.
del_cols += [
    f"{col_name}_{AGG_COL}_{name}"
    for col_name in VS_AGG_COLS
    for name in AGG_NAME
]

train, test = train.drop(del_cols), test.drop(del_cols)

# Training

In [10]:
# Value forwarded to training.fit_lgbm as zero_weight.
# NOTE(review): presumably the sample weight applied to the 0 class — confirm
# in modules.training.
zero_weight = 0.81
lgb_params = dict(
    objective="binary",
    metric="None",  # the string "None": no built-in metric; the training log reports macroF1
    learning_rate=0.01,
    max_depth=6,
    n_estimators=1000,
    colsample_bytree=0.7,
    importance_type="gain",
    verbose=-1,
    seed=cfg.Params.seed,  # same seed value that is given to utils.set_seed
)
# Returns the `oof` predictions scored in the next cell and the fitted models.
oof, models = training.fit_lgbm(train, zero_weight=zero_weight, lgb_params=lgb_params)

--------------------------------------------------------------------------------
START fold 1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[188]	valid_0's macroF1: 0.692998
Time:  13.157[s]
--------------------------------------------------------------------------------
START fold 2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[418]	valid_0's macroF1: 0.685313
Time:  23.271[s]
--------------------------------------------------------------------------------
START fold 3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[338]	valid_0's macroF1: 0.662823
Time:  15.673[s]
--------------------------------------------------------------------------------
START fold 4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[203]	valid_0's macroF1: 0.680522
Time:  10.773[s]
--------------------------------

In [11]:
# Decision threshold handed to predict.predict_class.
threshold = 0.5
# Target column of the training frame as a NumPy array (the ground truth).
oof_truth = train[cfg.Cols.target].to_numpy()
# Class predictions derived from `oof` (presumably the out-of-fold predictions
# returned by training.fit_lgbm — confirm) at the threshold above.
oof_hat = predict.predict_class(oof, threshold=threshold)
# Macro F1 between ground truth and predicted classes, computed by the project helper.
cv_score = metrics.macro_f1_score(oof_truth, oof_hat)
print(cv_score)

0.6816977915267421
