# 概要

カメラ情報をCNNで扱い、その予測値とテーブルデータをLightGBMでstackingする手法のbaselineを公開します。

このベースラインは類似コンペの[1st and Future - Player Contact Detection](https://www.kaggle.com/competitions/nfl-player-contact-detection)を参考に作成しています。

このnotebookでは、LightGBMを学習、推論させています。  
CNNのnotebookは  
[[CV 0.2008/LB 0.2017] LightGBM + CNN stacking baseline (CNN only)](https://www.guruguru.science/competitions/25/discussions/03a365c7-27ce-490e-ab6f-e7788ce470c8/)  
です。

In [1]:
import warnings
import pandas as pd
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import datetime
import math

In [2]:
class CFG:
    # ============== comp exp name =============
    comp_name = 'atmacup_18'  # comp名 モデルの出力先フォルダ名とか

    comp_dir_path = './'
    comp_folder_name = 'atmacup_18'  # compのデータセットのフォルダ名
    #comp_dataset_path = f'{comp_dir_path}datasets/{comp_folder_name}/'
    comp_dataset_path = '/kaggle/input/atmacup18-dataset/atmaCup18_dataset/'
    
    exp_name = 'atmacup_18_gbdt' # notebook

    # ============== file path =============
    train_fold_dir = "/kaggle/input/atmacup-18-cnn-exp001/atmacup_18_cnn_exp001/"

    # ============== pred target =============
    target_size = 18 #18
    target_col = ['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', 'x_2', 'y_2',
                  'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5']


    # ============== fold =============
    n_fold = 3 #5
    skf_col = 'class'
    group_col = 'scene'
    fold_type = 'gkf'

    objective_cv = 'regression'  # 'binary', 'multiclass', 'regression'
    metric_direction = 'minimize'  # 'maximize', 'minimize'
    metrics = 'calc_mae_atmacup'

    # ============== ほぼ固定 =============
    seed = 42

    # ============== set dataset path =============
    if exp_name is not None:
        print('set dataset path')

        outputs_path = comp_dir_path +  \
            f'outputs/{comp_name}/{exp_name}/'

        submission_dir = outputs_path + 'submissions/'
        submission_path = submission_dir + f'submission_{exp_name}.csv'

        model_dir = outputs_path + \
            f'{comp_name}-models/'

        figures_dir = outputs_path + 'figures/'

        log_dir = outputs_path + 'logs/'
        log_path = log_dir + f'{exp_name}.txt'

set dataset path


# setting

In [3]:
import torch, random

# 乱数固定
def set_seed(seed=None, cudnn_deterministic=True):
    if seed is None:
        seed = 42

    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = cudnn_deterministic
    torch.backends.cudnn.benchmark = False

def make_dirs(cfg):
    for dir in [cfg.model_dir, cfg.figures_dir, cfg.submission_dir, cfg.log_dir]:
        os.makedirs(dir, exist_ok=True)

def cfg_init(cfg, mode='train'):
    set_seed(cfg.seed)

    if mode == 'train':
        make_dirs(cfg)

In [4]:
warnings.filterwarnings('ignore')

cfg_init(CFG)

# logger

In [5]:
# from common_utils.logger import init_logger, wandb_init, AverageMeter, timeSince
# from common_utils.settings import cfg_init

def init_logger(log_file):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [6]:

Logger = init_logger(log_file=CFG.log_path)

Logger.info('\-------- exp_info -----------------')
Logger.info(datetime.datetime.now().strftime('%Y年%m月%d日 %H:%M:%S'))

\-------- exp_info -----------------
2024年11月22日 08:05:51


# eval function

In [7]:
def calc_mae_atmacup(y_true, y_pred):
    abs_diff = np.abs(y_true - y_pred)  # 各予測の差分の絶対値を計算して
    mae = np.mean(abs_diff.reshape(-1, ))  # 予測の差分の絶対値の平均を計算

    return mae

def get_result(result_df):

    pred_cols = [f'pred_{i}' for i in range(CFG.target_size)]

    preds = result_df[pred_cols].values
    labels = result_df[CFG.target_col].values

    eval_func = eval(CFG.metrics)
    best_score = eval_func(labels, preds)

    Logger.info(f'best_score: {best_score:<.4f}')
    return best_score

In [8]:
# train_df = pd.read_csv(CFG.comp_dataset_path + 'train_features.csv')
train_df = pd.read_csv(CFG.train_fold_dir + 'train_folds.csv')

# データ数を絞る、n_fold = 3

In [9]:
# train_dfの最初の5000行のみを使用
train_df = train_df.head(5000)

In [10]:
print(train_df.columns)

Index(['ID', 'vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake',
       'brakePressed', 'gas', 'gasPressed', 'gearShifter', 'leftBlinker',
       'rightBlinker', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', 'x_2', 'y_2',
       'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5',
       'scene', 'fold'],
      dtype='object')


In [11]:
train_df.head()

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,scene,fold
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,...,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,00066be8e20318869c38c66be466631a,0
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,...,0.391343,0.036335,26.316489,0.843124,0.065,31.383814,1.42507,0.073083,00066be8e20318869c38c66be466631a,0
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,...,-0.356932,0.058765,25.677387,-0.576985,0.102859,30.460033,-0.841894,0.152889,00066be8e20318869c38c66be466631a,0
3,000fb056f97572d384bae4f5fc1e0f28_120,6.055565,-0.117775,7.632668,173.0,0.0,False,0.0,False,drive,...,0.603145,0.031858,15.703514,0.960717,0.043479,19.311182,1.374655,0.058754,000fb056f97572d384bae4f5fc1e0f28,0
4,000fb056f97572d384bae4f5fc1e0f28_20,3.316744,1.276733,-31.725477,-114.0,0.0,False,0.255,True,drive,...,-0.381813,-0.003898,11.619313,-0.554488,0.011393,14.657048,-0.7788,0.044243,000fb056f97572d384bae4f5fc1e0f28,0


In [12]:
test_df = pd.read_csv(CFG.comp_dataset_path + 'test_features.csv')
#test_df = pd.read_csv('/kaggle/input/atmacup18-dataset/atmaCup18_dataset/test_features.csv')

In [13]:
test_df.head()

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker
0,012baccc145d400c896cb82065a93d42_120,3.374273,-0.01936,-34.008415,17.0,0.0,False,0.0,False,drive,False,False
1,012baccc145d400c896cb82065a93d42_220,2.441048,-0.022754,307.860077,295.0,0.0,True,0.0,False,drive,False,False
2,012baccc145d400c896cb82065a93d42_320,3.604152,-0.286239,10.774388,-110.0,0.0,True,0.0,False,drive,False,False
3,012baccc145d400c896cb82065a93d42_420,2.048902,-0.537628,61.045235,189.0,0.0,True,0.0,False,drive,True,False
4,01d738e799d260a10f6324f78023b38f_120,2.201528,-1.8986,5.740093,-41.0,0.0,True,0.0,False,drive,False,False


## preprocess

In [14]:
# train_df = get_fold(train_df, CFG)

In [15]:
def common_preprocess(target_df):
    # boolのcol
    bool_cols = ['brakePressed', 'gasPressed', 'leftBlinker', 'rightBlinker']
    print('bool_cols', bool_cols)
    target_df[bool_cols] = target_df[bool_cols].astype(int)

    target_df['scene'] = target_df['ID'].str.split('_').str[0]
    target_df['scene_sec'] = target_df['ID'].str.split('_').str[1].astype(int)

    count_df = target_df.groupby('scene').size()
    target_df['scene_count'] = target_df['scene'].map(count_df)
    return target_df

In [16]:
train_df = common_preprocess(train_df)
test_df = common_preprocess(test_df)

bool_cols ['brakePressed', 'gasPressed', 'leftBlinker', 'rightBlinker']
bool_cols ['brakePressed', 'gasPressed', 'leftBlinker', 'rightBlinker']


# add traffic light

In [17]:
#002aで修正##############
import os, json

ids = os.listdir(CFG.comp_dataset_path + 'traffic_lights')

traffic_lights = []
id_class_list = []
for id in ids:
    path = CFG.comp_dataset_path + f'traffic_lights/{id}'

    traffic_light = json.load(open(path))

    traffic_lights.append(traffic_light)

    for traffic_light in traffic_light:
        id_class_list.append((id.split('.')[0], traffic_light['class']))

# 信号機の状態を保持するデータフレーム
traffic_lights_state_df = pd.DataFrame(id_class_list, columns=['ID', 'class'])

# 信号機の数を保持するデータフレーム
counts = [len(traffic_light) for traffic_light in traffic_lights]
traffic_lights_count_df = pd.DataFrame({
    'ID': [id.split('.')[0] for id in ids],
    'traffic_lights_counts': counts
})

# 両方を統合
traffic_lights_df = pd.merge(traffic_lights_state_df, traffic_lights_count_df, on='ID', how='left')

# train_df, test_df に統合
train_df = pd.merge(train_df, traffic_lights_df, on='ID', how='left')
test_df = pd.merge(test_df, traffic_lights_df, on='ID', how='left')
###################

In [18]:
# import os, json

# ids = os.listdir(CFG.comp_dataset_path + 'traffic_lights')

# traffic_lights = []
# id_class_list = []
# for id in ids:
#     path = CFG.comp_dataset_path + f'traffic_lights/{id}'

#     traffic_light = json.load(open(path))

#     traffic_lights.append(traffic_light)

#     for traffic_light in traffic_light:
#         id_class_list.append((id.split('.')[0], traffic_light['class']))

# counts = [len(traffic_light) for traffic_light in traffic_lights]
# traffic_lights_df = pd.DataFrame(id_class_list, columns=['ID', 'class'])

In [19]:
traffic_lights_df['class'].value_counts()

class
green       5879
red         4915
empty       1352
yellow       682
straight     647
left         576
right        306
other         60
Name: count, dtype: int64

In [20]:
# ids = [id.split('.')[0] for id in ids]

# traffic_lights_df = pd.DataFrame({
#     'ID': ids,
#     'traffic_lights_counts': counts
# })

# train_df = pd.merge(train_df, traffic_lights_df, on='ID', how='left')
# test_df = pd.merge(test_df, traffic_lights_df, on='ID', how='left')

# add cnn oof

In [21]:
exp_names = ['atmacup_18_cnn_exp001']
oof_feat_cols = []

for exp_name in exp_names:
    _oof_feat_cols  = [f'{exp_name}_{c}' for c in CFG.target_col]   

    #path = f'./outputs/atmacup_18/{exp_name}/submissions/oof_cv.csv'
    path = '/kaggle/input/atmacup-18-cnn-exp001/atmacup_18_cnn_exp001/submissions/oof_cv.csv'
    cnn_train_df = pd.read_csv(path)

    ######001a#######
    # train_dfのIDに対応するcnn_train_dfをフィルタリング
    cnn_train_df = cnn_train_df[cnn_train_df['ID'].isin(train_df['ID'])]
    
    # train_dfのID順にcnn_train_dfを並び替える
    cnn_train_df = cnn_train_df.set_index('ID').reindex(train_df['ID']).reset_index()
    
    # インデックスをリセット
    cnn_train_df = cnn_train_df.reset_index(drop=True)
    # 比較実行
    print((train_df['ID'] == cnn_train_df['ID']).sum() / len(train_df))
    #################
    
    pred_cols = [f'pred_{i}' for i in range(CFG.target_size)]
    
    train_df[_oof_feat_cols] = cnn_train_df[pred_cols]
    print(_oof_feat_cols)
    print((train_df['ID'] == cnn_train_df['ID']).sum() / len(train_df))

    #path = f'./outputs/atmacup_18/{exp_name}/submissions/submission_oof.csv'
    path = '/kaggle/input/atmacup-18-cnn-exp001/atmacup_18_cnn_exp001/submissions/submission_oof.csv'
    cnn_test_df = pd.read_csv(path)
    pred_cols = CFG.target_col

    test_df[_oof_feat_cols] = cnn_test_df[pred_cols]

    oof_feat_cols.extend(_oof_feat_cols)   

1.0
['atmacup_18_cnn_exp001_x_0', 'atmacup_18_cnn_exp001_y_0', 'atmacup_18_cnn_exp001_z_0', 'atmacup_18_cnn_exp001_x_1', 'atmacup_18_cnn_exp001_y_1', 'atmacup_18_cnn_exp001_z_1', 'atmacup_18_cnn_exp001_x_2', 'atmacup_18_cnn_exp001_y_2', 'atmacup_18_cnn_exp001_z_2', 'atmacup_18_cnn_exp001_x_3', 'atmacup_18_cnn_exp001_y_3', 'atmacup_18_cnn_exp001_z_3', 'atmacup_18_cnn_exp001_x_4', 'atmacup_18_cnn_exp001_y_4', 'atmacup_18_cnn_exp001_z_4', 'atmacup_18_cnn_exp001_x_5', 'atmacup_18_cnn_exp001_y_5', 'atmacup_18_cnn_exp001_z_5']
1.0


In [22]:
print(f"train_dfの行数: {len(train_df)}")
print(f"cnn_train_dfの行数: {len(cnn_train_df)}")


train_dfの行数: 5848
cnn_train_dfの行数: 5848


# make shift feature

In [23]:
def make_shift_feature(target_df, use_feat_cols):
    shift_count = 1
    shift_range = list(range(-shift_count, shift_count+1))
    shift_range = [x for x in shift_range if x != 0]

    target_df['ori_idx'] = target_df.index

    target_df = target_df.sort_values(['scene', 'scene_sec']).reset_index(drop=True)

    shift_feat_cols = []
    for shift in shift_range:
        for col in use_feat_cols:
            shift_col = f'{col}_shift{shift}'
            target_df[shift_col] = target_df.groupby('scene')[col].shift(shift)
            shift_feat_cols.append(shift_col)

            diff_col = f'{col}_diff{shift}'
            target_df[diff_col] = target_df[col] - target_df[shift_col]
            shift_feat_cols.append(diff_col)

    target_df = target_df.sort_values('ori_idx').reset_index(drop=True)
    target_df = target_df.drop('ori_idx', axis=1)

    return target_df, shift_feat_cols

In [24]:

use_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake',
       'brakePressed', 'gas', 'gasPressed',  'leftBlinker',
       'rightBlinker']

use_cols += oof_feat_cols

# train_df, shift_feat_cols = make_shift_feature(train_df, use_cols)
# test_df, shift_feat_cols = make_shift_feature(test_df, use_cols)

# 時系列特徴量の追加

In [25]:
def add_time_series_features(df, use_feat_cols, window=3):
    """
    時系列特徴量を作成。
    """
    for col in use_feat_cols:
        # 移動平均
        df[f'{col}_rolling_mean'] = df.groupby('scene')[col].transform(
            lambda x: x.rolling(window, min_periods=1).mean()
        )
        # 移動標準偏差
        df[f'{col}_rolling_std'] = df.groupby('scene')[col].transform(
            lambda x: x.rolling(window, min_periods=1).std()
        )
        # 累積合計と最小値・最大値
        df[f'{col}_cumsum'] = df.groupby('scene')[col].cumsum()
        df[f'{col}_cummin'] = df.groupby('scene')[col].cummin()
        df[f'{col}_cummax'] = df.groupby('scene')[col].cummax()
    return df


In [26]:
# シフト特徴量を作成
train_df, shift_feat_cols = make_shift_feature(train_df, use_cols)
test_df, shift_feat_cols = make_shift_feature(test_df, use_cols)

# 002a追加特徴量作成

In [27]:
#1. 時系列のラグ特徴量・変化率
def add_lag_features(df, cols, group_col='scene'):
    """
    ラグ特徴量と差分を作成する。
    """
    for col in cols:
        df[f'lag_{col}'] = df.groupby(group_col)[col].shift(1)
        df[f'diff_{col}'] = df[col] - df[f'lag_{col}']
    return df


In [28]:
#2. シーン単位の集約特徴量
def add_scene_agg_features(df, cols, group_col='scene'):
    """
    シーン単位の集約特徴量を追加する。
    """
    group = df.groupby(group_col)
    for col in cols:
        df[f'{col}_mean'] = group[col].transform('mean')
        df[f'{col}_std'] = group[col].transform('std')
        df[f'{col}_max'] = group[col].transform('max')
        df[f'{col}_min'] = group[col].transform('min')
    return df


In [29]:
# カラムの一覧を表示
print(train_df.columns.tolist())

['ID', 'vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake', 'brakePressed', 'gas', 'gasPressed', 'gearShifter', 'leftBlinker', 'rightBlinker', 'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5', 'scene', 'fold', 'scene_sec', 'scene_count', 'class', 'traffic_lights_counts', 'atmacup_18_cnn_exp001_x_0', 'atmacup_18_cnn_exp001_y_0', 'atmacup_18_cnn_exp001_z_0', 'atmacup_18_cnn_exp001_x_1', 'atmacup_18_cnn_exp001_y_1', 'atmacup_18_cnn_exp001_z_1', 'atmacup_18_cnn_exp001_x_2', 'atmacup_18_cnn_exp001_y_2', 'atmacup_18_cnn_exp001_z_2', 'atmacup_18_cnn_exp001_x_3', 'atmacup_18_cnn_exp001_y_3', 'atmacup_18_cnn_exp001_z_3', 'atmacup_18_cnn_exp001_x_4', 'atmacup_18_cnn_exp001_y_4', 'atmacup_18_cnn_exp001_z_4', 'atmacup_18_cnn_exp001_x_5', 'atmacup_18_cnn_exp001_y_5', 'atmacup_18_cnn_exp001_z_5', 'vEgo_shift-1', 'vEgo_diff-1', 'aEgo_shift-1', 'aEgo_diff-1', 'steeringAngleDeg_shift-1', 'steeringAngleDeg_diff-1', 'st

In [30]:
#3. 信号機との相互作用特徴量

def add_signal_interaction_features(df):
    """
    統合済みの信号機情報を利用して信号機との相互作用特徴量を作成する。
    """
    # 信号の状態（ワンホットエンコード）
    df['signal_red'] = (df['class'] == 'red').astype(int)
    df['signal_green'] = (df['class'] == 'green').astype(int)

    # 信号機のカウント（例: 近くの信号機の数）
    df['traffic_lights_counts'] = df['traffic_lights_counts'].fillna(0).astype(int)

    # 信号の状態と速度の相互作用特徴量
    df['speed_signal_red'] = df['vEgo'] * df['signal_red']
    df['speed_signal_green'] = df['vEgo'] * df['signal_green']

    return df




In [31]:
#4. 速度×舵角の相互作用特徴量
def add_speed_steering_interaction(df):
    """
    速度と舵角の相互作用特徴量を作成する。
    """
    df['speed_steering_interaction'] = df['vEgo'] * df['steeringAngleDeg']
    return df


In [32]:
#全ての特徴量を管理する関数を統合
def apply_features(df, signal_df=None):
    # 時系列のラグ特徴量・変化率
    lag_cols = ['vEgo', 'aEgo']
    df = add_lag_features(df, lag_cols)

    # シーン単位の集約特徴量
    scene_cols = ['vEgo', 'aEgo']
    df = add_scene_agg_features(df, scene_cols)

    # 信号機との相互作用特徴量
    df = add_signal_interaction_features(df)  # 引数は df のみ

    # 速度×舵角の相互作用特徴量
    df = add_speed_steering_interaction(df)

    # # 深度情報の統計量
    # if depth_data is not None:
    #     df = add_depth_statistics(df, depth_data)

    return df


In [33]:
train_df = apply_features(train_df, signal_df=traffic_lights_df)
test_df = apply_features(test_df, signal_df=traffic_lights_df)

# feature block

In [34]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from typing import List

class AbstractBaseBlock:
    """
    https://www.guruguru.science/competitions/16/discussions/95b7f8ec-a741-444f-933a-94c33b9e66be/
    """

    def __init__(self) -> None:
        pass

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        # return self.transform(input_df)
        raise NotImplementedError()

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        raise NotImplementedError()


def run_block(input_df: pd.DataFrame, blocks: List[AbstractBaseBlock], is_fit):
    output_df = pd.DataFrame()
    for block in blocks:
        name = block.__class__.__name__

        if is_fit:
            # print(f'fit: {name}')
            _df = block.fit(input_df)
        else:
            # print(f'transform: {name}')
            _df = block.transform(input_df)

        # print(f'concat: {name}')
        output_df = pd.concat([output_df, _df], axis=1)
    return output_df

class NumericBlock(AbstractBaseBlock):
    def __init__(self, col: str) -> None:
        super().__init__()
        self.col = col

    def fit(self, input_df):
        return self.transform(input_df)

    def transform(self, input_df):
        output_df = pd.DataFrame()
        output_df[self.col] = input_df[self.col].copy()
        return output_df

class LabelEncodingBlock(AbstractBaseBlock):
    def __init__(self, col: str) -> None:
        super().__init__()
        self.col = col
        self.encoder = LabelEncoder()

    def fit(self, input_df):
        # return self.transform(input_df)

        self.encoder.fit(input_df[self.col])
        return self.transform(input_df)

    def transform(self, input_df):
        output_df = pd.DataFrame()

        # output_df[self.col] = self.encoder.fit_transform(input_df[self.col])

        # self.encoder.fit(input_df[self.col])
        output_df[self.col] = self.encoder.transform(input_df[self.col])
        return output_df.add_suffix('@le')

class CountEncodingBlock(AbstractBaseBlock):
    def __init__(self, col: str) -> None:
        super().__init__()
        self.col = col

    def fit(self, input_df):
        self.val_count_dict = {}
        self.val_count = input_df[self.col].value_counts()
        return self.transform(input_df)

    def transform(self, input_df):
        output_df = pd.DataFrame()
        output_df[self.col] = input_df[self.col].map(self.val_count)
        return output_df.add_suffix('@ce')

# make feature

In [35]:
#002a追加#######
# ======= train_df, test_df 共通の処理 =======

# 基本の数値特徴量
base_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake',
             'brakePressed', 'gas', 'gasPressed', 'leftBlinker', 'rightBlinker', 'scene_sec']

# 新規の特徴量
lag_cols = [f'lag_{col}' for col in ['vEgo', 'aEgo']]
diff_cols = [f'diff_{col}' for col in ['vEgo', 'aEgo']]
agg_cols = [f'{col}_{stat}' for col in ['vEgo', 'aEgo'] for stat in ['mean', 'std', 'max', 'min']]
interaction_cols = ['speed_steering_interaction']
signal_interaction_cols = [ 'signal_red', 'signal_green', 'speed_signal_red', 
                           'speed_signal_green', 'traffic_lights_counts']

# 特徴量管理
USE_LAG_FEATURES = True
USE_DIFF_FEATURES = True
USE_AGG_FEATURES = True
USE_INTERACTION_FEATURES = True
USE_SIGNAL_FEATURES = True

# 動的に使用する特徴量を構築
num_cols = base_cols + oof_feat_cols + shift_feat_cols + ['scene_count']
if USE_LAG_FEATURES:
    num_cols += lag_cols
if USE_DIFF_FEATURES:
    num_cols += diff_cols
if USE_AGG_FEATURES:
    num_cols += agg_cols
if USE_INTERACTION_FEATURES:
    num_cols += interaction_cols
if USE_SIGNAL_FEATURES:
    num_cols += signal_interaction_cols

# 出力確認
print(f'使用する数値特徴量: {num_cols}')

# 信号機関連の特徴量を追加
train_df = add_signal_interaction_features(train_df)
test_df = add_signal_interaction_features(test_df)

# train_dfとtest_dfを結合
train_num = len(train_df)  # train_dfの行数
whole_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)


# カテゴリ変数の定義
cat_label_cols = ['gearShifter']
cat_count_cols = []
cat_te_cols = []

# エンコード処理
blocks = [
    *[NumericBlock(col) for col in num_cols],  # 動的に構築された数値特徴量
    *[LabelEncodingBlock(col) for col in cat_label_cols],
    *[CountEncodingBlock(col) for col in cat_count_cols],
]

# 特徴量の構築
whole_feat_df = run_block(whole_df, blocks, is_fit=True)

# train_df, test_dfに分割
train_df, test_df = whole_df.iloc[:train_num], whole_df.iloc[train_num:].drop(
    columns=CFG.target_col).reset_index(drop=True)

train_feat, test_feat = whole_feat_df.iloc[:train_num], whole_feat_df.iloc[train_num:].reset_index(
    drop=True)

# 最終確認
print('最終的に使用する特徴量数:', len(train_feat.columns))

######################


使用する数値特徴量: ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake', 'brakePressed', 'gas', 'gasPressed', 'leftBlinker', 'rightBlinker', 'scene_sec', 'atmacup_18_cnn_exp001_x_0', 'atmacup_18_cnn_exp001_y_0', 'atmacup_18_cnn_exp001_z_0', 'atmacup_18_cnn_exp001_x_1', 'atmacup_18_cnn_exp001_y_1', 'atmacup_18_cnn_exp001_z_1', 'atmacup_18_cnn_exp001_x_2', 'atmacup_18_cnn_exp001_y_2', 'atmacup_18_cnn_exp001_z_2', 'atmacup_18_cnn_exp001_x_3', 'atmacup_18_cnn_exp001_y_3', 'atmacup_18_cnn_exp001_z_3', 'atmacup_18_cnn_exp001_x_4', 'atmacup_18_cnn_exp001_y_4', 'atmacup_18_cnn_exp001_z_4', 'atmacup_18_cnn_exp001_x_5', 'atmacup_18_cnn_exp001_y_5', 'atmacup_18_cnn_exp001_z_5', 'vEgo_shift-1', 'vEgo_diff-1', 'aEgo_shift-1', 'aEgo_diff-1', 'steeringAngleDeg_shift-1', 'steeringAngleDeg_diff-1', 'steeringTorque_shift-1', 'steeringTorque_diff-1', 'brake_shift-1', 'brake_diff-1', 'brakePressed_shift-1', 'brakePressed_diff-1', 'gas_shift-1', 'gas_diff-1', 'gasPressed_shift-1', 'gasPressed_diff-1', 'l

In [36]:
# # ======= train_df, test_df 共通の処理 =======

# num_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake',
#        'brakePressed', 'gas', 'gasPressed',  'leftBlinker',
#        'rightBlinker']
# num_cols += ['scene_sec']
# num_cols += oof_feat_cols
# num_cols += shift_feat_cols
# num_cols += ['scene_count']

# # #001追加######
# # time_series_cols = [f'{col}_rolling_mean' for col in use_cols] + \
# #                    [f'{col}_rolling_std' for col in use_cols] + \
# #                    [f'{col}_cumsum' for col in use_cols] + \
# #                    [f'{col}_cummin' for col in use_cols] + \
# #                    [f'{col}_cummax' for col in use_cols]

# # num_cols += time_series_cols
# # ##############

# agg_num_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake', 'gas']

# cat_label_cols = ['gearShifter']
# cat_count_cols = []
# cat_te_cols = []

# train_num = len(train_df)
# whole_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# blocks = [
#     *[NumericBlock(col) for col in num_cols],
#     *[LabelEncodingBlock(col) for col in cat_label_cols],
#     *[CountEncodingBlock(col) for col in cat_count_cols],
#     # *[AggBlock(col, target_cols=agg_num_cols,
#     #            agg_cols=['mean', 'max', 'min', 'std']) for col in ['scene']],
# ]
# whole_feat_df = run_block(whole_df, blocks, is_fit=True)

# # ======= train_df, test_df 別々に処理 =======

# train_df, test_df = whole_df.iloc[:train_num], whole_df.iloc[train_num:].drop(
#     columns=CFG.target_col).reset_index(drop=True)
# train_feat, test_feat = whole_feat_df.iloc[:train_num], whole_feat_df.iloc[train_num:].reset_index(
#     drop=True)

# blocks = [
#     # *[TargetEncodingBlock(col, CFG.target_col) for col in cat_te_cols]
# ]

# _df = run_block(train_df, blocks, is_fit=True)
# train_feat = pd.concat([train_feat, _df], axis=1)
# _df = run_block(test_df, blocks, is_fit=False)
# test_feat = pd.concat([test_feat, _df], axis=1)

# print('use_col len', len(train_feat.columns))

In [37]:
y = train_df[CFG.target_col]
folds = train_df['fold']

In [38]:
train_feat.shape, test_feat.shape

((5848, 161), (1978, 161))

In [39]:
train_null_df = train_feat.isnull().sum()
print(train_null_df)
train_null_df.to_csv(CFG.outputs_path + 'train_null.csv')

vEgo                     0
aEgo                     0
steeringAngleDeg         0
steeringTorque           0
brake                    0
                        ..
signal_green             0
speed_signal_red         0
speed_signal_green       0
traffic_lights_counts    0
gearShifter@le           0
Length: 161, dtype: int64


In [40]:
test_null_df = test_feat.isnull().sum()
print(test_null_df)
test_null_df.to_csv(CFG.outputs_path + 'test_null.csv')

vEgo                     0
aEgo                     0
steeringAngleDeg         0
steeringTorque           0
brake                    0
                        ..
signal_green             0
speed_signal_red         0
speed_signal_green       0
traffic_lights_counts    0
gearShifter@le           0
Length: 161, dtype: int64


# model

時系列特徴量が増えることで、LightGBMの深さ（max_depth）や葉の数（num_leaves）を増やすことで性能が向上する可能性があります。

'num_leaves': 128,  # デフォルトより増加

'max_depth': 7,     # デフォルトより増加

In [41]:
import lightgbm as lgb

class LightGBM:

    def __init__(self, lgb_params, save_dir=None, imp_dir=None, categorical_feature=None,
                 model_name='lgb',
                 stopping_rounds=50) -> None:
        self.save_dir = save_dir
        self.imp_dir = imp_dir
        self.lgb_params = lgb_params
        self.categorical_feature = categorical_feature

        # saveの切り替え用
        self.model_name = model_name

        self.stopping_rounds = stopping_rounds

    def fit(self, x_train, y_train, **fit_params) -> None:

        X_val, y_val = fit_params['eval_set'][0]
        del fit_params['eval_set']

        train_dataset = lgb.Dataset(
            x_train, y_train, categorical_feature=self.categorical_feature)

        val_dataset = lgb.Dataset(
            X_val, y_val, categorical_feature=self.categorical_feature)

        self.model = lgb.train(params=self.lgb_params,
                               train_set=train_dataset,
                               valid_sets=[train_dataset, val_dataset],
                               callbacks=[lgb.early_stopping(stopping_rounds=self.stopping_rounds,
                                                             verbose=True),
                                          lgb.log_evaluation(500)],
                               **fit_params
                               )

    def save(self, fold):
        save_to = f'{self.save_dir}lgb_fold_{fold}_{self.model_name}.txt'
        self.model.save_model(save_to)

    def predict(self, x):
        return self.model.predict(x)

    def predict_proba(self, x):
        return self.model.predict_proba(x)
    
def get_model(model_name):
    lgb_params = {'objective': CFG.objective_cv, 'boosting_type': 'gbdt', 'verbose': -1,
                'n_jobs': 8, 'seed': CFG.seed, 'learning_rate': 0.01,
                # 'num_class': CFG.num_class, # multiclassなら必要
                'metric': 'mae',
                'num_leaves': 64,
                'max_depth': 5,
                'bagging_seed': CFG.seed,
                'feature_fraction_seed': CFG.seed,
                'drop_seed': CFG.seed,
                }
    model = LightGBM(lgb_params=lgb_params,
                    imp_dir=CFG.figures_dir, save_dir=CFG.model_dir,
                    model_name=model_name)

    return model

def get_fit_params(model_name):
    params = {
        'num_boost_round': 100000
    }

    return params

In [42]:
def main(train_df, X_train, y, folds, test_df):
    eval_func = eval(CFG.metrics)

    oof_predictions = np.zeros((X_train.shape[0], CFG.target_size))
    test_predictions = np.zeros((test_df.shape[0], CFG.target_size))

    for target_idx in range(CFG.target_size):
        Logger.info(f'target {target_idx}')

        for fold in range(CFG.n_fold):
            Logger.info(f'Training fold {fold + 1}')
            target_col = CFG.target_col[target_idx]

            model_name = f'lgb_{target_col}'
            model = get_model(model_name)
            fit_params = get_fit_params(model_name)

            trn_ind = folds != fold
            val_ind = folds == fold

            x_train, x_val = X_train.loc[trn_ind], X_train.loc[val_ind]
            y_train, y_val = y.loc[trn_ind, target_col], y.loc[val_ind, target_col]
            eval_set = [(x_val, y_val)]

            fit_params_fold = fit_params.copy()
            fit_params_fold['eval_set'] = eval_set

            model.fit(x_train, y_train, **fit_params_fold)

            if hasattr(model, 'save'):
                model.save(fold)
            if hasattr(model, 'plot_importance'):
                model.plot_importance(fold)

            oof_predictions[val_ind, target_idx] = model.predict(x_val)

            test_predictions[:, target_idx] += model.predict(test_df)

    score = eval_func(y.values, oof_predictions)
    Logger.info(f'oof result {score}')

    # model.plot_importance_all(n_fold=CFG.n_fold)

    pred_cols = [f'pred_{i}' for i in range(CFG.target_size)]

    oof = train_df.copy()
    oof[pred_cols] = oof_predictions
    oof[CFG.target_col] = y

    oof_feat = X_train.copy()
    oof_feat[pred_cols] = oof_predictions
    oof_feat[CFG.target_col] = y

    get_result(oof)

    # save
    oof.to_csv(CFG.submission_dir + 'oof_gbdt.csv', index=False)
    oof_feat.to_csv(CFG.submission_dir + 'oof_feat_gbdt.csv', index=False)

    test_predictions /= CFG.n_fold

    test_df[CFG.target_col] = test_predictions
    test_df.to_csv(CFG.submission_dir +
                'submission_oof.csv', index=False)
    test_df[CFG.target_col].to_csv(
        CFG.submission_dir + f'submission_{CFG.exp_name}.csv', index=False)

    #sample_sub = pd.read_csv(CFG.comp_dataset_path + 'atmaCup18__sample_submit.csv')
    sample_sub = pd.read_csv('/kaggle/input/atmacup18-sample-submit/atmaCup18__sample_submit.csv')
    print('sample_sub_len: ', len(sample_sub))
    print('sub_len: ', len(test_df))

In [43]:
main(train_df, train_feat, y, folds, test_feat)

target 0
Training fold 1


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.0563357	valid_1's l1: 0.0722196


Training fold 2


Early stopping, best iteration is:
[779]	training's l1: 0.047682	valid_1's l1: 0.0700422
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.056399	valid_1's l1: 0.067806


Training fold 3


Early stopping, best iteration is:
[699]	training's l1: 0.0493389	valid_1's l1: 0.0654313
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.0569031	valid_1's l1: 0.0654257


target 1
Training fold 1


Early stopping, best iteration is:
[637]	training's l1: 0.0510308	valid_1's l1: 0.0633623
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[357]	training's l1: 0.0280023	valid_1's l1: 0.0363021
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[334]	training's l1: 0.0292126	valid_1's l1: 0.0317876
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.0279983	valid_1's l1: 0.0308717


target 2
Training fold 1


Early stopping, best iteration is:
[472]	training's l1: 0.0282094	valid_1's l1: 0.0308311
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[3]	training's l1: 0.0273492	valid_1's l1: 0.0265324
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[137]	training's l1: 0.0248475	valid_1's l1: 0.0271638
Training until validation scores don't improve for 50 rounds


target 3
Training fold 1


Early stopping, best iteration is:
[59]	training's l1: 0.025835	valid_1's l1: 0.0272619
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.119844	valid_1's l1: 0.167989


Training fold 2


Early stopping, best iteration is:
[782]	training's l1: 0.100193	valid_1's l1: 0.160946
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.122412	valid_1's l1: 0.145488
Early stopping, best iteration is:
[852]	training's l1: 0.10217	valid_1's l1: 0.138046


Training fold 3


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.122489	valid_1's l1: 0.145169


target 4
Training fold 1


Early stopping, best iteration is:
[721]	training's l1: 0.104632	valid_1's l1: 0.139098
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[412]	training's l1: 0.060649	valid_1's l1: 0.0807518
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[405]	training's l1: 0.0625265	valid_1's l1: 0.0713102
Training until validation scores don't improve for 50 rounds


target 5
Training fold 1


Early stopping, best iteration is:
[397]	training's l1: 0.063095	valid_1's l1: 0.0720621
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[33]	training's l1: 0.0548401	valid_1's l1: 0.0555093
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[183]	training's l1: 0.0504297	valid_1's l1: 0.0548676
Training until validation scores don't improve for 50 rounds


target 6
Training fold 1


Early stopping, best iteration is:
[41]	training's l1: 0.0540878	valid_1's l1: 0.0567847
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.202857	valid_1's l1: 0.298742
[1000]	training's l1: 0.1625	valid_1's l1: 0.287311
Early stopping, best iteration is:
[957]	training's l1: 0.164058	valid_1's l1: 0.287244


Training fold 2


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.206139	valid_1's l1: 0.257592
Early stopping, best iteration is:
[892]	training's l1: 0.169467	valid_1's l1: 0.244995


Training fold 3


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.209535	valid_1's l1: 0.250457


target 7
Training fold 1


Early stopping, best iteration is:
[821]	training's l1: 0.175617	valid_1's l1: 0.239123
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[447]	training's l1: 0.102333	valid_1's l1: 0.135996
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.101876	valid_1's l1: 0.127305


Training fold 3


Early stopping, best iteration is:
[501]	training's l1: 0.10182	valid_1's l1: 0.127296
Training until validation scores don't improve for 50 rounds


target 8
Training fold 1


Early stopping, best iteration is:
[402]	training's l1: 0.106554	valid_1's l1: 0.126806
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[68]	training's l1: 0.0815942	valid_1's l1: 0.0837423
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[176]	training's l1: 0.0762934	valid_1's l1: 0.0852242
Training until validation scores don't improve for 50 rounds


target 9
Training fold 1


Early stopping, best iteration is:
[59]	training's l1: 0.0812482	valid_1's l1: 0.087576
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.314951	valid_1's l1: 0.494067
[1000]	training's l1: 0.255143	valid_1's l1: 0.474815
Early stopping, best iteration is:
[1112]	training's l1: 0.248592	valid_1's l1: 0.474555


Training fold 2


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.325908	valid_1's l1: 0.412176
[1000]	training's l1: 0.262196	valid_1's l1: 0.389355
Early stopping, best iteration is:
[1142]	training's l1: 0.253098	valid_1's l1: 0.388801


Training fold 3


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.328271	valid_1's l1: 0.40295


target 10
Training fold 1


Early stopping, best iteration is:
[749]	training's l1: 0.28544	valid_1's l1: 0.383643
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[393]	training's l1: 0.163878	valid_1's l1: 0.213028
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.159881	valid_1's l1: 0.208724


Training fold 3


Early stopping, best iteration is:
[565]	training's l1: 0.15629	valid_1's l1: 0.208417
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.160249	valid_1's l1: 0.203164


target 11
Training fold 1


Early stopping, best iteration is:
[551]	training's l1: 0.157994	valid_1's l1: 0.202982
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[86]	training's l1: 0.108907	valid_1's l1: 0.112727
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[124]	training's l1: 0.1063	valid_1's l1: 0.113587
Training until validation scores don't improve for 50 rounds


target 12
Training fold 1


Early stopping, best iteration is:
[85]	training's l1: 0.107749	valid_1's l1: 0.117827
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.461065	valid_1's l1: 0.731748
[1000]	training's l1: 0.37683	valid_1's l1: 0.703396
Early stopping, best iteration is:
[1173]	training's l1: 0.361664	valid_1's l1: 0.702486


Training fold 2


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.480795	valid_1's l1: 0.610081
Early stopping, best iteration is:
[913]	training's l1: 0.399212	valid_1's l1: 0.584972


Training fold 3


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.486742	valid_1's l1: 0.597368
Early stopping, best iteration is:
[943]	training's l1: 0.403755	valid_1's l1: 0.567386


target 13
Training fold 1


Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[384]	training's l1: 0.241996	valid_1's l1: 0.317965
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.233472	valid_1's l1: 0.313135


Training fold 3


Early stopping, best iteration is:
[604]	training's l1: 0.224575	valid_1's l1: 0.312776
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.235624	valid_1's l1: 0.300864


target 14
Training fold 1


Early stopping, best iteration is:
[561]	training's l1: 0.231574	valid_1's l1: 0.300377
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[94]	training's l1: 0.137485	valid_1's l1: 0.141365
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[134]	training's l1: 0.134248	valid_1's l1: 0.142987
Training until validation scores don't improve for 50 rounds


target 15
Training fold 1


Early stopping, best iteration is:
[89]	training's l1: 0.136474	valid_1's l1: 0.14943
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.639705	valid_1's l1: 1.02077
[1000]	training's l1: 0.520811	valid_1's l1: 0.977932
[1500]	training's l1: 0.459413	valid_1's l1: 0.971979
Early stopping, best iteration is:
[1623]	training's l1: 0.445246	valid_1's l1: 0.971232


Training fold 2


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.666725	valid_1's l1: 0.851602
Early stopping, best iteration is:
[912]	training's l1: 0.551403	valid_1's l1: 0.817385


Training fold 3


Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.678408	valid_1's l1: 0.835329
[1000]	training's l1: 0.55573	valid_1's l1: 0.795075
[1500]	training's l1: 0.486691	valid_1's l1: 0.789833
Early stopping, best iteration is:
[1610]	training's l1: 0.473253	valid_1's l1: 0.788658


target 16
Training fold 1


Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[396]	training's l1: 0.335782	valid_1's l1: 0.450468
Training until validation scores don't improve for 50 rounds
[500]	training's l1: 0.327111	valid_1's l1: 0.440664


Training fold 3


Early stopping, best iteration is:
[566]	training's l1: 0.319612	valid_1's l1: 0.439559
Training until validation scores don't improve for 50 rounds


target 17
Training fold 1


Early stopping, best iteration is:
[399]	training's l1: 0.344396	valid_1's l1: 0.428506
Training until validation scores don't improve for 50 rounds


Training fold 2


Early stopping, best iteration is:
[98]	training's l1: 0.167017	valid_1's l1: 0.173208
Training until validation scores don't improve for 50 rounds


Training fold 3


Early stopping, best iteration is:
[170]	training's l1: 0.160583	valid_1's l1: 0.174239
Training until validation scores don't improve for 50 rounds


oof result 2.2910163071602834
best_score: 2.2910


Early stopping, best iteration is:
[196]	training's l1: 0.156394	valid_1's l1: 0.181985
sample_sub_len:  1727
sub_len:  1978


In [44]:
oof = pd.read_csv(CFG.submission_dir + 'oof_feat_gbdt.csv')
sub_oof = pd.read_csv(CFG.submission_dir + f'submission_oof.csv')

In [45]:
for i, col in enumerate(CFG.target_col):
    y_true = oof[col].values
    y_pred = oof[f'pred_{i}'].values

    score = calc_mae_atmacup(y_true, y_pred)
    Logger.info(f'{col} score: {score}')

x_0 score: 1.6955865924063211
y_0 score: 0.042698099059773176
z_0 score: 0.0271043710643319
x_1 score: 3.579663340793874
y_1 score: 0.10935320723674781
z_1 score: 0.0561997622126286
x_2 score: 5.478214401283882
y_2 score: 0.2020748178888517
z_2 score: 0.08607276819983593
x_3 score: 7.399346868755661
y_3 score: 0.32644771145155244
z_3 score: 0.1166650863412536
x_4 score: 9.339573970145995
y_4 score: 0.4822157899148475
z_4 score: 0.1482446078329665
x_5 score: 11.2971343629303
y_5 score: 0.6704619173934191
z_5 score: 0.1812358539728598


# スコアなど確認

In [46]:
importance = pd.DataFrame({
    'feature': train_feat.columns,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(importance.head(20))


NameError: name 'model' is not defined

In [None]:
# # LightGBMモデルで特徴量重要度を確認
# import lightgbm as lgb

# train_X = train_df.drop(columns=CFG.target_col)
# train_y = train_df[CFG.target_col]
# model = lgb.LGBMRegressor()
# model.fit(train_X, train_y)

# # 特徴量重要度を確認
# importance = pd.DataFrame({
#     'feature': train_X.columns,
#     'importance': model.feature_importances_
# }).sort_values(by='importance', ascending=False)

# print(importance.head(20))  # 上位20特徴量を表示


In [None]:
# # ターゲットごとのスコアをDataFrameで表示
# scores = []
# for i, col in enumerate(CFG.target_col):
#     y_true = oof[col].values
#     y_pred = oof[f'pred_{i}'].values

#     score = calc_mae_atmacup(y_true, y_pred)
#     scores.append({'Target': col, 'MAE': score})

#     Logger.info(f'{col} score: {score}')

# # DataFrameに変換
# score_df = pd.DataFrame(scores)

# # 表示
# print(score_df)


In [None]:
# # 全体スコアの計算
# score = eval_func(y.values, oof_predictions)
# print(f"Overall OOF Score (MAE): {score:.4f}")  # 見やすい表示
