In [1]:
import os
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Experiment tag; it is used as the prefix of the submission filename.
exp_name = '003_LightGBM_Baseline'

In [4]:
# Input / output directories.
# The defaults are the original author's local paths; set ATMACUP18_INPUT_DIR /
# ATMACUP18_OUTPUT_DIR to run the notebook on another machine without editing it.
DATA_PATH = Path(os.environ.get('ATMACUP18_INPUT_DIR', '/Users/gouyashuto/localrepository/atmacup18/input'))
OUTPUT_DIR = Path(os.environ.get('ATMACUP18_OUTPUT_DIR', '/Users/gouyashuto/localrepository/atmacup18/output'))

In [5]:
# Read the raw train / test feature tables from DATA_PATH.
train_df, test_df = (
    pl.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv')
)

# Show (rows, columns) of each table, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(43371, 30)
(1727, 12)


In [6]:
# Columns that are ordinal-encoded before modelling.
CAT_COLS = ['gearShifter', 'scene']

# Regression targets: x / y / z for each index 0..5, ordered index-major
# (x_0, y_0, z_0, x_1, ...).
TARGET_COLS = [f'{axis}_{step}' for step in range(6) for axis in ('x', 'y', 'z')]

In [7]:
def get_agg_exprs(agg_cols, group_col: str = "scene") -> list[pl.Expr]:
    """Build per-group window expressions for every column in ``agg_cols``.

    The frame these expressions are applied to is expected to be sorted in
    time order within each group (``preprocess`` sorts by scene and
    decisecond before using them); "next" / "previous" below refer to that
    row order.

    Args:
        agg_cols: Iterable of column names to derive features from. It is
            materialised once, so a generator is accepted as well.
        group_col: Column that defines the window. Defaults to ``"scene"``.

    Returns:
        A flat list of expressions, grouped by feature family and then by
        column, aliased ``"{col}_{suffix}"`` with these suffixes:

        * ``shift-1`` - value of the NEXT row in the group (null on the last row)
        * ``shift1``  - value of the PREVIOUS row in the group (null on the first row)
        * ``diff-1``  - current value minus the NEXT row's value
        * ``diff1``   - current value minus the PREVIOUS row's value
        * ``mean`` / ``std`` / ``max`` / ``min`` - statistic over the whole group
    """
    # The columns are walked once per feature family; without this a generator
    # argument would be exhausted after the first family.
    agg_cols = list(agg_cols)

    # (alias suffix, transformation) in the exact order the output columns appear.
    feature_families = [
        ("shift-1", lambda col: col.shift(-1)),  # next time step
        ("shift1", lambda col: col.shift(1)),    # previous time step
        ("diff-1", lambda col: col.diff(-1)),    # current - next
        ("diff1", lambda col: col.diff(1)),      # current - previous
        ("mean", lambda col: col.mean()),
        ("std", lambda col: col.std()),
        ("max", lambda col: col.max()),
        ("min", lambda col: col.min()),
    ]
    return [
        transform(pl.col(agg_col)).over(group_col).alias(f"{agg_col}_{suffix}")
        for suffix, transform in feature_families
        for agg_col in agg_cols
    ]

In [8]:
def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    """Add scene keys and per-scene window features to a raw feature table.

    ``ID`` is split on ``'_'``: the first piece becomes ``scene`` and the
    second piece, cast to Int32, becomes ``decisecond``. Rows are then sorted
    by (scene, decisecond) and the expressions from ``get_agg_exprs`` are
    appended. Note that the returned frame is therefore in sorted order, not
    in the input row order.
    """
    # Columns from which the per-scene features are derived.
    agg_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'gas']

    id_parts = pl.col('ID').str.split('_')
    keyed = df.with_columns(
        scene=id_parts.list[0],
        decisecond=id_parts.list[1].cast(pl.Int32),
    )
    # The shift/diff features depend on row order inside each scene.
    ordered = keyed.sort(['scene', 'decisecond'])
    return ordered.with_columns(get_agg_exprs(agg_cols))

In [None]:
def train(X: pd.DataFrame, y: pd.DataFrame, target: str):
    """Fit one LightGBM regressor per GroupKFold fold for a single target.

    Args:
        X: Feature frame. It must contain a 'scene' column, which is used only
            as the grouping key of the CV split and is dropped before fitting.
        y: Frame holding the target columns; only ``y[target]`` is read.
        target: Name of the column of ``y`` to fit.

    Returns:
        ``(models, oof)``: the fitted boosters (one per fold) and an array of
        length ``len(X)`` holding each row's out-of-fold prediction.
    """
    # NOTE(review): the objective is 'regression' while early stopping and the
    # final score use MAE - confirm this mismatch is intentional.
    lgb_params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
    }
    labels = y[target]
    # 'scene' is only the grouping key, so the model never sees it.
    features = X.drop(columns=['scene'])
    splitter = GroupKFold(n_splits=5)

    models = []
    oof = np.zeros(len(X))
    fold_indices = splitter.split(X, labels, groups=X['scene'])
    for fold, (train_idx, valid_idx) in enumerate(fold_indices):
        print('=' * 10, f'fold: {fold} start' + '=' * 10)
        valid_X = features.iloc[valid_idx]

        train_data = lgb.Dataset(features.iloc[train_idx], labels.iloc[train_idx])
        valid_data = lgb.Dataset(valid_X, labels.iloc[valid_idx], reference=train_data)

        # Up to 10000 rounds; stop once the validation metric has not improved for 100.
        model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(100)],
        )
        oof[valid_idx] = model.predict(valid_X)
        models.append(model)
    return models, oof

def predict(X: pd.DataFrame, models: list):
    """Average the predictions of several fitted models.

    Args:
        X: Feature frame containing a 'scene' column; that column is dropped
            before the frame is handed to the models. ``X`` itself is not modified.
        models: Objects exposing ``predict(frame)``, e.g. the boosters
            returned by ``train``.

    Returns:
        Element-wise mean of the per-model predictions.
    """
    # drop() builds a new frame, so do it once instead of once per model.
    features = X.drop(columns=['scene'])
    return np.mean([model.predict(features) for model in models], axis=0)

def evaluate(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    """Mean absolute error between ``y_true`` and ``y_pred`` as a single float.

    Works for Series as well as DataFrames. The subtraction is left to pandas
    (so labels are aligned as usual); the result is then reduced over every
    element. Reducing on a plain array avoids ``np.mean(DataFrame)``, which
    yields a per-column Series instead of a scalar on pandas versions before 2.0.

    NaN errors are ignored, matching the skip-NaN default of the pandas mean
    this replaces.
    """
    abs_err = np.abs(np.asarray(y_true - y_pred, dtype=float))
    return float(np.nanmean(abs_err))

In [10]:
# preprocess() sorts rows by (scene, decisecond), so the submission row order has
# to be remembered separately. The IDs are re-read from the raw file instead of
# being taken from `test_df`: this cell overwrites `test_df` with the sorted frame,
# so reading from it would silently change the submission order on a re-run.
origin_test_ids = pl.read_csv(DATA_PATH / 'test_features.csv', columns=['ID'])['ID'].to_pandas()

train_df = preprocess(train_df)
test_df = preprocess(test_df)

print(train_df.shape)
print(test_df.shape)

(43371, 72)
(1727, 54)


In [11]:
# Feature / target matrices as pandas objects for scikit-learn and LightGBM.
remove_columns = ['ID']
X = train_df.drop(remove_columns + TARGET_COLS).to_pandas()
y = train_df[TARGET_COLS].to_pandas()

# Select the test features by the training column list so both matrices are
# guaranteed to have the same columns in the same order (a missing column
# raises here). The models are handed these frames without any further
# realignment, so the order must already match.
test_X = test_df.select(list(X.columns)).to_pandas()

# Fit the encoder on train only; categories unseen in train are encoded as -1.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[CAT_COLS] = oe.fit_transform(X[CAT_COLS])
test_X[CAT_COLS] = oe.transform(test_X[CAT_COLS])

print(X.shape)
print(test_X.shape)

(43371, 53)
(1727, 53)


In [12]:
%%time

# Fitted fold models, keyed by target name.
models_dict = {}
# Out-of-fold predictions: one float column per target, row-aligned with `y`.
# Allocated directly instead of converting the whole polars frame to pandas
# just to obtain the shape.
oof = pd.DataFrame(0.0, index=y.index, columns=TARGET_COLS)

# Test predictions, keyed by ID, in the same (sorted) row order as `test_X`.
preds = test_df.select(['ID']).to_pandas()
preds[TARGET_COLS] = 0.0

# One independent set of fold models per target column.
for target in TARGET_COLS:
    print('=' * 10, f'target: {target} start' + '=' * 10)
    partial_model, partial_oofs = train(X, y, target)
    models_dict[target] = partial_model
    oof.loc[:, target] = partial_oofs
    preds.loc[:, target] = predict(test_X, models_dict[target])

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[980]	valid_0's l1: 0.0618087
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[831]	valid_0's l1: 0.0604472
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[879]	valid_0's l1: 0.0633197
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1252]	valid_0's l1: 0.0617252
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1251]	valid_0's l1: 0.0623725
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[500]	valid_0's l1: 0.0324583
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[489]	valid_0's l1: 0.0318535
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[702]	valid_

In [13]:
# Mean absolute error of the out-of-fold predictions, first per target ...
per_target_scores = {target: evaluate(y[target], oof[target]) for target in TARGET_COLS}
for target, target_score in per_target_scores.items():
    print(f'{target} CV score: ', target_score)

# ... then over all targets at once; `score` also goes into the submission filename.
score = evaluate(y, oof)
print('Total CV score: ', score)

x_0 CV score:  0.06193463658709383
y_0 CV score:  0.03247537600638597
z_0 CV score:  0.025956946282137176
x_1 CV score:  0.1321245354288733
y_1 CV score:  0.0738589619561076
z_1 CV score:  0.054090912539904286
x_2 CV score:  0.2247071252047433
y_2 CV score:  0.13150689236672053
z_2 CV score:  0.08340867136678622
x_3 CV score:  0.34828743649531485
y_3 CV score:  0.2161142213121716
z_3 CV score:  0.11406105490110789
x_4 CV score:  0.501219062964356
y_4 CV score:  0.33238063614963015
z_4 CV score:  0.14586561308569806
x_5 CV score:  0.6754816385299667
y_5 CV score:  0.4810698611399575
z_5 CV score:  0.17921763564056822
Total CV score:  0.2118756232198624


In [14]:
# `preds` follows the sorted `test_df`; left-joining it onto `origin_test_ids`
# puts the rows back into the original test-file order, after which the key
# column is dropped.
submission = pd.DataFrame(origin_test_ids).merge(preds, on='ID', how='left').drop(columns=['ID'])

output_path = OUTPUT_DIR / f'{exp_name}_{score:.4f}_submission.csv'
# Refuse to overwrite an earlier submission. An explicit raise is used because
# `assert` statements are removed when Python runs with -O.
if output_path.exists():
    raise FileExistsError(f'output file already exists. {output_path}')

# Create the output directory if it is missing so to_csv does not fail on a fresh setup.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)
submission

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.452000,-0.050903,0.002467,3.026742,-0.119529,0.004675,4.544577,-0.190624,0.002538,5.831274,-0.207156,0.004310,7.260501,-0.123238,-0.005506,8.597789,-0.003056,-0.003567
1,0.943627,0.389217,-0.000403,1.740568,1.004104,-0.002847,2.405530,1.776283,-0.007880,2.792622,2.618284,-0.010512,3.564011,3.554118,-0.008003,4.331309,4.561916,0.000610
2,1.570422,0.016755,0.003852,3.247825,0.015135,0.007196,4.926408,0.044127,0.011402,6.314183,0.053395,0.012119,7.774819,0.119132,0.015144,8.932256,0.197387,0.021751
3,0.834648,0.064781,-0.001332,1.646419,0.220114,-0.006990,2.422596,0.541451,-0.012222,2.967670,0.854678,-0.019287,3.636090,1.512227,-0.021030,4.295287,2.240007,-0.016445
4,0.817229,0.004758,-0.001937,1.411591,0.002420,-0.004815,1.892143,-0.000868,-0.017368,2.266762,-0.003869,-0.038374,2.182538,-0.002873,-0.051923,1.739183,-0.022205,-0.069277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,6.536803,0.009177,0.017746,13.799471,0.048440,0.039545,21.005841,0.130692,0.065820,28.265819,0.220888,0.073843,35.563229,0.328178,0.080398,42.982216,0.452140,0.070313
1723,7.000525,0.000949,0.004384,14.893628,-0.009496,0.008282,22.941824,-0.028283,0.012853,31.099251,-0.064447,0.023554,39.282124,-0.098533,0.030263,47.521092,-0.133894,0.037108
1724,7.416801,-0.001224,0.007200,15.665913,-0.020713,0.011577,23.899243,-0.055145,0.022928,32.062863,-0.120521,0.047047,40.125042,-0.194544,0.077168,48.065559,-0.265605,0.090351
1725,6.524221,-0.000424,-0.002032,13.669658,-0.009155,-0.004767,20.764941,-0.009719,-0.009016,27.814852,-0.011040,-0.012842,34.837266,-0.019661,-0.015308,41.825263,0.012250,-0.015824
