In [1]:
from pathlib import Path

import polars as pl
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

In [2]:
exp_name = '001_stacking'

In [3]:
DATA_PATH = Path('/Users/gouyashuto/localrepository/atmacup18/input')
OUTPUT_DIR = Path('/Users/gouyashuto/localrepository/atmacup18/output')

In [4]:
train_df = pl.read_csv(DATA_PATH / 'train_features.csv')
test_df = pl.read_csv(DATA_PATH / 'test_features.csv')

print(train_df.shape)
print(test_df.shape)

(43371, 30)
(1727, 12)


In [5]:
CAT_COLS = ['gearShifter', 'scene']

TARGET_COLS = [
    'x_0', 'y_0', 'z_0',
    'x_1', 'y_1', 'z_1',
    'x_2', 'y_2', 'z_2',
    'x_3', 'y_3', 'z_3',
    'x_4', 'y_4', 'z_4',
    'x_5', 'y_5', 'z_5'
]

In [6]:
def get_agg_exprs(agg_cols) -> list[pl.Expr]:
    # 同一シーンから特徴量作成
    exprs = []
    exprs += [pl.col(agg_col).shift(-1).over("scene").alias(f"{agg_col}_shift-1") for agg_col in agg_cols] # 1ステップ前の時間の値
    exprs += [pl.col(agg_col).shift(1).over("scene").alias(f"{agg_col}_shift1") for agg_col in agg_cols] # 1ステップ後の時間の値
    exprs += [pl.col(agg_col).diff(-1).over("scene").alias(f"{agg_col}_diff-1") for agg_col in agg_cols] # 1ステップ前の時間の値との差分
    exprs += [pl.col(agg_col).diff(1).over("scene").alias(f"{agg_col}_diff1") for agg_col in agg_cols] # 1ステップ後の時間の値との差分
    exprs += [pl.col(agg_col).mean().over("scene").alias(f"{agg_col}_mean") for agg_col in agg_cols] # 同一シーンの平均値
    exprs += [pl.col(agg_col).std().over("scene").alias(f"{agg_col}_std") for agg_col in agg_cols] # 同一シーンの標準偏差
    exprs += [pl.col(agg_col).max().over("scene").alias(f"{agg_col}_max") for agg_col in agg_cols] # 同一シーンの最大値
    exprs += [pl.col(agg_col).min().over("scene").alias(f"{agg_col}_min") for agg_col in agg_cols] # 同一シーンの最小値
    return exprs

In [7]:
def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    agg_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'gas'] # 同一シーンから集計する値のカラム名
    df = (
        df
        .with_columns(
            scene = pl.col('ID').str.split('_').list[0],
            decisecond = pl.col('ID').str.split('_').list[1].cast(pl.Int32),
        )
        .sort(['scene', 'decisecond'])
        .with_columns(get_agg_exprs(agg_cols))
    )
    return df

In [28]:
def train(X: pd.DataFrame, y: pd.DataFrame, target: str):
    lgb_params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
    }
    gkf = GroupKFold(n_splits=5)
    groups = X['scene']

    models = []
    oof = np.zeros(len(X))
    # for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y[target], groups=groups)):
    for fold in range(5):
        print('=' * 10, f'fold: {fold} start' + '=' * 10)
        train_X = X.loc[X.fold != fold].drop(columns=['scene', 'fold'] + [col for col in PREDICT_TARGET_COLS if not col.startswith(f"predict_{target.split('_')[0]}_")])
        train_y = y[target].loc[X.fold != fold]
        valid_X = X.loc[X.fold == fold].drop(columns=['scene', 'fold'] + [col for col in PREDICT_TARGET_COLS if not col.startswith(f"predict_{target.split('_')[0]}_")])
        valid_y = y[target].loc[X.fold == fold]

        train_data = lgb.Dataset(train_X, train_y)
        valid_data = lgb.Dataset(valid_X, valid_y, reference=train_data)

        model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(100)],
        )
        oof[X.fold == fold] = model.predict(valid_X)
        models.append(model)
    return models, oof

def predict(X: pd.DataFrame, target: str, models: dict[str, list]):
    preds = []
    for model in models[target]:
        pred = model.predict(X.drop(columns=['scene'] + [col for col in PREDICT_TARGET_COLS if not col.startswith(f"predict_{target.split('_')[0]}_")]))
        preds.append(pred)
    return np.mean(preds, axis=0)

def evaluate(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    return np.mean(np.abs(y_true - y_pred))

In [10]:
train_df = preprocess(train_df)

origin_test_ids = test_df['ID'].to_pandas()
test_df = preprocess(test_df)

print(train_df.shape)
print(test_df.shape)

(43371, 72)
(1727, 54)


In [11]:
train_df = train_df.to_pandas()
test_df = test_df.to_pandas()

In [12]:
# Sceneの名前を抜き出してDataFrameに加える
# train_df["scene"] = [i.split("_")[0] for i in train_df.ID]

# =======================================
# 5 Foldの作成
# =======================================

# train/val split
# SceneでFoldを分ける
scene_list = train_df.scene.unique()
fold_map = {}

for i, scene in enumerate(scene_list):
    fold_map[scene] = i % 5

train_df["fold"] = train_df.scene.map(fold_map)
train_df.head()

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,aEgo_max,steeringAngleDeg_max,steeringTorque_max,gas_max,vEgo_min,aEgo_min,steeringAngleDeg_min,steeringTorque_min,gas_min,fold
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
3,000fb056f97572d384bae4f5fc1e0f28_20,3.316744,1.276733,-31.725477,-114.0,0.0,False,0.255,True,drive,...,1.276733,7.632668,173.0,0.255,3.316744,-0.117775,-31.725477,-133.0,0.0,1
4,000fb056f97572d384bae4f5fc1e0f28_120,6.055565,-0.117775,7.632668,173.0,0.0,False,0.0,False,drive,...,1.276733,7.632668,173.0,0.255,3.316744,-0.117775,-31.725477,-133.0,0.0,1


In [18]:
PREDICT_TARGET_COLS = []
for target in ['x', 'y', 'z']:
    for i in range(6):
        train_df[f"predict_{target}_{i}"] = 0
        test_df[f"predict_{target}_{i}"] = 0
        PREDICT_TARGET_COLS.append(f"predict_{target}_{i}")

oof_dir = Path('/Users/gouyashuto/localrepository/atmacup18/output/baseline52/oof')

for i in range(5):
    train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')

test_df[PREDICT_TARGET_COLS] = np.load(oof_dir.parent / 'test_preds.npy').mean(axis=0)

display(train_df.head())
display(test_df.head())

  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.0491333 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')
 -0.00393677]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.16625977]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  9.50622559e-03  1.07955933e-02]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, PREDICT_TARGET_COLS] = np.load(o

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,predict_y_2,predict_y_3,predict_y_4,predict_y_5,predict_z_0,predict_z_1,predict_z_2,predict_z_3,predict_z_4,predict_z_5
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,...,0.11908,13.445312,-0.180176,0.170654,17.4375,-0.27417,0.247559,21.609375,-0.366699,0.30957
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,...,-0.001527,21.59375,0.06488,-0.029175,27.09375,0.1604,-0.030014,32.5,0.220825,-0.062866
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,...,0.006664,20.96875,-0.312744,0.013977,26.421875,-0.476318,0.042175,31.921875,-0.647461,0.041534
3,000fb056f97572d384bae4f5fc1e0f28_20,3.316744,1.276733,-31.725477,-114.0,0.0,False,0.255,True,drive,...,-0.018555,11.25,0.53125,-0.030075,13.992188,0.75293,-0.018341,16.765625,1.049805,-0.028473
4,000fb056f97572d384bae4f5fc1e0f28_120,6.055565,-0.117775,7.632668,173.0,0.0,False,0.0,False,drive,...,0.030365,8.835938,-0.446533,0.0383,11.609375,-0.59668,0.058685,14.554688,-0.725586,0.096375


Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,predict_y_2,predict_y_3,predict_y_4,predict_y_5,predict_z_0,predict_z_1,predict_z_2,predict_z_3,predict_z_4,predict_z_5
0,012baccc145d400c896cb82065a93d42_120,3.374273,-0.01936,-34.008415,17.0,0.0,False,0.0,False,drive,...,0.004251,6.336719,0.040533,0.014632,7.878125,0.07298,0.019846,9.348437,0.122693,0.026746
1,012baccc145d400c896cb82065a93d42_220,2.441048,-0.022754,307.860077,295.0,0.0,True,0.0,False,drive,...,0.010564,3.069531,3.013672,0.023242,3.598438,4.29375,0.0193,4.1125,5.76875,0.0383
2,012baccc145d400c896cb82065a93d42_320,3.604152,-0.286239,10.774388,-110.0,0.0,True,0.0,False,drive,...,-0.023334,6.39375,-0.097316,-0.038749,7.826562,-0.16495,-0.051001,9.167188,-0.262469,-0.072753
3,012baccc145d400c896cb82065a93d42_420,2.048902,-0.537628,61.045235,189.0,0.0,True,0.0,False,drive,...,-0.034274,3.173828,1.094751,-0.057326,3.703516,1.642017,-0.087286,4.143359,2.27644,-0.123018
4,01d738e799d260a10f6324f78023b38f_120,2.201528,-1.8986,5.740093,-41.0,0.0,True,0.0,False,drive,...,-0.046704,2.311719,0.016077,-0.064423,2.382812,0.017225,-0.084155,2.349023,0.026129,-0.09798


In [20]:
remove_columns = ['ID']
X = train_df.drop(columns=remove_columns + TARGET_COLS)
y = train_df[TARGET_COLS]

test_X = test_df.drop(columns=remove_columns)

oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[CAT_COLS] = oe.fit_transform(X[CAT_COLS])
test_X[CAT_COLS] = oe.transform(test_X[CAT_COLS])

print(X.shape)
print(test_X.shape)

(43371, 72)
(1727, 71)


In [30]:
%%time

models_dict = {}
oof = pd.DataFrame(np.zeros_like(train_df.loc[:, TARGET_COLS]), columns=TARGET_COLS)

preds = test_df[['ID']]
preds[TARGET_COLS] = 0.0

for target in TARGET_COLS:
    print('=' * 10, f'target: {target} start' + '=' * 10)
    partial_model, partial_oofs = train(X, y, target)
    models_dict[target] = partial_model
    oof.loc[:, target] = partial_oofs
    preds.loc[:, target] = predict(test_X, target, models_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[953]	valid_0's l1: 0.0613639
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1055]	valid_0's l1: 0.0620574
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[885]	valid_0's l1: 0.0610143
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[967]	valid_0's l1: 0.0633021
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[794]	valid_0's l1: 0.0626371
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[519]	valid_0's l1: 0.0314958
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[603]	valid_0's l1: 0.033364
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[738]	valid_0'

In [31]:
for target in TARGET_COLS:
    print(f'{target} CV score: ', evaluate(y[target], oof[target]))
score = evaluate(y, oof)
print('Total CV score: ', score)

x_0 CV score:  0.062074554152148013
y_0 CV score:  0.03248471437323804
z_0 CV score:  0.02575107968912595
x_1 CV score:  0.13222770293579364
y_1 CV score:  0.07250091376744082
z_1 CV score:  0.05312136022208847
x_2 CV score:  0.22486849353513497
y_2 CV score:  0.12737784175245384
z_2 CV score:  0.08105166465372789
x_3 CV score:  0.34928510624267084
y_3 CV score:  0.20539287589766156
z_3 CV score:  0.10977209776760731
x_4 CV score:  0.5025442274530052
y_4 CV score:  0.3102912403196514
z_4 CV score:  0.13927600344347943
x_5 CV score:  0.6776599466865915
y_5 CV score:  0.4435910867405169
z_5 CV score:  0.16977204743519841
Total CV score:  0.20661349761486295


In [32]:
submission = pd.DataFrame(origin_test_ids).merge(preds, on='ID', how='left').drop(columns=['ID'])

output_path = OUTPUT_DIR / f'{exp_name}_{score:.4f}_submission.csv'
if output_path.exists():
    assert False, f'output file already exists. {output_path}'

submission.to_csv(output_path, index=False)
submission

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.455784,-0.052414,0.002809,3.018601,-0.108654,0.007316,4.483010,-0.141344,0.016730,5.885474,-0.155914,0.019541,7.367014,-0.113952,0.006709,8.693657,-0.003260,0.012242
1,0.934925,0.382405,0.001036,1.758698,1.036518,0.003828,2.402717,1.778358,0.009990,3.023882,2.484521,0.018533,3.803092,3.582791,0.023413,4.413394,4.746160,0.027725
2,1.591639,0.009090,-0.000446,3.283091,0.010032,-0.003324,4.866868,-0.005291,-0.004523,6.288420,-0.011536,-0.009494,7.696518,-0.045258,-0.019903,8.951873,-0.041451,-0.016332
3,0.820231,0.061564,-0.005490,1.641172,0.185296,-0.013182,2.424720,0.389960,-0.027412,2.975819,0.781373,-0.042406,3.629555,1.337223,-0.056979,4.451049,2.141549,-0.064447
4,0.814276,0.003094,-0.003113,1.434161,-0.012013,-0.010169,1.858006,-0.026539,-0.019155,2.235629,-0.001633,-0.024143,2.154182,0.005558,-0.042546,1.658647,0.003734,-0.053759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,6.545769,0.022153,0.005623,13.816203,0.099906,0.007764,21.071531,0.255377,-0.002726,28.340876,0.479784,-0.015406,35.592144,0.787775,-0.037673,43.125573,1.143024,-0.063932
1723,6.975885,0.004672,-0.006952,14.864170,0.003822,-0.046267,22.897432,0.001415,-0.104284,31.053862,-0.001159,-0.182103,39.254914,0.008279,-0.292423,47.534522,0.033680,-0.428858
1724,7.419897,0.001823,0.017816,15.679828,-0.008378,0.048305,23.904609,-0.020476,0.078439,32.050238,-0.062510,0.119835,40.112335,-0.111680,0.171338,48.055613,-0.153721,0.213097
1725,6.530330,0.004168,0.012173,13.685084,0.007419,0.065197,20.781985,0.035866,0.152102,27.839557,0.060909,0.285593,34.856052,0.097915,0.458288,41.893147,0.188924,0.645322
