In [1]:
from pathlib import Path

import polars as pl
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

In [2]:
exp_name = '002_stacking'

In [3]:
DATA_PATH = Path('/Users/gouyashuto/localrepository/atmacup18/input')
OUTPUT_DIR = Path('/Users/gouyashuto/localrepository/atmacup18/output')

In [4]:
train_df = pl.read_csv(DATA_PATH / 'train_features.csv')
test_df = pl.read_csv(DATA_PATH / 'test_features.csv')

print(train_df.shape)
print(test_df.shape)

(43371, 30)
(1727, 12)


In [5]:
CAT_COLS = ['gearShifter', 'scene']

TARGET_COLS = [
    'x_0', 'y_0', 'z_0',
    'x_1', 'y_1', 'z_1',
    'x_2', 'y_2', 'z_2',
    'x_3', 'y_3', 'z_3',
    'x_4', 'y_4', 'z_4',
    'x_5', 'y_5', 'z_5'
]

In [6]:
def get_agg_exprs(agg_cols) -> list[pl.Expr]:
    # 同一シーンから特徴量作成
    exprs = []
    exprs += [pl.col(agg_col).shift(-1).over("scene").alias(f"{agg_col}_shift-1") for agg_col in agg_cols] # 1ステップ前の時間の値
    exprs += [pl.col(agg_col).shift(1).over("scene").alias(f"{agg_col}_shift1") for agg_col in agg_cols] # 1ステップ後の時間の値
    exprs += [pl.col(agg_col).diff(-1).over("scene").alias(f"{agg_col}_diff-1") for agg_col in agg_cols] # 1ステップ前の時間の値との差分
    exprs += [pl.col(agg_col).diff(1).over("scene").alias(f"{agg_col}_diff1") for agg_col in agg_cols] # 1ステップ後の時間の値との差分
    exprs += [pl.col(agg_col).mean().over("scene").alias(f"{agg_col}_mean") for agg_col in agg_cols] # 同一シーンの平均値
    exprs += [pl.col(agg_col).std().over("scene").alias(f"{agg_col}_std") for agg_col in agg_cols] # 同一シーンの標準偏差
    exprs += [pl.col(agg_col).max().over("scene").alias(f"{agg_col}_max") for agg_col in agg_cols] # 同一シーンの最大値
    exprs += [pl.col(agg_col).min().over("scene").alias(f"{agg_col}_min") for agg_col in agg_cols] # 同一シーンの最小値
    return exprs

In [7]:
def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    agg_cols = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'gas'] # 同一シーンから集計する値のカラム名
    df = (
        df
        .with_columns(
            scene = pl.col('ID').str.split('_').list[0],
            decisecond = pl.col('ID').str.split('_').list[1].cast(pl.Int32),
        )
        .sort(['scene', 'decisecond'])
        .with_columns(get_agg_exprs(agg_cols))
    )
    return df

In [8]:
def train(X: pd.DataFrame, y: pd.DataFrame, target: str):
    lgb_params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
    }
    gkf = GroupKFold(n_splits=5)
    groups = X['scene']

    models = []
    oof = np.zeros(len(X))
    # for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y[target], groups=groups)):
    for fold in range(5):
        print('=' * 10, f'fold: {fold} start' + '=' * 10)
        train_X = X.loc[X.fold != fold].drop(columns=['scene', 'fold'])
        train_y = y[target].loc[X.fold != fold]
        valid_X = X.loc[X.fold == fold].drop(columns=['scene', 'fold'])
        valid_y = y[target].loc[X.fold == fold]

        train_data = lgb.Dataset(train_X, train_y)
        valid_data = lgb.Dataset(valid_X, valid_y, reference=train_data)

        model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(100)],
        )
        oof[X.fold == fold] = model.predict(valid_X)
        models.append(model)
    return models, oof

def predict(X: pd.DataFrame, models: list):
    preds = []
    for model in models:
        pred = model.predict(X.drop(columns=['scene']))
        preds.append(pred)
    return np.mean(preds, axis=0)

def evaluate(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    return np.mean(np.abs(y_true - y_pred))

In [9]:
train_df = preprocess(train_df)

origin_test_ids = test_df['ID'].to_pandas()
test_df = preprocess(test_df)

print(train_df.shape)
print(test_df.shape)

(43371, 72)
(1727, 54)


In [10]:
train_df = train_df.to_pandas()
test_df = test_df.to_pandas()

In [11]:
# Sceneの名前を抜き出してDataFrameに加える
# train_df["scene"] = [i.split("_")[0] for i in train_df.ID]

# =======================================
# 5 Foldの作成
# =======================================

# train/val split
# SceneでFoldを分ける
scene_list = train_df.scene.unique()
fold_map = {}

for i, scene in enumerate(scene_list):
    fold_map[scene] = i % 5

train_df["fold"] = train_df.scene.map(fold_map)
train_df.head()

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,aEgo_max,steeringAngleDeg_max,steeringTorque_max,gas_max,vEgo_min,aEgo_min,steeringAngleDeg_min,steeringTorque_min,gas_min,fold
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,...,1.538456,-2.165777,-44.0,0.25,5.701526,0.231099,-11.625697,-139.0,0.0,0
3,000fb056f97572d384bae4f5fc1e0f28_20,3.316744,1.276733,-31.725477,-114.0,0.0,False,0.255,True,drive,...,1.276733,7.632668,173.0,0.255,3.316744,-0.117775,-31.725477,-133.0,0.0,1
4,000fb056f97572d384bae4f5fc1e0f28_120,6.055565,-0.117775,7.632668,173.0,0.0,False,0.0,False,drive,...,1.276733,7.632668,173.0,0.255,3.316744,-0.117775,-31.725477,-133.0,0.0,1


In [12]:
predict_target_cols = []
for target in ['x', 'y', 'z']:
    for i in range(6):
        train_df[f"predict_{target}_{i}"] = 0
        test_df[f"predict_{target}_{i}"] = 0
        predict_target_cols.append(f"predict_{target}_{i}")

oof_dir = Path('/Users/gouyashuto/localrepository/atmacup18/output/baseline54/oof')

for i in range(5):
    train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')

test_df[predict_target_cols] = np.load(oof_dir.parent / 'test_preds.npy').mean(axis=0)

display(train_df.head())
display(test_df.head())

  train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.06469727]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.00262451]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  8.5859375 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.24536133]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train_df.loc[train_df.fold == i, predict_target_cols] = np.load(oof_dir / f'fold{i}_val_pred.npy')
  0.01068878]' has dtype incompatible with int64, please explicitly cast to a compatibl

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,predict_y_2,predict_y_3,predict_y_4,predict_y_5,predict_z_0,predict_z_1,predict_z_2,predict_z_3,predict_z_4,predict_z_5
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,...,0.068604,13.40625,-0.106689,0.10083,17.359375,-0.148926,0.143799,21.453125,-0.210938,0.196533
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,...,-0.032318,21.359375,-0.055603,-0.030151,26.75,-0.092957,-0.044159,32.03125,-0.11908,-0.042694
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,...,0.029999,21.0,-0.316406,0.043518,26.359375,-0.461426,0.051147,31.796875,-0.666992,0.071838
3,000fb056f97572d384bae4f5fc1e0f28_20,3.316744,1.276733,-31.725477,-114.0,0.0,False,0.255,True,drive,...,-0.001828,11.46875,0.490967,-0.00396,14.25,0.70166,-0.001144,17.0625,0.941406,-0.007965
4,000fb056f97572d384bae4f5fc1e0f28_120,6.055565,-0.117775,7.632668,173.0,0.0,False,0.0,False,drive,...,0.047394,8.835938,-0.600586,0.087708,11.6875,-0.837891,0.123413,14.71875,-1.080078,0.146484


Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,...,predict_y_2,predict_y_3,predict_y_4,predict_y_5,predict_z_0,predict_z_1,predict_z_2,predict_z_3,predict_z_4,predict_z_5
0,012baccc145d400c896cb82065a93d42_120,3.374273,-0.01936,-34.008415,17.0,0.0,False,0.0,False,drive,...,-0.000943,6.353906,-0.014349,0.003911,7.903906,-0.018317,-0.004318,9.403125,0.010196,-0.011084
1,012baccc145d400c896cb82065a93d42_220,2.441048,-0.022754,307.860077,295.0,0.0,True,0.0,False,drive,...,-0.008749,3.152344,2.930469,-0.010068,3.711328,4.241016,-0.004283,4.189844,5.66875,-0.003876
2,012baccc145d400c896cb82065a93d42_320,3.604152,-0.286239,10.774388,-110.0,0.0,True,0.0,False,drive,...,-0.012073,6.279687,-0.085297,-0.018196,7.651562,-0.157382,-0.032669,8.921875,-0.220819,-0.049052
3,012baccc145d400c896cb82065a93d42_420,2.048902,-0.537628,61.045235,189.0,0.0,True,0.0,False,drive,...,0.00267,3.082422,0.855371,-0.010119,3.558203,1.234082,-0.017709,3.958203,1.660352,-0.017847
4,01d738e799d260a10f6324f78023b38f_120,2.201528,-1.8986,5.740093,-41.0,0.0,True,0.0,False,drive,...,-0.035065,2.25,0.018497,-0.052002,2.322266,0.019189,-0.074371,2.237109,0.025281,-0.075854


In [13]:
remove_columns = ['ID']
X = train_df.drop(columns=remove_columns + TARGET_COLS)
y = train_df[TARGET_COLS]

test_X = test_df.drop(columns=remove_columns)

oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[CAT_COLS] = oe.fit_transform(X[CAT_COLS])
test_X[CAT_COLS] = oe.transform(test_X[CAT_COLS])

print(X.shape)
print(test_X.shape)

(43371, 72)
(1727, 71)


In [14]:
%%time

models_dict = {}
oof = pd.DataFrame(np.zeros_like(train_df.loc[:, TARGET_COLS]), columns=TARGET_COLS)

preds = test_df[['ID']]
preds[TARGET_COLS] = 0.0

for target in TARGET_COLS:
    print('=' * 10, f'target: {target} start' + '=' * 10)
    partial_model, partial_oofs = train(X, y, target)
    models_dict[target] = partial_model
    oof.loc[:, target] = partial_oofs
    preds.loc[:, target] = predict(test_X, models_dict[target])

Training until validation scores don't improve for 100 rounds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Early stopping, best iteration is:
[1252]	valid_0's l1: 0.0614224
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1000]	valid_0's l1: 0.0621864
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1007]	valid_0's l1: 0.0609351
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[895]	valid_0's l1: 0.0634526
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[820]	valid_0's l1: 0.0628582
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[494]	valid_0's l1: 0.0316008
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[474]	valid_0's l1: 0.0333271
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[445]	valid_0's l1: 0.03266
Training until validation scores don't improv

In [15]:
for target in TARGET_COLS:
    print(f'{target} CV score: ', evaluate(y[target], oof[target]))
score = evaluate(y, oof)
print('Total CV score: ', score)

x_0 CV score:  0.06217037856447635
y_0 CV score:  0.0324585178364432
z_0 CV score:  0.025760442318709362
x_1 CV score:  0.13245106719065416
y_1 CV score:  0.07260173964934723
z_1 CV score:  0.05324038719709783
x_2 CV score:  0.22502233678350134
y_2 CV score:  0.12737605980149372
z_2 CV score:  0.08132268089846585
x_3 CV score:  0.34863587687772374
y_3 CV score:  0.20464932762347157
z_3 CV score:  0.11013090456574288
x_4 CV score:  0.5011708589925226
y_4 CV score:  0.3083421540223798
z_4 CV score:  0.13953710773846428
x_5 CV score:  0.6779857974184866
y_5 CV score:  0.4401948075123917
z_5 CV score:  0.17015409281225097
Total CV score:  0.20628914098909015


In [16]:
submission = pd.DataFrame(origin_test_ids).merge(preds, on='ID', how='left').drop(columns=['ID'])

output_path = OUTPUT_DIR / f'{exp_name}_{score:.4f}_submission.csv'
if output_path.exists():
    assert False, f'output file already exists. {output_path}'

submission.to_csv(output_path, index=False)
submission

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.463991,-0.049669,0.002574,3.024819,-0.103862,0.002377,4.549850,-0.115094,-0.000744,5.933706,-0.153886,-0.000478,7.405761,-0.063780,-0.004364,8.763233,-0.061580,-0.007451
1,0.944088,0.378684,-0.000358,1.768910,0.998596,-0.000870,2.410631,1.677103,0.000880,2.998322,2.490040,-0.001049,3.662633,3.327970,0.002954,4.266144,4.585862,0.004271
2,1.591044,0.013495,0.002318,3.318072,0.017580,0.002774,4.932899,0.008146,0.001528,6.350708,0.041607,0.001049,7.819269,-0.006925,-0.004068,9.307925,-0.072504,-0.010551
3,0.822369,0.065218,0.000896,1.631042,0.211284,-0.000766,2.378557,0.440283,-0.004066,2.998722,0.760857,-0.002977,3.582831,1.288276,-0.001650,4.188897,1.783152,-0.005913
4,0.825304,0.002796,-0.003021,1.414790,-0.011968,-0.007510,1.935306,-0.012879,-0.016259,2.300793,-0.005911,-0.022407,2.151273,0.003463,-0.047799,1.657511,0.007600,-0.053745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,6.544531,0.026155,0.013026,13.822683,0.104220,0.023885,21.092918,0.263541,0.026514,28.394687,0.508450,0.033328,35.687118,0.845035,0.024753,43.249729,1.242502,0.019806
1723,6.980958,0.003513,-0.007516,14.877855,-0.006219,-0.042671,22.899712,0.003107,-0.102094,31.006377,-0.017649,-0.183467,39.189364,-0.059877,-0.276454,47.422392,-0.063989,-0.382804
1724,7.419286,0.001503,0.010735,15.684197,-0.011242,0.026853,23.913254,-0.015924,0.045822,32.052825,-0.034763,0.072825,40.114396,-0.088948,0.097638,48.051192,-0.112241,0.117564
1725,6.538223,0.003831,0.006158,13.697659,0.009413,0.040417,20.787937,0.034394,0.117537,27.832282,0.073148,0.232250,34.737154,0.111879,0.373379,41.765041,0.195893,0.559160
