In [65]:
from pathlib import Path

import polars as pl
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

In [66]:
exp_name = '001_LightGBM_Baseline'

In [67]:
DATA_PATH = Path('/Users/gouyashuto/localrepository/atmacup18/input')
OUTPUT_DIR = Path('/Users/gouyashuto/localrepository/atmacup18/output')

In [68]:
train_df = pl.read_csv(DATA_PATH / 'train_features.csv')
test_df = pl.read_csv(DATA_PATH / 'test_features.csv')

print(train_df.shape)
print(test_df.shape)

(43371, 30)
(1727, 12)


In [69]:
CAT_COLS = ['gearShifter', 'scene']

TARGET_COLS = [
    'x_0', 'y_0', 'z_0',
    'x_1', 'y_1', 'z_1',
    'x_2', 'y_2', 'z_2',
    'x_3', 'y_3', 'z_3',
    'x_4', 'y_4', 'z_4',
    'x_5', 'y_5', 'z_5'
]

In [70]:
def preprocess(df):
    df = (
        df
        .with_columns(
            scene = pl.col('ID').str.split('_').list[0],
            decisecond = pl.col('ID').str.split('_').list[1].cast(pl.Int32),
        )
        .sort(['scene', 'decisecond'])
    )
    return df

In [71]:
def train(X: pd.DataFrame, y: pd.DataFrame, target: str):
    lgb_params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
    }
    gkf = GroupKFold(n_splits=5)
    groups = X['scene']

    models = []
    oof = np.zeros(len(X))
    for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y[target], groups=groups)):
        print('=' * 10, f'fold: {fold} start' + '=' * 10)
        train_X = X.iloc[train_idx].drop(columns=['scene'])
        train_y = y[target].iloc[train_idx]
        valid_X = X.iloc[valid_idx].drop(columns=['scene'])
        valid_y = y[target].iloc[valid_idx]

        train_data = lgb.Dataset(train_X, train_y)
        valid_data = lgb.Dataset(valid_X, valid_y, reference=train_data)

        model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(100)],
        )
        oof[valid_idx] = model.predict(valid_X)
        models.append(model)
    return models, oof

def predict(X: pd.DataFrame, models: list):
    preds = []
    for model in models:
        pred = model.predict(X.drop(columns=['scene']))
        preds.append(pred)
    return np.mean(preds, axis=0)

def evaluate(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    return np.mean(np.abs(y_true - y_pred))

In [72]:
train_df = preprocess(train_df)

origin_test_ids = test_df['ID'].to_pandas()
test_df = preprocess(test_df)

print(train_df.shape)
print(test_df.shape)

(43371, 32)
(1727, 14)


In [73]:
remove_columns = ['ID']
X = train_df.drop(remove_columns + TARGET_COLS).to_pandas()
y = train_df[TARGET_COLS].to_pandas()

test_X = test_df.drop(remove_columns).to_pandas()

oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[CAT_COLS] = oe.fit_transform(X[CAT_COLS])
test_X[CAT_COLS] = oe.transform(test_X[CAT_COLS])

print(X.shape)
print(test_X.shape)

(43371, 13)
(1727, 13)


In [74]:
%%time

models_dict = {}
oof = pd.DataFrame(np.zeros_like(train_df.to_pandas().loc[:, TARGET_COLS]), columns=TARGET_COLS)

preds = test_df.select(['ID']).to_pandas()
preds[TARGET_COLS] = 0.0

for target in TARGET_COLS:
    print('=' * 10, f'target: {target} start' + '=' * 10)
    partial_model, partial_oofs = train(X, y, target)
    models_dict[target] = partial_model
    oof.loc[:, target] = partial_oofs
    preds.loc[:, target] = predict(test_X, models_dict[target])

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[823]	valid_0's l1: 0.062273
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[899]	valid_0's l1: 0.0617193
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[838]	valid_0's l1: 0.0641437
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[930]	valid_0's l1: 0.0631051
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[951]	valid_0's l1: 0.0637588
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[701]	valid_0's l1: 0.0328458
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[503]	valid_0's l1: 0.0321552
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1342]	valid_0'

In [75]:
score = evaluate(y, oof)
print('score: ', score)

score:  0.23560903466107674


In [76]:
for target in ['x', 'y', 'z']:
    rolling_targets = [col for col in TARGET_COLS if col.startswith(f'{target}_')]
    fix_targets = [f'{target}_{i}' for i in range(1, 5)]
    oof.loc[:, fix_targets] = oof[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]
score = evaluate(y, oof)
print('score: ', score)

  oof.loc[:, fix_targets] = oof[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]
  oof.loc[:, fix_targets] = oof[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]
  oof.loc[:, fix_targets] = oof[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]


score:  0.2361669591610403


In [77]:
for target in ['x', 'y', 'z']:
    rolling_targets = [col for col in TARGET_COLS if col.startswith(f'{target}_')]
    fix_targets = [f'{target}_{i}' for i in range(1, 5)]
    preds.loc[:, fix_targets] = preds[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]

  preds.loc[:, fix_targets] = preds[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]
  preds.loc[:, fix_targets] = preds[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]
  preds.loc[:, fix_targets] = preds[rolling_targets].rolling(window=3, axis=1, center=True).mean().loc[:, fix_targets]


In [78]:
submission = pd.DataFrame(origin_test_ids).merge(preds, on='ID', how='left').drop(columns=['ID'])

output_path = OUTPUT_DIR / f'{exp_name}_{score:.4f}_submission.csv'
if output_path.exists():
    assert False, f'output file already exists. {output_path}'

submission.to_csv(output_path, index=False)
submission

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.483098,-0.037628,0.001439,3.066187,-0.102446,0.002809,4.618719,-0.143314,0.003503,6.155280,-0.165735,0.001932,7.659839,-0.177704,-0.001380,9.158489,-0.206351,-0.005739
1,0.935086,0.371088,-0.004857,1.728665,1.002435,-0.007055,2.416058,1.678965,-0.007187,2.976842,2.405844,-0.006315,3.415555,3.206994,-0.003123,3.799330,4.075434,0.000321
2,1.581826,0.015210,-0.002270,3.173370,-0.005523,-0.004958,4.644225,-0.037006,-0.007609,5.988637,-0.100652,-0.009817,7.172712,-0.173851,-0.013430,8.271206,-0.251401,-0.017434
3,0.848449,0.051132,-0.005602,1.664347,0.210679,-0.013297,2.395571,0.380301,-0.022924,3.034087,0.633889,-0.033495,3.606455,0.961574,-0.046010,4.172381,1.392722,-0.058532
4,0.807686,0.005897,-0.009614,1.342808,0.008139,-0.029091,1.756461,0.007437,-0.045562,2.052641,0.001235,-0.058169,2.340252,-0.007807,-0.066782,2.687371,-0.018751,-0.073950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,6.532050,0.009760,0.009523,13.796416,0.065046,0.023510,21.053453,0.139264,0.036375,28.307238,0.237640,0.047380,35.583533,0.354607,0.056772,42.883746,0.478469,0.064899
1723,6.989153,-0.006101,0.005675,14.895928,-0.034620,0.014307,22.850003,-0.072708,0.026427,30.860140,-0.121042,0.041182,38.908299,-0.175405,0.057488,46.979140,-0.231984,0.073843
1724,7.412916,-0.009427,0.002280,15.691489,-0.052201,0.003998,23.977938,-0.111109,0.006013,32.275482,-0.193369,0.009584,40.576600,-0.289366,0.017165,48.879203,-0.390984,0.028306
1725,6.515652,-0.010433,0.000348,13.633327,-0.023948,-0.002844,20.706567,-0.036666,-0.005840,27.723761,-0.047287,-0.008117,34.690517,-0.058056,-0.009753,41.629582,-0.068385,-0.011226
