In [1]:
from pathlib import Path

import polars as pl
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

In [15]:
# Experiment identifier; used as the prefix of the submission file name written at the end of the notebook.
exp_name = '000_LightGBM_Baseline'

In [16]:
# DATA_PATH: directory the train/test feature CSVs are read from.
# OUTPUT_DIR: directory the submission CSV is written to.
# NOTE(review): both are absolute, machine-specific paths; the notebook will not run elsewhere
# without editing them. Consider making them configurable or relative to the repository root.
DATA_PATH = Path('/Users/gouyashuto/localrepository/atmacup18/input')
OUTPUT_DIR = Path('/Users/gouyashuto/localrepository/atmacup18/output')

In [3]:
# Read the train / test feature tables from DATA_PATH as polars DataFrames.
train_df = pl.read_csv(DATA_PATH / 'train_features.csv')
test_df = pl.read_csv(DATA_PATH / 'test_features.csv')

# Show (rows, columns) for each table, one tuple per line.
print(train_df.shape, test_df.shape, sep='\n')

(43371, 30)
(1727, 12)


In [4]:
# Categorical columns; these are ordinal-encoded before the feature frames are used.
CAT_COLS = ['gearShifter', 'scene']

# Target columns: x/y/z for each index 0..5, ordered index-major
# (x_0, y_0, z_0, x_1, y_1, z_1, ..., x_5, y_5, z_5).
TARGET_COLS = [
    f'{axis}_{idx}'
    for idx in range(6)
    for axis in ('x', 'y', 'z')
]

In [5]:
def preprocess(df):
    """Add `scene` and `decisecond` columns derived from `ID`, then sort by them.

    `ID` is split on '_': the first piece becomes `scene`, the second piece is
    cast to Int32 and becomes `decisecond`. The returned frame is ordered by
    (`scene`, `decisecond`), so its row order can differ from the input's.
    """
    id_parts = pl.col('ID').str.split('_')
    with_keys = df.with_columns(
        scene = id_parts.list[0],
        decisecond = id_parts.list[1].cast(pl.Int32),
    )
    return with_keys.sort(['scene', 'decisecond'])

In [None]:
def train(
    X: pd.DataFrame,
    y: pd.DataFrame,
    target: str,
    n_splits: int = 5,
    num_boost_round: int = 10000,
    early_stopping_rounds: int = 100,
    lgb_params=None,
):
    """Fit one LightGBM regressor per GroupKFold fold for a single target column.

    Rows are grouped by `X['scene']`, so all rows sharing a `scene` value land in
    the same fold. The `scene` column is used only for grouping and is removed
    from the features before fitting.

    Args:
        X: Feature frame; must contain a `scene` column.
        y: Target frame; `y[target]` is the label that is fitted.
        target: Name of the column of `y` to model.
        n_splits: Number of GroupKFold folds.
        num_boost_round: Upper bound on boosting rounds per fold.
        early_stopping_rounds: Patience passed to `lgb.early_stopping`.
        lgb_params: Optional dict of LightGBM parameters; entries override the
            defaults defined below.

    Returns:
        (models, oof): the list of fitted boosters (one per fold) and a 1-D
        array of length `len(X)` holding each row's prediction from the fold
        in which that row was in the validation split.
    """
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'learning_rate': 0.01,
        'verbosity': -1,
        'boosting_type': 'gbdt',
    }
    if lgb_params:
        params.update(lgb_params)

    # Loop-invariant pieces: grouping key, features without the key, label series.
    groups = X['scene']
    features = X.drop(columns=['scene'])
    labels = y[target]

    gkf = GroupKFold(n_splits=n_splits)

    models = []
    oof = np.zeros(len(X))
    for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, labels, groups=groups)):
        print('=' * 10, f'fold: {fold} start' + '=' * 10)
        # train_idx / valid_idx are positional indices, hence .iloc.
        train_X, train_y = features.iloc[train_idx], labels.iloc[train_idx]
        valid_X, valid_y = features.iloc[valid_idx], labels.iloc[valid_idx]

        train_data = lgb.Dataset(train_X, train_y)
        valid_data = lgb.Dataset(valid_X, valid_y, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=num_boost_round,
            callbacks=[lgb.early_stopping(early_stopping_rounds)],
        )
        oof[valid_idx] = model.predict(valid_X)
        models.append(model)
    return models, oof

def predict(X: pd.DataFrame, models: list):
    """Average the predictions of every model in `models` on `X`.

    The `scene` column is removed from `X` before each model is called; `X`
    itself is not modified. The result is the element-wise mean (axis 0) of
    the per-model prediction arrays.
    """
    per_model = [booster.predict(X.drop(columns=['scene'])) for booster in models]
    return np.mean(per_model, axis=0)

def evaluate(y_true: pd.DataFrame, y_pred: pd.DataFrame):
    """Return the mean absolute error over every cell as a plain float.

    The subtraction is done on the inputs as given, so pandas objects are
    aligned on index/columns first. The absolute errors are then flattened to
    a NumPy array and averaged over all cells, ignoring NaN entries.

    Reducing via NumPy (rather than calling np.mean on a DataFrame) makes the
    result a single scalar regardless of the installed pandas version, so it
    can always be formatted with e.g. f'{score:.4f}'.
    """
    abs_err = np.abs(y_true - y_pred)
    return float(np.nanmean(np.asarray(abs_err, dtype=float)))

In [None]:
# Capture the test IDs in their original file order *before* preprocess() re-sorts the rows;
# the submission cell relies on this to restore the original order.
origin_test_ids = test_df['ID'].to_pandas()

train_df = preprocess(train_df)
test_df = preprocess(test_df)

# Each frame gains the two derived columns (scene, decisecond).
print(train_df.shape, test_df.shape, sep='\n')

(43371, 32)
(1727, 14)


In [8]:
# Columns that are never fed to the model.
remove_columns = ['ID']
# Training features as pandas: everything except ID and the targets.
# `scene` is kept here because train() reads it as the GroupKFold grouping key.
X = train_df.drop(remove_columns + TARGET_COLS).to_pandas()
# Training targets, one column per entry of TARGET_COLS.
y = train_df[TARGET_COLS].to_pandas()

# Test features: only ID is dropped (the printed shapes show X and test_X end up with the same 13 columns).
test_X = test_df.drop(remove_columns).to_pandas()

# Ordinal-encode the categorical columns. The encoder is fitted on train only;
# categories not seen during fit are encoded as -1 when transforming test.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[CAT_COLS] = oe.fit_transform(X[CAT_COLS])
test_X[CAT_COLS] = oe.transform(test_X[CAT_COLS])

print(X.shape)
print(test_X.shape)

(43371, 13)
(1727, 13)


In [None]:
%%time

models_dict = {}
oof = pd.DataFrame(np.zeros_like(train_df.to_pandas().loc[:, TARGET_COLS]), columns=TARGET_COLS)

preds = test_df.select(['ID']).to_pandas()
preds[TARGET_COLS] = 0.0

for target in TARGET_COLS:
    partial_model, partial_oofs = train(X, y, target)
    models_dict[target] = partial_model
    oof.loc[:, target] = partial_oofs
    preds.loc[:, target] = predict(test_X, models_dict[target])

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[823]	valid_0's l1: 0.062273
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[899]	valid_0's l1: 0.0617193
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[838]	valid_0's l1: 0.0641437
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[930]	valid_0's l1: 0.0631051
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[951]	valid_0's l1: 0.0637588


  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[701]	valid_0's l1: 0.0328458
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[503]	valid_0's l1: 0.0321552
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1342]	valid_0's l1: 0.0332671
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[725]	valid_0's l1: 0.0324673
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[584]	valid_0's l1: 0.0338363


 -0.26226476]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[186]	valid_0's l1: 0.025886
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[157]	valid_0's l1: 0.0255933
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[140]	valid_0's l1: 0.0265234
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's l1: 0.0266375
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[157]	valid_0's l1: 0.0258953
Training until validation scores don't improve for 100 rounds


  0.00230236]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Early stopping, best iteration is:
[1077]	valid_0's l1: 0.133798
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1202]	valid_0's l1: 0.135923
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[953]	valid_0's l1: 0.137433
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1123]	valid_0's l1: 0.135908
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1015]	valid_0's l1: 0.139197


 12.34634783]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1217]	valid_0's l1: 0.0738015
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[918]	valid_0's l1: 0.0725206
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1481]	valid_0's l1: 0.0749747
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1137]	valid_0's l1: 0.0737289
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1518]	valid_0's l1: 0.0759161


 -2.53302983e-02 -1.04648143e+00]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[214]	valid_0's l1: 0.0538536
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[167]	valid_0's l1: 0.0530817
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[292]	valid_0's l1: 0.0548321
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[246]	valid_0's l1: 0.0556545
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[137]	valid_0's l1: 0.0541282
Training until validation scores don't improve for 100 rounds


  0.00174012]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Early stopping, best iteration is:
[1225]	valid_0's l1: 0.236013
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1163]	valid_0's l1: 0.240886
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1162]	valid_0's l1: 0.24018
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1080]	valid_0's l1: 0.238516
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1636]	valid_0's l1: 0.243707


 18.92785425]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1401]	valid_0's l1: 0.130643
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1809]	valid_0's l1: 0.130903
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1672]	valid_0's l1: 0.133246
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1940]	valid_0's l1: 0.131807
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1949]	valid_0's l1: 0.135624


 -2.31021473]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[276]	valid_0's l1: 0.0829358
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[232]	valid_0's l1: 0.0816756
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[350]	valid_0's l1: 0.0842818
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[317]	valid_0's l1: 0.0863294
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[166]	valid_0's l1: 0.0834996
Training until validation scores don't improve for 100 rounds


 -0.01057301]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Early stopping, best iteration is:
[1334]	valid_0's l1: 0.387776
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1140]	valid_0's l1: 0.398178
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1227]	valid_0's l1: 0.393824
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1690]	valid_0's l1: 0.389379
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1436]	valid_0's l1: 0.399168


 25.53528076]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1705]	valid_0's l1: 0.215081
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1489]	valid_0's l1: 0.22065
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2156]	valid_0's l1: 0.220773
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2196]	valid_0's l1: 0.217298
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2187]	valid_0's l1: 0.225527


 -3.89871598]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[307]	valid_0's l1: 0.113166
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[279]	valid_0's l1: 0.111733
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[289]	valid_0's l1: 0.115108
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[228]	valid_0's l1: 0.117838
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[256]	valid_0's l1: 0.114685
Training until validation scores don't improve for 100 rounds


 -0.04213585]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Early stopping, best iteration is:
[1140]	valid_0's l1: 0.598432
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1339]	valid_0's l1: 0.616804
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1617]	valid_0's l1: 0.609888
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1453]	valid_0's l1: 0.602955
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1329]	valid_0's l1: 0.61716


 32.1096969 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1863]	valid_0's l1: 0.333173
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2245]	valid_0's l1: 0.344661
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1432]	valid_0's l1: 0.344161
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2621]	valid_0's l1: 0.335905
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2459]	valid_0's l1: 0.353716


 -5.72855611]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[372]	valid_0's l1: 0.144888
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[343]	valid_0's l1: 0.143368
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[378]	valid_0's l1: 0.14682
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[262]	valid_0's l1: 0.150693
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[277]	valid_0's l1: 0.14658
Training until validation scores don't improve for 100 rounds


 -0.10300619]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Early stopping, best iteration is:
[1213]	valid_0's l1: 0.875111
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1270]	valid_0's l1: 0.902381
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1488]	valid_0's l1: 0.890718
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1248]	valid_0's l1: 0.884178
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1428]	valid_0's l1: 0.897833


 38.65951898]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1623]	valid_0's l1: 0.486914
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1761]	valid_0's l1: 0.507105
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1591]	valid_0's l1: 0.504832
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2225]	valid_0's l1: 0.492796
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2604]	valid_0's l1: 0.519211


 -7.81367377]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[325]	valid_0's l1: 0.178323
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[265]	valid_0's l1: 0.176518
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[417]	valid_0's l1: 0.180154
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[248]	valid_0's l1: 0.184379
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[347]	valid_0's l1: 0.17988
score:  0.23560903466107674


 -0.20651298]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  preds.loc[:, target] = predict(test_X, models_dict[target])


In [11]:
# Overall CV score: mean absolute error between the targets and the out-of-fold
# predictions, taken over all target columns at once. Also used in the submission file name.
score = evaluate(y, oof)
print('score: ', score)

score:  0.23560903466107674


In [None]:
# Put the predictions back into the original test-file order: origin_test_ids holds the IDs
# as they appeared before preprocess() sorted the rows, and a left merge keeps that order.
# pd.merge() is used instead of the .merge method because origin_test_ids is a pandas
# Series, which has no .merge method; pd.merge accepts a named Series as its left operand.
submission = pd.merge(origin_test_ids, preds, on='ID', how='left').drop(columns=['ID'])

output_path = OUTPUT_DIR / f'{exp_name}_{score:.4f}_submission.csv'
# Never overwrite an existing submission. Raise explicitly rather than `assert False`,
# which is skipped when Python runs with assertions disabled.
if output_path.exists():
    raise FileExistsError(f'output file already exists. {output_path}')

# Make sure the target directory exists before writing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)
submission

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.483098,-0.037628,0.001439,3.070651,-0.099265,0.002788,4.644811,-0.170444,0.004198,6.140696,-0.160234,0.003523,7.680331,-0.166526,-0.001924,9.158489,-0.206351,-0.005739
1,0.935086,0.371088,-0.004857,1.767716,0.964232,-0.007054,2.483191,1.671986,-0.009255,2.997268,2.400676,-0.005252,3.450068,3.144871,-0.004438,3.799330,4.075434,0.000321
2,1.581826,0.015210,-0.002270,3.219301,0.000026,-0.006010,4.718982,-0.031805,-0.006594,5.994392,-0.079238,-0.010221,7.252539,-0.190913,-0.012636,8.271206,-0.251401,-0.017434
3,0.848449,0.051132,-0.005602,1.689315,0.171241,-0.013302,2.455276,0.409664,-0.020986,3.042121,0.559997,-0.034485,3.604864,0.932005,-0.045013,4.172381,1.392722,-0.058532
4,0.807686,0.005897,-0.009614,1.396199,0.010147,-0.029545,1.824539,0.008374,-0.048114,2.048646,0.003790,-0.059028,2.284738,-0.008458,-0.067366,2.687371,-0.018751,-0.073950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,6.532050,0.009760,0.009523,13.802336,0.057810,0.024284,21.054861,0.127567,0.036722,28.303162,0.232416,0.048121,35.563692,0.352936,0.057296,42.883746,0.478469,0.064899
1723,6.989153,-0.006101,0.005675,14.863966,-0.028863,0.012322,22.834665,-0.068895,0.024924,30.851378,-0.120367,0.042035,38.894378,-0.173865,0.056587,46.979140,-0.231984,0.073843
1724,7.412916,-0.009427,0.002280,15.685702,-0.044183,0.004153,23.975849,-0.102993,0.005562,32.272264,-0.186150,0.008323,40.578334,-0.290964,0.014866,48.879203,-0.390984,0.028306
1725,6.515652,-0.010433,0.000348,13.655015,-0.025330,-0.002561,20.729315,-0.036080,-0.006318,27.735371,-0.048588,-0.008640,34.706597,-0.057194,-0.009394,41.629582,-0.068385,-0.011226
