In [None]:
import os
import sys
from pathlib import Path

In [None]:
# Project HOME directory (change this to match your environment).
if "google.colab" not in sys.modules:
    # Anywhere other than Google Colaboratory: HOME is the parent directory.
    HOME = Path("..")
else:
    # Running on Google Colaboratory: the project lives on Google Drive.
    HOME = Path("/content/drive/MyDrive/signate/NEDOG")

    # Mount Google Drive only when the mount point does not exist yet.
    if not os.path.exists("/content/drive"):
        from google.colab import drive
        drive.mount("/content/drive")

# INPUT / WORKING directories, both located directly under HOME.
INPUT, WORKING = HOME / "input", HOME / "working"

In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [None]:
def seed_everything(seed: int):
    """Seed the Python and NumPy global random generators with ``seed``.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable.
    Python reads that variable at interpreter start-up, so this only matters
    for processes launched afterwards.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# データ読み取り

In [None]:
# Load the preprocessed frames from WORKING, in the order:
# training EMG, training velocity, test EMG.
tr_emg_df, tr_vel_df, ts_emg_df = (
    pd.read_pickle(WORKING / pickle_name)
    for pickle_name in (
        'prep2_tr_emg.pickle',
        'prep2_tr_vel.pickle',
        'prep2_ts_emg.pickle',
    )
)

In [None]:
# Flag used to tell uphill and downhill slope trials apart
# (the threshold is set to -10 so that the two groups are balanced).
#
# Drop a 'dir' column left over from an earlier run of this cell. Without this,
# re-running the cell merges a second 'dir' column, pandas renames the pair to
# 'dir_x'/'dir_y', and every later tr_vel_df['dir'] lookup raises KeyError.
tr_vel_df = tr_vel_df.drop(columns='dir', errors='ignore')

# One boolean per (subject, trial): True when the summed vel_x exceeds -10.
tr_dir_df = tr_vel_df.groupby(['subject','trial'])['vel_x'].apply(
    lambda x: x.sum() > -10
).rename('dir').reset_index()

# Broadcast the per-trial flag back onto every row of that trial.
tr_vel_df = tr_vel_df.merge(tr_dir_df, on=['subject','trial'])

# 学習/推論用

In [None]:
# LightGBM parameters shared by every model trained in this notebook.
# 'random_state' is only the initial value; train_predict overrides it per loop.
params = dict(
    objective='regression',
    metric='rmse',
    verbosity=-1,
    random_state=41,
    learning_rate=0.05,
    boosting_type='gbdt',

    # Alternative parameter set, kept for reference (not used):
    #   lambda_l1=0.5103247316877605, lambda_l2=0.015010593003038865,
    #   num_leaves=58, feature_fraction=0.474875745099162,
    #   bagging_fraction=0.8775197693430815, bagging_freq=9, min_child_samples=22
    lambda_l1=1.600428398027849,
    lambda_l2=0.00011273908593669964,
    num_leaves=152,
    feature_fraction=0.3711806552896912,
    bagging_fraction=0.9654375223160317,
    bagging_freq=7,
    min_child_samples=171,
)

In [None]:
# Train and predict with a single holdout split
def train_holdout(x_train, y_train, x_valid, y_valid, params):
    """Fit one LightGBM model on the training split and predict the validation split.

    Training runs for at most 10000 boosting rounds and stops early once the
    validation metric has not improved for 100 rounds.

    Returns:
        (pred, model): predictions for ``x_valid`` and the trained booster.
    """
    train_set = lgb.Dataset(data=x_train, label=y_train)
    valid_set = lgb.Dataset(data=x_valid, label=y_valid)
    stopper = lgb.early_stopping(stopping_rounds=100, verbose=False)

    booster = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        valid_sets=[valid_set],
        callbacks=[stopper],
    )
    return booster.predict(x_valid), booster

In [None]:
# Train and predict with cross validation (extra train-only data can be supplied)
def train_cv(x_cross, y_cross, x_train, y_train, group, params, n=5):
    """Run grouped K-fold training and return out-of-fold predictions.

    ``x_cross``/``y_cross`` are split into ``n`` folds with ``GroupKFold``
    using ``group``. ``x_train``/``y_train`` are appended to the training part
    of every fold and are never used for validation.

    Returns:
        (oof, models): an array of length ``len(y_cross)`` holding each row's
        prediction from the fold where it was validated, and the list of
        models, one per fold.
    """
    splitter = GroupKFold(n_splits=n)
    oof = np.zeros(len(y_cross), dtype=np.float64)
    models = []

    folds = splitter.split(x_cross, y_cross, group)
    for fit_idx, val_idx in tqdm(folds, total=n):
        # Training part of this fold plus the always-included extra rows.
        fold_x = pd.concat([x_cross.iloc[fit_idx], x_train], axis=0)
        fold_y = pd.concat([y_cross.iloc[fit_idx], y_train], axis=0)

        fold_pred, fold_model = train_holdout(
            fold_x, fold_y,
            x_cross.iloc[val_idx], y_cross.iloc[val_idx],
            params,
        )
        oof[val_idx] = fold_pred
        models.append(fold_model)
    return oof, models

In [None]:
# Train/predict with CV run separately for uphill and downhill slope trials
def train_predict(tr_emg_df, tr_vel_df, ts_emg_df, target_col, drop_cols, params, n_loop=1):
    """Train models for one target column and return averaged predictions.

    Each of the ``n_loop`` repetitions uses seed ``41 + i`` and trains:

    * grouped CV (grouped by ``trial``) on rows of subjects other than 5 and 6
      whose ``dir`` flag is True, then the same for rows whose flag is False;
      all remaining rows are added to every fold as train-only data;
    * a holdout model validated on subject 5 rows (with the ``subject``
      feature replaced by 6) and trained on all other rows;
    * a holdout model validated on subject 6 rows (with the ``subject``
      feature replaced by 5) and trained on all other rows.

    Test predictions are the mean over every model of a repetition, and both
    train and test predictions are averaged over the repetitions. The RMSE of
    each repetition over all training rows is printed.

    Args:
        tr_emg_df: training features; must contain ``subject`` and ``trial``.
        tr_vel_df: training targets; must contain ``dir`` and ``target_col``.
        ts_emg_df: test features with the same feature columns.
        target_col: name of the target column in ``tr_vel_df``.
        drop_cols: columns of ``tr_emg_df`` excluded from the features.
        params: LightGBM parameters; ``random_state`` is overridden per
            repetition on an internal copy, the caller's dict is untouched.
        n_loop: number of seed repetitions to average.

    Returns:
        (tr_pred, ts_pred, models): averaged out-of-fold/holdout predictions
        for the training rows, averaged test predictions, and every trained
        model.
    """
    # Private copy: the per-loop seed override must not leak into the caller's dict.
    params = dict(params)

    fd = tr_vel_df['dir']
    f5 = tr_emg_df['subject'] == 5
    f6 = tr_emg_df['subject'] == 6
    # NOTE(review): masks built from both frames are combined and applied to
    # either frame, which assumes tr_emg_df and tr_vel_df share the same index
    # and row order -- confirm against the preprocessing step.

    train_cols = tr_emg_df.columns[~tr_emg_df.columns.isin(drop_cols)]
    # Loop-invariant views of the features/targets.
    x_all = tr_emg_df[train_cols]
    y_all = tr_vel_df[target_col]
    x_test = ts_emg_df[train_cols]

    tr_pred = 0
    ts_pred = 0
    models = []
    for i in tqdm(range(n_loop)):
        seed_everything(41+i)
        params['random_state'] = 41+i

        tr_pred_sub = np.zeros(len(tr_vel_df), dtype=np.float64)
        models_sub = []

        # Cross validation: first the dir==True rows (uphill), then the
        # dir==False rows (downhill), both excluding subjects 5 and 6.
        for f in (fd & ~f5 & ~f6, ~fd & ~f5 & ~f6):
            pred, cv_models = train_cv(
                x_all.loc[f], y_all.loc[f],      # rows that are split into folds
                x_all.loc[~f], y_all.loc[~f],    # rows used for training only
                tr_emg_df.loc[f, 'trial'], params
            )
            tr_pred_sub[f] = pred
            models_sub += cv_models

        # Holdout: predict subject 5 rows presented as subject 6, then
        # subject 6 rows presented as subject 5.
        for held_out, swapped_subject in ((f5, 6), (f6, 5)):
            x_valid = x_all[held_out].copy()
            x_valid['subject'] = swapped_subject
            pred, model = train_holdout(
                x_all[~held_out], y_all[~held_out],
                x_valid, y_all[held_out], params
            )
            tr_pred_sub[held_out] = pred
            models_sub.append(model)

        # Predict the test data with every model of this repetition and average.
        ts_pred_sub = 0
        for model in models_sub:
            ts_pred_sub += model.predict(x_test.values)
        ts_pred_sub /= len(models_sub)

        # RMSE via sqrt(MSE): the `squared=False` argument was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        print(np.sqrt(mean_squared_error(y_all.values, tr_pred_sub)))
        tr_pred += tr_pred_sub / n_loop
        ts_pred += ts_pred_sub / n_loop
        models += models_sub
    return tr_pred, ts_pred, models

## 各加速度の学習/推論

In [None]:
# Number of seed repetitions averaged inside train_predict.
n_loop = 1

# Targets are trained in this order. 'acc_z' comes first because its prediction
# is added as a feature for the targets that follow.
target_cols = [
    'acc_z',
    'acc_x',
    'acc_y',
    'acc_r',
    'acc_s',
    'acc_c',
    'acc_e',
]

# Columns of the EMG frames that are not used as features.
drop_cols = [
    'trial',
    'time',
]

In [None]:
# Train one set of models per target and collect the predictions.
#
# NOTE: this cell adds 'pred_<target>' columns to tr_emg_df / ts_emg_df for
# z-axis targets. Those columns are not in drop_cols, so re-running the cell
# without reloading the data makes the z-axis model train on its own earlier
# prediction. Reload the data before re-running.
tr_preds = []
ts_preds = []
model_dict = {}
for target_col in target_cols:
    # Train / predict
    tr_pred, ts_pred, models = train_predict(
        tr_emg_df, tr_vel_df, ts_emg_df,
        target_col, drop_cols, params, n_loop=n_loop
    )

    # Scores: RMSE for subjects <= 4, subject 5 and subject 6, plus their mean.
    # RMSE is computed as sqrt(MSE) because the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    subject = tr_vel_df['subject']
    scores = []
    for mask in (subject <= 4, subject == 5, subject == 6):
        scores.append(np.sqrt(mean_squared_error(
            tr_vel_df.loc[mask, target_col].values, tr_pred[mask]
        )))
    print(target_col, *scores, sum(scores) / 3)

    # The z-axis prediction is added as a feature for the following targets.
    if target_col[-1] == 'z':
        tr_emg_df[f'pred_{target_col}'] = tr_pred
        ts_emg_df[f'pred_{target_col}'] = ts_pred

    # Store the results
    tr_preds.append(pd.Series(tr_pred, name=f'pred_{target_col}'))
    ts_preds.append(pd.Series(ts_pred, name=f'pred_{target_col}'))
    model_dict[target_col] = models

# 保存

In [None]:
# Combine the per-target prediction Series into one frame each (one column per target).
tr_pred_df = pd.concat(tr_preds, axis=1)
ts_pred_df = pd.concat(ts_preds, axis=1)

# Write both frames to the WORKING directory.
tr_pred_df.to_pickle(WORKING / 'tr_pred_acc.pickle')
ts_pred_df.to_pickle(WORKING / 'ts_pred_acc.pickle')

In [None]:
import joblib

# Store all trained models (keyed by target column) in one compressed file.
model_path = WORKING / 'lgbm_acc.joblib'
with model_path.open('wb') as model_file:
    joblib.dump(model_dict, model_file, compress=4)