In [15]:
import os
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam



def read_and_process(file):
  df = pd.read_csv(file)
  df = df.drop(['timestamp'], axis=1)
  df = df.dropna()  # 無効値を含む行を削除
  return df


def train_model(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  train_data = lgb.Dataset(X_train, label=y_train)
  eval_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

  params = {
      'objective': 'regression',
      'metric': 'rmse',
  }
  model = lgb.train(params, train_data, valid_sets=eval_data)
  return model, X_test, y_test


# 訓練データを処理
train_data_path = './train'
train_data_path = './dms_data/train'
files = os.listdir(train_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(train_data_path, file))
    df_list.append(df)

# データを結合
train_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_oss = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_oss = train_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_sleepiness = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_sleepiness = train_df['Sleepiness']

# モデルの学習 (oss)
model_oss, X_test_oss, y_test_oss = train_model(X_oss, y_oss)

# モデルの学習 (Sleepiness)
model_sleepiness, X_test_sleepiness, y_test_sleepiness = train_model(X_sleepiness, y_sleepiness)

# テストデータを処理
test_data_path = './test'
test_data_path = './dms_data/test'
files = os.listdir(test_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(test_data_path, file))
    df_list.append(df)

# データを結合
test_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_test_oss = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_oss = test_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_test_sleepiness = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_sleepiness = test_df['Sleepiness']

# モデルの評価 (oss)
oss_preds = model_oss.predict(X_test_oss)
oss_rmse = np.sqrt(mean_squared_error(y_test_oss, oss_preds))
print(f"RMSE for oss model: {oss_rmse}")

# モデルの評価 (Sleepiness)
sleepiness_preds = model_sleepiness.predict(X_test_sleepiness)
sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, sleepiness_preds))
print(f"RMSE for sleepiness model: {sleepiness_rmse}")


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5042
[LightGBM] [Info] Number of data points in the train set: 1390, number of used features: 22
[LightGBM] [Info] Start training from score 3.031342
[1]	valid_0's rmse: 0.655825
[2]	valid_0's rmse: 0.61167
[3]	valid_0's rmse: 0.574543
[4]	valid_0's rmse: 0.542637
[5]	valid_0's rmse: 0.516908
[6]	valid_0's rmse: 0.493443
[7]	valid_0's rmse: 0.470399
[8]	valid_0's rmse: 0.450852
[9]	valid_0's rmse: 0.436194
[10]	valid_0's rmse: 0.423399
[11]	valid_0's rmse: 0.413296
[12]	valid_0's rmse: 0.402987
[13]	valid_0's rmse: 0.393276
[14]	valid_0's rmse: 0.387414
[15]	valid_0's rmse: 0.380685
[16]	valid_0's rmse: 0.375928
[17]	valid_0's rmse: 0.369497
[18]	valid_0's rmse: 0.366759
[19]	valid_0's rmse: 0.360905
[20]	valid_0's rmse: 0.356972
[21]	valid_0's rmse: 0.354674
[22]	valid_0's rmse: 0.350765
[23]	valid_0's rmse: 0.348549
[24]	valid_0's rmse: 0.346686
[25]	valid_0's rmse: 0.343834
[26]	valid_0's rmse: 0.

In [16]:


# 線形回帰モデルの学習 (oss)
lin_reg_oss = LinearRegression().fit(X_oss, y_oss)
lin_reg_oss_preds = lin_reg_oss.predict(X_test_oss)
lin_reg_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, lin_reg_oss_preds))
print(f"RMSE for Linear Regression (oss) model: {lin_reg_oss_rmse}")

# 線形回帰モデルの学習 (Sleepiness)
lin_reg_sleepiness = LinearRegression().fit(X_sleepiness, y_sleepiness)
lin_reg_sleepiness_preds = lin_reg_sleepiness.predict(X_test_sleepiness)
lin_reg_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, lin_reg_sleepiness_preds))
print(f"RMSE for Linear Regression (Sleepiness) model: {lin_reg_sleepiness_rmse}")

# ニューラルネットワークモデルの構造定義


def create_nn_model(input_shape):
  model = Sequential()
  model.add(Dense(32, activation='relu', input_shape=(input_shape,)))
  model.add(Dense(32, activation='relu'))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer=Adam())
  return model


# ニューラルネットワークモデルの学習 (oss)
nn_model_oss = create_nn_model(X_oss.shape[1])
nn_model_oss.fit(X_oss, y_oss, epochs=10, verbose=0)
nn_oss_preds = nn_model_oss.predict(X_test_oss).flatten()
nn_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, nn_oss_preds))
print(f"RMSE for Neural Network (oss) model: {nn_oss_rmse}")

# ニューラルネットワークモデルの学習 (Sleepiness)
nn_model_sleepiness = create_nn_model(X_sleepiness.shape[1])
nn_model_sleepiness.fit(X_sleepiness, y_sleepiness, epochs=10, verbose=0)
nn_sleepiness_preds = nn_model_sleepiness.predict(X_test_sleepiness).flatten()
nn_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, nn_sleepiness_preds))
print(f"RMSE for Neural Network (Sleepiness) model: {nn_sleepiness_rmse}")


RMSE for Linear Regression (oss) model: 0.5743600270704748
RMSE for Linear Regression (Sleepiness) model: 1.5044710625022029
RMSE for Neural Network (oss) model: 0.6801930484509798
RMSE for Neural Network (Sleepiness) model: 1.7776257144502214


In [17]:

train = pd.read_csv('./dms_data/train/20201126_1546_0_y_train.csv').drop(columns=['timestamp']).dropna();
test = pd.read_csv('./dms_data/test/20201126_1546_0_y_test.csv').drop(columns=['timestamp']).dropna();

# 特徴量のリスト
features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
            'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
            'm_jerk_var_480', 'm_jerk_stddev_480']

# 特徴量の全ての組み合わせを生成
feature_combinations = []
for r in range(1, len(features) + 1):
  for subset in itertools.combinations(features, r):
    feature_combinations.append(list(subset))

# RMSEと特徴量の組み合わせを記録するテキストファイルを開く
with open("feature_selection_results.txt", "w") as f:
  # 各特徴量の組み合わせについて
  for feature_comb in feature_combinations:
    # モデルの訓練
    lgb_data = lgb.Dataset(train[feature_comb], label=train['oss'])
    lgb_model = lgb.train({}, lgb_data)

    # テストデータでの予測
    preds = lgb_model.predict(test[feature_comb])

    # RMSEの計算
    rmse = np.sqrt(mean_squared_error(test['oss'], preds))

    # テキストファイルに結果を出力
    f.write(f"Features: {feature_comb}, RMSE: {rmse}\n")


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]

### L1正規化と総当たりの比較

In [18]:
from sklearn.linear_model import LassoCV

# 特徴量の全ての組み合わせについてのRMSEを格納するリスト
rmses = []

# 各特徴量の組み合わせについて
for feature_comb in feature_combinations:
  # モデルの訓練
  lgb_data = lgb.Dataset(train[feature_comb], label=train['oss'])
  lgb_model = lgb.train({'verbose':-1}, lgb_data)

  # テストデータでの予測
  preds = lgb_model.predict(test[feature_comb])

  # RMSEの計算
  rmse = np.sqrt(mean_squared_error(test['oss'], preds))
  rmses.append(rmse)
  print(f"Features: {feature_comb}, RMSE: {rmse}")

# 最も小さいRMSEを持つ組み合わせを取得
best_comb = feature_combinations[np.argmin(rmses)]
print(f"Best feature combination: {best_comb}")

# Lassoによる特徴量選択
lasso = LassoCV(cv=5).fit(train[features], train['oss'])

# 非ゼロの係数を持つ特徴量を取得
selected_features = np.array(features)[lasso.coef_ != 0]
print(f"Selected features by Lasso: {selected_features}")


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 110, number of used features: 1
[LightGBM] [Info] Start training from score 2.462952
You can set `force_row_wise=true` to remove the overhead.
And if mem