In [14]:
import os
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [15]:


dms_dir = './dms_data'


def read_and_process(file):
  df = pd.read_csv(file)
  df = df.drop(['timestamp'], axis=1)
  df = df.dropna()  # 無効値を含む行を削除
  return df

def train_model(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  train_data = lgb.Dataset(X_train, label=y_train)
  eval_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  params = {
      'objective': 'regression',
      'metric': 'rmse',
      'verbose':-1
  }
  model = lgb.train(params, train_data, valid_sets=eval_data)
  return model, X_test, y_test


# 訓練データを処理
train_data_path = './train'
train_data_path = dms_dir + '/train'
#train_data_path = './dms_data/train'
files = os.listdir(train_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(train_data_path, file))
    df_list.append(df)

# データを結合
train_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_oss = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_oss = train_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_sleepiness = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_sleepiness = train_df['Sleepiness']

y_sleepiness.describe()


count    1738.000000
mean        5.676982
std         1.858004
min         1.000000
25%         4.000000
50%         5.794879
75%         7.000000
max         9.000000
Name: Sleepiness, dtype: float64

In [17]:
# モデルの学習 (oss)
model_oss, X_test_oss, y_test_oss = train_model(X_oss, y_oss)

# モデルの学習 (Sleepiness)
model_sleepiness, X_test_sleepiness, y_test_sleepiness = train_model(X_sleepiness, y_sleepiness)

# テストデータを処理
test_data_path = './test'
test_data_path = dms_dir + '/test'
#test_data_path = './dms_data/test'
files = os.listdir(test_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(test_data_path, file))
    df_list.append(df)

# データを結合
test_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_test_oss = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_oss = test_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_test_sleepiness = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_sleepiness = test_df['Sleepiness']

# 目的変数の中身を表示。

y_test_oss.head()
# モデルの評価 (oss)
oss_preds = model_oss.predict(X_test_oss)
oss_rmse = np.sqrt(mean_squared_error(y_test_oss, oss_preds))
print(f"RMSE for oss model: {oss_rmse}")

# モデルの評価 (Sleepiness)
sleepiness_preds = model_sleepiness.predict(X_test_sleepiness)
sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, sleepiness_preds))
print(f"RMSE for sleepiness model: {sleepiness_rmse}")

[1]	valid_0's rmse: 0.644743
[2]	valid_0's rmse: 0.60529
[3]	valid_0's rmse: 0.572186
[4]	valid_0's rmse: 0.54065
[5]	valid_0's rmse: 0.518605
[6]	valid_0's rmse: 0.497177
[7]	valid_0's rmse: 0.475992
[8]	valid_0's rmse: 0.461166
[9]	valid_0's rmse: 0.44502
[10]	valid_0's rmse: 0.431144
[11]	valid_0's rmse: 0.422217
[12]	valid_0's rmse: 0.412459
[13]	valid_0's rmse: 0.402994
[14]	valid_0's rmse: 0.394118
[15]	valid_0's rmse: 0.388302
[16]	valid_0's rmse: 0.382783
[17]	valid_0's rmse: 0.377448
[18]	valid_0's rmse: 0.372986
[19]	valid_0's rmse: 0.367962
[20]	valid_0's rmse: 0.363507
[21]	valid_0's rmse: 0.361
[22]	valid_0's rmse: 0.356155
[23]	valid_0's rmse: 0.352299
[24]	valid_0's rmse: 0.348463
[25]	valid_0's rmse: 0.345106
[26]	valid_0's rmse: 0.344176
[27]	valid_0's rmse: 0.342893
[28]	valid_0's rmse: 0.34163
[29]	valid_0's rmse: 0.339506
[30]	valid_0's rmse: 0.337213
[31]	valid_0's rmse: 0.336244
[32]	valid_0's rmse: 0.334792
[33]	valid_0's rmse: 0.332411
[34]	valid_0's rmse: 0.331

In [4]:


# 線形回帰モデルの学習 (oss)
lin_reg_oss = LinearRegression().fit(X_oss, y_oss)
lin_reg_oss_preds = lin_reg_oss.predict(X_test_oss)
lin_reg_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, lin_reg_oss_preds))
print(f"RMSE for Linear Regression (oss) model: {lin_reg_oss_rmse}")

# 線形回帰モデルの学習 (Sleepiness)
lin_reg_sleepiness = LinearRegression().fit(X_sleepiness, y_sleepiness)
lin_reg_sleepiness_preds = lin_reg_sleepiness.predict(X_test_sleepiness)
lin_reg_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, lin_reg_sleepiness_preds))
print(f"RMSE for Linear Regression (Sleepiness) model: {lin_reg_sleepiness_rmse}")

# ニューラルネットワークモデルの構造定義


def create_nn_model(input_shape):
  model = Sequential()
  model.add(Dense(32, activation='relu', input_shape=(input_shape,)))
  model.add(Dense(32, activation='relu'))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer=Adam())
  return model


# ニューラルネットワークモデルの学習 (oss)
nn_model_oss = create_nn_model(X_oss.shape[1])
nn_model_oss.fit(X_oss, y_oss, epochs=10, verbose=0)
nn_oss_preds = nn_model_oss.predict(X_test_oss).flatten()
nn_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, nn_oss_preds))
print(f"RMSE for Neural Network (oss) model: {nn_oss_rmse}")

# ニューラルネットワークモデルの学習 (Sleepiness)
nn_model_sleepiness = create_nn_model(X_sleepiness.shape[1])
nn_model_sleepiness.fit(X_sleepiness, y_sleepiness, epochs=10, verbose=0)
nn_sleepiness_preds = nn_model_sleepiness.predict(X_test_sleepiness).flatten()
nn_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, nn_sleepiness_preds))
print(f"RMSE for Neural Network (Sleepiness) model: {nn_sleepiness_rmse}")


RMSE for Linear Regression (oss) model: 0.5743600270704748
RMSE for Linear Regression (Sleepiness) model: 1.5044710625022029
RMSE for Neural Network (oss) model: 0.6997902481804317
RMSE for Neural Network (Sleepiness) model: 1.801208759965754


In [5]:

train = pd.read_csv(dms_dir + '/train/20201126_1546_0_y_train.csv').drop(columns=['timestamp']).dropna();
test = pd.read_csv(dms_dir + '/test/20201126_1546_0_y_test.csv').drop(columns=['timestamp']).dropna();

# 特徴量のリスト
features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
            'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
            'm_jerk_var_480', 'm_jerk_stddev_480']

# 特徴量の全ての組み合わせを生成
feature_combinations = []
for r in range(1, len(features) + 1):
  for subset in itertools.combinations(features, r):
    feature_combinations.append(list(subset))

# RMSEと特徴量の組み合わせを記録するテキストファイルを開く
with open("feature_selection_results.txt", "w") as f:
  # 各特徴量の組み合わせについて
  for feature_comb in feature_combinations:
    # モデルの訓練
    lgb_data = lgb.Dataset(train[feature_comb], label=train['oss'])
    lgb_model = lgb.train({'verbose':-1}, lgb_data)
    # テストデータでの予測
    preds = lgb_model.predict(test[feature_comb])
    # RMSEの計算
    rmse = np.sqrt(mean_squared_error(test['oss'], preds))
    # テキストファイルに結果を出力
    f.write(f"Features: {feature_comb}, RMSE: {rmse}\n")


### L1正規化と総当たりの比較

In [6]:
from sklearn.linear_model import LassoCV

# 特徴量の全ての組み合わせについてのRMSEを格納するリスト
rmses = []

# 各特徴量の組み合わせについて
for feature_comb in feature_combinations:
  # モデルの訓練
  lgb_data = lgb.Dataset(train[feature_comb], label=train['oss'])
  lgb_model = lgb.train({'verbose':-1}, lgb_data)

  # テストデータでの予測
  preds = lgb_model.predict(test[feature_comb])

  # RMSEの計算
  rmse = np.sqrt(mean_squared_error(test['oss'], preds))
  rmses.append(rmse)

# 最も小さいRMSEを持つ組み合わせを取得
best_comb = feature_combinations[np.argmin(rmses)]
print(f"Best feature combination: {best_comb}")

# Lassoによる特徴量選択
lasso = LassoCV(cv=5).fit(train[features], train['oss'])

# 非ゼロの係数を持つ特徴量を取得
selected_features = np.array(features)[lasso.coef_ != 0]
print(f"Selected features by Lasso: {selected_features}")


Best feature combination: ['m_speed_var_480', 'm_acceleration', 'm_jerk', 'm_jerk_var_480']
Selected features by Lasso: ['m_speed' 'm_speed_var_480' 'm_speed_stddev_480']


### oss,sleepinessの正規化忘れてた。

In [7]:
from sklearn.preprocessing import MinMaxScaler

import os
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

def read_and_process(file):
  df = pd.read_csv(file)
  df = df.drop(['timestamp'], axis=1)
  df = df.dropna()  # 無効値を含む行を削除
  return df

def train_model(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  train_data = lgb.Dataset(X_train, label=y_train)
  eval_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  params = {
      'objective': 'regression',
      'metric': 'rmse',
      'verbose':-1
  }
  model = lgb.train(params, train_data, valid_sets=eval_data)
  return model, X_test, y_test

def normalize_data(series):
  scaler = MinMaxScaler()
  series = series.values.reshape(-1, 1) 
  series = scaler.fit_transform(series)
  series = series.ravel()
  return pd.Series(series)

In [8]:
# 訓練データを処理
train_data_path = './train'
train_data_path = dms_dir + '/train'
#train_data_path = './dms_data/train'
files = os.listdir(train_data_path)
print(files)

['20201127_1840_5_y_train.csv', '20201210_1610_6_y_train.csv', '20201130_1808_6_y_train.csv', '20201130_1122_5_y_train.csv', '20201210_1354_2_y_train.csv', '20201201_1429_5_y_train.csv', '20201203_1022_7_y_train.csv', '20201201_1230_0_y_train.csv', '20201127_1701_7_y_train.csv', '20201210_1112_2_y_train.csv', '20201127_1548_2_y_train.csv', '20201203_1244_5_y_train.csv', '20201201_1555_0_y_train.csv', '20201127_1432_7_y_train.csv', '20201126_1546_0_y_train.csv', '20201203_1404_6_y_train.csv']


In [9]:
# 訓練データを処理
train_data_path = './train'
train_data_path = dms_dir + '/train'
#train_data_path = './dms_data/train'
files = os.listdir(train_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(train_data_path, file))
    df_list.append(df)

# データを結合
train_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_oss = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_oss = train_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_sleepiness = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_sleepiness = train_df['Sleepiness']

# 正規化
y_oss = normalize_data(y_oss)
y_sleepiness = normalize_data(y_sleepiness)

In [10]:

# モデルの学習 (oss)
model_oss, X_test_oss, y_test_oss = train_model(X_oss, y_oss)

# モデルの学習 (Sleepiness)
model_sleepiness, X_test_sleepiness, y_test_sleepiness = train_model(X_sleepiness, y_sleepiness)

# テストデータを処理
test_data_path = './test'
test_data_path = dms_dir + '/test'
#test_data_path = './dms_data/test'
files = os.listdir(test_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(test_data_path, file))
    df_list.append(df)

# データを結合
test_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_test_oss = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_oss = test_df['oss']

# 特徴量と目的変数に分割 (Sleepiness)
X_test_sleepiness = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_sleepiness = test_df['Sleepiness']

# Normalize y_test_oss and y_test_sleepiness
y_test_oss = normalize_data(y_test_oss)
y_test_sleepiness = normalize_data(y_test_sleepiness)

# モデルの評価 (oss)
oss_preds = model_oss.predict(X_test_oss)
oss_rmse = np.sqrt(mean_squared_error(y_test_oss, oss_preds))
print(f"RMSE for oss model: {oss_rmse}")

# モデルの評価 (Sleepiness)
sleepiness_preds = model_sleepiness.predict(X_test_sleepiness)
sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, sleepiness_preds))
print(f"RMSE for sleepiness model: {sleepiness_rmse}")



[1]	valid_0's rmse: 0.205158
[2]	valid_0's rmse: 0.191345
[3]	valid_0's rmse: 0.179731
[4]	valid_0's rmse: 0.16975
[5]	valid_0's rmse: 0.161701
[6]	valid_0's rmse: 0.154361
[7]	valid_0's rmse: 0.147152
[8]	valid_0's rmse: 0.141038
[9]	valid_0's rmse: 0.136452
[10]	valid_0's rmse: 0.132449
[11]	valid_0's rmse: 0.129289
[12]	valid_0's rmse: 0.126064
[13]	valid_0's rmse: 0.123026
[14]	valid_0's rmse: 0.121193
[15]	valid_0's rmse: 0.119088
[16]	valid_0's rmse: 0.117599
[17]	valid_0's rmse: 0.115588
[18]	valid_0's rmse: 0.114731
[19]	valid_0's rmse: 0.1129
[20]	valid_0's rmse: 0.11167
[21]	valid_0's rmse: 0.110951
[22]	valid_0's rmse: 0.109728
[23]	valid_0's rmse: 0.109035
[24]	valid_0's rmse: 0.108452
[25]	valid_0's rmse: 0.10756
[26]	valid_0's rmse: 0.106843
[27]	valid_0's rmse: 0.106037
[28]	valid_0's rmse: 0.105461
[29]	valid_0's rmse: 0.104787
[30]	valid_0's rmse: 0.104753
[31]	valid_0's rmse: 0.104229
[32]	valid_0's rmse: 0.104028
[33]	valid_0's rmse: 0.103023
[34]	valid_0's rmse: 0.1

In [11]:
# 線形回帰モデルの学習 (oss)
lin_reg_oss = LinearRegression().fit(X_oss, y_oss)
lin_reg_oss_preds = lin_reg_oss.predict(X_test_oss)
lin_reg_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, lin_reg_oss_preds))
print(f"RMSE for Linear Regression (oss) model: {lin_reg_oss_rmse}")

# 線形回帰モデルの学習 (Sleepiness)
lin_reg_sleepiness = LinearRegression().fit(X_sleepiness, y_sleepiness)
lin_reg_sleepiness_preds = lin_reg_sleepiness.predict(X_test_sleepiness)
lin_reg_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, lin_reg_sleepiness_preds))
print(f"RMSE for Linear Regression (Sleepiness) model: {lin_reg_sleepiness_rmse}")

# ニューラルネットワークモデルの構造定義


def create_nn_model(input_shape):
  model = Sequential()
  model.add(Dense(32, activation='relu', input_shape=(input_shape,)))
  model.add(Dense(32, activation='relu'))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer=Adam())
  return model


# ニューラルネットワークモデルの学習 (oss)
nn_model_oss = create_nn_model(X_oss.shape[1])
nn_model_oss.fit(X_oss, y_oss, epochs=10, verbose=0)
nn_oss_preds = nn_model_oss.predict(X_test_oss).flatten()
nn_oss_rmse = np.sqrt(mean_squared_error(y_test_oss, nn_oss_preds))
print(f"RMSE for Neural Network (oss) model: {nn_oss_rmse}")

# ニューラルネットワークモデルの学習 (Sleepiness)
nn_model_sleepiness = create_nn_model(X_sleepiness.shape[1])
nn_model_sleepiness.fit(X_sleepiness, y_sleepiness, epochs=10, verbose=0)
nn_sleepiness_preds = nn_model_sleepiness.predict(X_test_sleepiness).flatten()
nn_sleepiness_rmse = np.sqrt(mean_squared_error(y_test_sleepiness, nn_sleepiness_preds))
print(f"RMSE for Neural Network (Sleepiness) model: {nn_sleepiness_rmse}")

RMSE for Linear Regression (oss) model: 0.16824343373994186
RMSE for Linear Regression (Sleepiness) model: 0.18805888281277539
RMSE for Neural Network (oss) model: 0.3483422187259162
RMSE for Neural Network (Sleepiness) model: 0.28829864419794843


In [12]:
#Log output on/off during learning
#t  = True
t = False

#正規化してossのみLightGBM一つのセルにまとめる
def read_and_process(file):
  df = pd.read_csv(file)
  df = df.drop(['timestamp'], axis=1)
  df = df.dropna()  # 無効値を含む行を削除
  return df

def train_model(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  train_data = lgb.Dataset(X_train, label=y_train)
  eval_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
  params = {
      'objective': 'regression',
      'metric': 'rmse',
      'verbose':-1
  }
  model = None
  if(t):
    model = lgb.train(params, train_data, valid_sets=eval_data)
  else:
    model = lgb.train(params, train_data)
  return model, X_test, y_test

def normalize_data(series):
  scaler = MinMaxScaler()
  series = series.values.reshape(-1, 1) 
  series = scaler.fit_transform(series)
  series = series.ravel()
  return pd.Series(series)

# 訓練データを処理
train_data_path = './train'
train_data_path = dms_dir + '/train'
#train_data_path = './dms_data/train'
files = os.listdir(train_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(train_data_path, file))
    df_list.append(df)

# データを結合
train_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_oss = train_df.drop(['oss', 'Sleepiness'], axis=1)
y_oss = train_df['oss']
y_oss = normalize_data(y_oss)

model_oss, X_test_oss, y_test_oss = train_model(X_oss, y_oss)

# テストデータを処理
test_data_path = './test'
test_data_path = dms_dir + '/test'
#test_data_path = './dms_data/test'
files = os.listdir(test_data_path)

df_list = []
for file in files:
  if file.endswith('.csv'):
    df = read_and_process(os.path.join(test_data_path, file))
    df_list.append(df)

# データを結合
test_df = pd.concat(df_list, ignore_index=True)

# 特徴量と目的変数に分割 (oss)
X_test_oss = test_df.drop(['oss', 'Sleepiness'], axis=1)
y_test_oss = test_df['oss']

# Normalize y_test_oss and y_test_sleepiness
y_test_oss = normalize_data(y_test_oss)

# モデルの評価 (oss)
oss_preds = model_oss.predict(X_test_oss)
oss_rmse = np.sqrt(mean_squared_error(y_test_oss, oss_preds))
print(f"RMSE for oss model: {oss_rmse}")

RMSE for oss model: 0.1674825679650757
