In [7]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from os import path
import os
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning
warnings.filterwarnings("ignore", category=ValueWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


In [8]:
features = [
    "m_speed_stddev_480",
    "m_acceleration_stddev_480",
    "m_jerk_stddev_480",
    "m_steering_stddev_480",
    "AccelInput_stddev_480",
    "BrakeInput_stddev_480",
    "realtime steering entropy_1100",
    "realtime steering entropy_1100_stddev_480",
    "perclos",
]
TRAIN_DIR = 'dms_data/train/'
TEST_DIR = 'dms_data/test/'

In [9]:
import itertools

def generate_combinations(features, number): return [list(combination) for combination in itertools.combinations(features, number)]

# Example usage:
sample_features = ["feature1", "feature2", "feature3", "feature4"]
number = 2
combinations = generate_combinations(sample_features, number)
print(combinations)

[['feature1', 'feature2'], ['feature1', 'feature3'], ['feature1', 'feature4'], ['feature2', 'feature3'], ['feature2', 'feature4'], ['feature3', 'feature4']]


In [10]:
def solveOne(train_csv_path,test_csv_path,feature_columns,target,file_name=''):
  """
  :param train_csv_path: path to train csv file
  :param test_csv_path: path to test csv file
  :param feature_columns: list of feature to be used as input
  :param target: target dataFrame column
  :param file_name: name of the file to save the plot
  :return: None
  """
  # train_df = pd.read_csv(path.join(TRAIN_DIR, train_csv))
  # test_df = pd.read_csv(path.join(TEST_DIR, test_csv))
  train_df = pd.read_csv(train_csv_path)
  test_df = pd.read_csv(test_csv_path)

  train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
  test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
  train_df.set_index('timestamp', inplace=True)
  test_df.set_index('timestamp', inplace=True)

  # # 頻度追加
  # train_df = train_df.resample('20S')
  # test_df = test_df.resample('20S')

  
  feature_scaler = MinMaxScaler()
  target_scaler = MinMaxScaler()
  train_df[feature_columns] = feature_scaler.fit_transform(train_df[feature_columns])
  test_df[feature_columns] = feature_scaler.transform(test_df[feature_columns])
  train_df[target] = target_scaler.fit_transform(train_df[[target]])
  test_df[target] = target_scaler.transform(test_df[[target]])
  
  model = ARIMA(train_df[target], order=(5,1,0))
  model_fit = model.fit()
  predictions = model_fit.forecast(steps=len(test_df))
  rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
  #print('Test RMSE: %.3f' % rmse)
  # plt.title(f'{target} over Time')
  # plt.xlabel('Timestamp')
  # plt.ylabel(target)
  # plt.plot(test_df.index, test_df[target], label='Actual')
  # plt.plot(test_df.index, predictions, label='Predicted', color='red')
  # plt.legend()
  # if file_name != '':
  #   plt.savefig(path.join('./figure', file_name))
  #   plt.show()
  return rmse

In [11]:
def calculate_average_rmse(train_paths, test_paths, features, target, file_name=''):
  rmses = []
  for train_csv, test_csv in zip(train_paths, test_paths):
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)
    # タイムスタンプをインデックスに
    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
    train_df.set_index('timestamp', inplace=True)
    test_df.set_index('timestamp', inplace=True)
    # 正規化 
    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    train_df[features] = feature_scaler.fit_transform(train_df[features])
    test_df[features] = feature_scaler.transform(test_df[features])
    train_df[target] = target_scaler.fit_transform(train_df[[target]])
    test_df[target] = target_scaler.transform(test_df[[target]])
    model = ARIMA(train_df[target], order=(5, 1, 0))
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=len(test_df))
    rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
    rmses.append(rmse)
    # プロット
    # if file_name != '':
    #   plt.title(f'{target} over Time')
    #   plt.xlabel('Timestamp')
    #   plt.ylabel(target)
    #   plt.plot(test_df.index, test_df[target], label='Actual')
    #   plt.plot(test_df.index, predictions, label='Predicted', color='red')
    #   plt.legend()
    #   plt.savefig(path.join('./figure', f'{file_name}_{train_csv.split("/")[-1]}'))
    #   plt.clf()  # Clear the current figure

  average_rmse = np.mean(rmses)
  return average_rmse

In [12]:

""" 
target => 予測対象のカラム
"""
target = 'oss'
# target = 'Sleepiness'

csvs = os.listdir(TRAIN_DIR)

for i,csv in enumerate(csvs):
  if csv.endswith('y_train.csv'):
    #人ごとセクション
    train_path = path.join(TRAIN_DIR, csv)
    test_path = path.join(TEST_DIR, csv.replace('train', 'test'))
    list_rmse_with_feature_counts = []
    for ii in range(len(features)):
      #人+特徴量組み合わせ個数セクション
      feature_combinations = generate_combinations(features, ii + 1)
      results = []
      for feature_combination in feature_combinations:
        #人+特徴量組み合わせ個数+特徴量セクション
        rmse = solveOne(train_path,test_path,feature_combination,target,'')
        results.append(rmse)
      average = np.mean(results)
      list_rmse_with_feature_counts.append(average)
    
    column_names = ['変数の数','平均'] + list(map(lambda x: str(x), range(1, len(features) + 1)))
    row = ['ARIMA',str(np.mean(list_rmse_with_feature_counts))] + list(map(lambda rmse: '{:.3f}'.format(rmse), list_rmse_with_feature_counts))
    print(str(i))
    print(f"table:ARIMA_{target}-{str(i)}")
    print(' ' + '\t'.join(column_names))
    print(' ' + '\t'.join(row))
    print('------------------------')

# train_path_list = [path.join(TRAIN_DIR, csv) for csv in csvs if csv.endswith('y_train.csv')]
# test_path_list = [path.join(TEST_DIR, csv.replace('train', 'test')) for csv in csvs if csv.endswith('y_train.csv')]

# rmse = calculate_average_rmse(train_path_list,test_path_list, features, target)
# print(rmse)

0
table:ARIMA_oss-0
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.2673453073156865	0.267	0.267	0.267	0.267	0.267	0.267	0.267	0.267	0.267
------------------------
1
table:ARIMA_oss-1
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.13615373564892713	0.136	0.136	0.136	0.136	0.136	0.136	0.136	0.136	0.136
------------------------
2
table:ARIMA_oss-2
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.16461080397533617	0.165	0.165	0.165	0.165	0.165	0.165	0.165	0.165	0.165
------------------------
3
table:ARIMA_oss-3
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.10991002475812409	0.110	0.110	0.110	0.110	0.110	0.110	0.110	0.110	0.110
------------------------
4
table:ARIMA_oss-4
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.1409359669290415	0.141	0.141	0.141	0.141	0.141	0.141	0.141	0.141	0.141
------------------------
5
table:ARIMA_oss-5
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.13444635772636332	0.134	0.134	0.134	0.134	0.134	0.134	0.134	0.134	0.134
------------------------
6
table:ARIMA_oss-6
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.29452702518430574	0.295	0.29

```
0
table:ARIMA_Sleepiness-0
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.3076501898367987	0.308	0.308	0.308	0.308	0.308	0.308	0.308	0.308	0.308
------------------------
1
table:ARIMA_Sleepiness-1
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.0	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
------------------------
2
table:ARIMA_Sleepiness-2
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.1250240172467141	0.125	0.125	0.125	0.125	0.125	0.125	0.125	0.125	0.125
------------------------
3
table:ARIMA_Sleepiness-3
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.0	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
------------------------
4
table:ARIMA_Sleepiness-4
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.23264578616199694	0.233	0.233	0.233	0.233	0.233	0.233	0.233	0.233	0.233
------------------------
5
table:ARIMA_Sleepiness-5
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.26130150065867	0.261	0.261	0.261	0.261	0.261	0.261	0.261	0.261	0.261
------------------------
6
table:ARIMA_Sleepiness-6
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.38509829531684286	0.385	0.385	0.385	0.385	0.385	0.385	0.385	0.385	0.385
------------------------
7
table:ARIMA_Sleepiness-7
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.637414338416104	0.637	0.637	0.637	0.637	0.637	0.637	0.637	0.637	0.637
------------------------
8
table:ARIMA_Sleepiness-8
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.23523343301677732	0.235	0.235	0.235	0.235	0.235	0.235	0.235	0.235	0.235
------------------------
9
table:ARIMA_Sleepiness-9
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.511469021164515	0.511	0.511	0.511	0.511	0.511	0.511	0.511	0.511	0.511
------------------------
10
table:ARIMA_Sleepiness-10
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.1318678362156599	0.132	0.132	0.132	0.132	0.132	0.132	0.132	0.132	0.132
------------------------
11
table:ARIMA_Sleepiness-11
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.13550490661382836	0.136	0.136	0.136	0.136	0.136	0.136	0.136	0.136	0.136
------------------------
12
table:ARIMA_Sleepiness-12
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.5020677297677801	0.502	0.502	0.502	0.502	0.502	0.502	0.502	0.502	0.502
------------------------
13
table:ARIMA_Sleepiness-13
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.005727473001256247	0.006	0.006	0.006	0.006	0.006	0.006	0.006	0.006	0.006
------------------------
14
table:ARIMA_Sleepiness-14
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	1.3798026729982695e-18	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
------------------------
15
table:ARIMA_Sleepiness-15
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.10110015439446483	0.101	0.101	0.101	0.101	0.101	0.101	0.101	0.101	0.101
------------------------

```