In [15]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from os import path
import os
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

warnings.filterwarnings("ignore", category=ValueWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


In [16]:
features = [
    "m_speed_stddev_480",
    "m_acceleration_stddev_480",
    "m_jerk_stddev_480",
    "m_steering_stddev_480",
    "AccelInput_stddev_480",
    "BrakeInput_stddev_480",
    "realtime steering entropy_1100",
    "realtime steering entropy_1100_stddev_480",
    "perclos",
]
TRAIN_DIR = 'dms_data/train/'
TEST_DIR = 'dms_data/test/'

In [17]:
import itertools

def generate_combinations(features, number): return [list(combination) for combination in itertools.combinations(features, number)]

# Example usage:
sample_features = ["feature1", "feature2", "feature3", "feature4"]
number = 2
combinations = generate_combinations(sample_features, number)
print(combinations)

[['feature1', 'feature2'], ['feature1', 'feature3'], ['feature1', 'feature4'], ['feature2', 'feature3'], ['feature2', 'feature4'], ['feature3', 'feature4']]


In [19]:
def solveOne(train_csv_path,test_csv_path,feature_columns,target,file_name=''):
  """
  :param train_csv_path: path to train csv file
  :param test_csv_path: path to test csv file
  :param feature_columns: list of feature to be used as input
  :param target: target dataFrame column
  :param file_name: name of the file to save the plot
  :return: None
  """
  # train_df = pd.read_csv(path.join(TRAIN_DIR, train_csv))
  # test_df = pd.read_csv(path.join(TEST_DIR, test_csv))
  train_df = pd.read_csv(train_csv_path)
  test_df = pd.read_csv(test_csv_path)

  train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
  test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
  train_df.set_index('timestamp', inplace=True)
  test_df.set_index('timestamp', inplace=True)

  # # 頻度追加
  # train_df = train_df.resample('20S')
  # test_df = test_df.resample('20S')

  
  feature_scaler = MinMaxScaler()
  target_scaler = MinMaxScaler()
  train_df[feature_columns] = feature_scaler.fit_transform(train_df[feature_columns])
  test_df[feature_columns] = feature_scaler.transform(test_df[feature_columns])
  train_df[target] = target_scaler.fit_transform(train_df[[target]])
  test_df[target] = target_scaler.transform(test_df[[target]])
  
  model = ARIMA(train_df[target], order=(5,1,0))
  model_fit = model.fit()
  predictions = model_fit.forecast(steps=len(test_df))
  rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
  #print('Test RMSE: %.3f' % rmse)
  # plt.title(f'{target} over Time')
  # plt.xlabel('Timestamp')
  # plt.ylabel(target)
  # plt.plot(test_df.index, test_df[target], label='Actual')
  # plt.plot(test_df.index, predictions, label='Predicted', color='red')
  # plt.legend()
  # if file_name != '':
  #   plt.savefig(path.join('./figure', file_name))
  #   plt.show()
  return rmse

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima.model import ARIMA
from os import path


def calculate_average_rmse(train_paths, test_paths, features, target, file_name=''):
  rmses = []

  for train_csv, test_csv in zip(train_paths, test_paths):
    train_df = pd.read_csv(train_csv)
    test_df = pd.read_csv(test_csv)

    train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
    test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
    train_df.set_index('timestamp', inplace=True)
    test_df.set_index('timestamp', inplace=True)

    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()

    train_df[features] = feature_scaler.fit_transform(train_df[features])
    test_df[features] = feature_scaler.transform(test_df[features])
    train_df[target] = target_scaler.fit_transform(train_df[[target]])
    test_df[target] = target_scaler.transform(test_df[[target]])

    model = ARIMA(train_df[target], order=(5, 1, 0))
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=len(test_df))
    rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
    rmses.append(rmse)

    # if file_name != '':
    #   plt.title(f'{target} over Time')
    #   plt.xlabel('Timestamp')
    #   plt.ylabel(target)
    #   plt.plot(test_df.index, test_df[target], label='Actual')
    #   plt.plot(test_df.index, predictions, label='Predicted', color='red')
    #   plt.legend()
    #   plt.savefig(path.join('./figure', f'{file_name}_{train_csv.split("/")[-1]}'))
    #   plt.clf()  # Clear the current figure

  average_rmse = np.mean(rmses)
  return average_rmse

In [21]:

""" 
target => 予測対象のカラム
"""
target = 'oss'
target = 'Sleepiness'

csvs = os.listdir(TRAIN_DIR)
for i,csv in enumerate(csvs):
  if csv.endswith('y_train.csv'):
    #人ごとセクション
    train_path = path.join(TRAIN_DIR, csv)
    test_path = path.join(TEST_DIR, csv.replace('train', 'test'))
    list_rmse_with_feature_counts = []
    for ii in range(len(features)):
      #人+特徴量組み合わせ個数セクション
      feature_combinations = generate_combinations(features, ii + 1)
      results = []
      for feature_combination in feature_combinations:
        #人+特徴量組み合わせ個数+特徴量セクション
        rmse = solveOne(train_path,test_path,feature_combination,target,'')
        results.append(rmse)
      average = np.mean(results)
      list_rmse_with_feature_counts.append(average)
    
    column_names = ['変数の数','平均'] + list(map(lambda x: str(x), range(1, len(features) + 1)))
    row = ['ARIMA',str(np.mean(list_rmse_with_feature_counts))] + list(map(lambda rmse: '{:.3f}'.format(rmse), list_rmse_with_feature_counts))
    print(str(i))
    print(f"table:ARIMA_{target}-{str(i)}")
    print(' ' + '\t'.join(column_names))
    print(' ' + '\t'.join(row))
    print('------------------------')

# train_path_list = [path.join(TRAIN_DIR, csv) for csv in csvs if csv.endswith('y_train.csv')]
# test_path_list = [path.join(TEST_DIR, csv.replace('train', 'test')) for csv in csvs if csv.endswith('y_train.csv')]

# rmse = calculate_average_rmse(train_path_list,test_path_list, features, target)
# print(rmse)

0
table:ARIMA_Sleepiness-0
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.3076501898367987	0.308	0.308	0.308	0.308	0.308	0.308	0.308	0.308	0.308
------------------------
1
table:ARIMA_Sleepiness-1
 変数の数	平均	1	2	3	4	5	6	7	8	9
 ARIMA	0.0	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
------------------------
