In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.fbprophet import Prophet

DATA_DIR = './dms_data'
N_TRIALS = 50

features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
            'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
            'm_jerk_var_480', 'm_jerk_stddev_480']


2023-12-18 20:09:43.202899: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 20:09:43.245371: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 20:09:43.245872: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_and_preprocess_data(file_path):
  df = pd.read_csv(file_path)
  return df.drop(['timestamp'], axis=1).dropna()


def get_data_from_directory(directory_path):
  files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]
  dfs = [load_and_preprocess_data(os.path.join(directory_path, f)) for f in files]
  return pd.concat(dfs, ignore_index=True)


train_data = get_data_from_directory(os.path.join(DATA_DIR, 'train'))
test_data = get_data_from_directory(os.path.join(DATA_DIR, 'test'))

In [3]:
def evaluate_model(model, X, y):
  predictions = model.predict(X)
  return np.sqrt(mean_squared_error(y, predictions))

def print_results(rmse_results, title):
  stats = {'平均値': np.mean, '中央値': np.median, '分散': np.var,
           '標準偏差': np.std, '最小値': np.min, '最大値': np.max}
  for stat, func in stats.items():
    print(f"[{title}] RMSEの{stat}: {func(rmse_results)}")

def get_data_lengths(data_dir, datasets=['test', 'train']):
  data_lengths = []
  for dataset in datasets:
    path = os.path.join(data_dir, dataset)
    for file in os.listdir(path):
      if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(path, file))
        data_lengths.append((dataset, file, len(df)))
  return data_lengths


In [4]:
data_lengths = get_data_lengths(DATA_DIR)
min_length_data = min(data_lengths, key=lambda x: x[2])

excluded_subj = '20201127_1432_7'
filtered_data_lengths = [d for d in data_lengths if excluded_subj not in d[1]]
regressor = RandomForestRegressor()
rf = make_reduction(regressor, strategy="recursive", window_length=12, scitype="infer")
arima = AutoARIMA(sp=12, suppress_warnings=True)
prophet = Prophet()

models = {"RandomForest": rf, "ARIMA": arima, "Prophet": prophet}
# models = {"RandomForest": rf, "Prophet": prophet}

Importing plotly failed. Interactive plots will not work.


In [5]:
from sktime.forecasting.base import ForecastingHorizon

def train_and_evaluate(models, data, split_ratio=0.8):
  results, predictions, actual_values = {}, {}, {}
  for model_name, model in models.items():
    all_errors, all_preds, all_actual = [], [], []
    for (dataset_type, csv_file), df in data.items():
      split_point = int(len(df) * split_ratio)
      y_train, y_test = df.iloc[:split_point], df.iloc[split_point:]
      if y_train.isna().any() or y_test.isna().any():
        continue

      fh = ForecastingHorizon(y_test.index, is_relative=False)
      model.fit(y_train)
      y_pred = model.predict(fh=fh)

      if np.isnan(y_pred).any():
        continue
      all_preds.extend(y_pred)
      all_actual.extend(y_test)
      all_errors.append(mean_squared_error(y_test, y_pred))
    results[model_name] = np.mean(all_errors)
    predictions[model_name] = all_preds
    actual_values[model_name] = all_actual
  return results, predictions, actual_values



data = {}
for dataset_type in ['test', 'train']:
  path = os.path.join(DATA_DIR, dataset_type)
  for file in os.listdir(path):
    if file.endswith(".csv") and excluded_subj not in file:
      df = pd.read_csv(os.path.join(path, file))
      df['timestamp'] = pd.to_datetime(df['timestamp'])
      df.set_index('timestamp', inplace=True)
      df = df.asfreq('20MS')
      df = df.dropna()
      data[(dataset_type, file)] = df

# モデルのトレーニングと評価
results, predictions, actual_values = train_and_evaluate(models, data)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().