#### 特徴量について検証

In [10]:
data_dir = 'dms_data'
import os
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LinearRegression

In [11]:


def train_and_evaluate(model_type, X_train, y_train, X_test, y_test):
    if model_type == "lgbm":
        train_data = lgb.Dataset(X_train, label=y_train)
        model = lgb.train({'verbose':-1}, train_data)
    elif model_type == "nn":
        model = Sequential()
        model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer=Adam())
        model.fit(X_train, y_train, epochs=10, verbose=0)
    elif model_type == "linear_regression":
        model = LinearRegression().fit(X_train, y_train)
    elif model_type == "svr":
        model = SVR().fit(X_train, y_train)
    elif model_type == "random_forest":
        model = RandomForestRegressor().fit(X_train, y_train)
    
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    
    return model, rmse


In [12]:
def find_best_combination(model_type, combinations, train, test):
    rmses = []
    for feature_comb in combinations:
        _, rmse = train_and_evaluate(model_type, train[feature_comb], train['oss'], test[feature_comb], test['oss'])
        rmses.append(rmse)

    best_comb = combinations[np.argmin(rmses)]
    print(rmses)
    return best_comb


In [13]:
def read_and_process(directory):
  files = os.listdir(directory)
  df_list = []
  for file in files:
    if file.endswith('.csv'):
      df = pd.read_csv(os.path.join(directory, file))
      df = df.drop(['timestamp'], axis=1)
      df = df.dropna()
      df_list.append(df)
  df_combined = pd.concat(df_list, ignore_index=True)
  
  return df_combined

In [14]:
def show_result():
    # Define directory paths
  train_data_path = data_dir + '/train'
  test_data_path = data_dir + '/test'

  # Read and process data
  train = read_and_process(train_data_path)
  test = read_and_process(test_data_path)

  # Define features
  features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
              'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
              'm_jerk_var_480', 'm_jerk_stddev_480']

  # Generate all combinations of features
  feature_combinations = []
  for r in range(1, len(features) + 1):
    for subset in itertools.combinations(features, r):
      feature_combinations.append(list(subset))

  # Define models
  models = ["lgbm", "nn", "linear_regression", "svr", "random_forest"]

  # Find the best combination for each model
  for model in models:
    print('model: ' + model)
    best_comb = find_best_combination(model, [features], train, test)
    print(f"Best feature combination for {model}: {best_comb}")

show_result()


model: lgbm
[0.7448632962609215]
Best feature combination for lgbm: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: nn
[0.7911743571740701]
Best feature combination for nn: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: linear_regression
[0.6548148950163442]
Best feature combination for linear_regression: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: svr
[0.694881610724204]
Best feature combination for svr: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: ra

[0.7114483681803272]
Best feature combination for random_forest: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']


In [15]:
def show_result():
    # Define directory paths
  train_data_path = data_dir + '/train'
  test_data_path = data_dir + '/test'

  # Read and process data
  train = read_and_process(train_data_path)
  test = read_and_process(test_data_path)

  # Define features
  features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
              'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
              'm_jerk_var_480', 'm_jerk_stddev_480']

  # Generate all combinations of features
  feature_combinations = []
  for r in range(1, len(features) + 1):
    for subset in itertools.combinations(features, r):
      feature_combinations.append(list(subset))

  # Define models
  models = ["lgbm", "nn", "linear_regression", "svr", "random_forest"]

  # Find the best combination for each model
  for model in models:
    print('model: ' + model)
    best_comb = find_best_combination(model, [features], train, test)
    print(f"Best feature combination for {model}: {best_comb}")


show_result()

model: lgbm
[0.7448632962609215]
Best feature combination for lgbm: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: nn
[0.6670130662064958]
Best feature combination for nn: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: linear_regression
[0.6548148950163442]
Best feature combination for linear_regression: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: svr
[0.694881610724204]
Best feature combination for svr: ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration', 'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk', 'm_jerk_var_480', 'm_jerk_stddev_480']
model: ra

In [16]:
data_dir = 'dms_data'
# Define directory paths
train_data_path = data_dir + '/train'
test_data_path = data_dir + '/test'

# Read and process data
train = read_and_process(train_data_path)
test = read_and_process(test_data_path)
features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
            'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
            'm_jerk_var_480', 'm_jerk_stddev_480']
X_train, y_train, X_test, y_test = train[features], train['oss'], test[features], test['oss']


In [17]:
def train_and_evaluate(model_type, X_train, y_train, X_test, y_test):
  if model_type == "lgbm":
      train_data = lgb.Dataset(X_train, label=y_train)
      model = lgb.train({'verbose':-1}, train_data)
  elif model_type == "nn":
      model = Sequential()
      model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
      model.add(Dense(32, activation='relu'))
      model.add(Dense(1))
      model.compile(loss='mean_squared_error', optimizer=Adam())
      model.fit(X_train, y_train, epochs=10, verbose=0)
  elif model_type == "linear_regression":
      model = LinearRegression().fit(X_train, y_train)
  elif model_type == "svr":
      model = SVR().fit(X_train, y_train)
  elif model_type == "random_forest":
      model = RandomForestRegressor().fit(X_train, y_train)
  
  preds = model.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, preds))
  
  return model, rmse

def find_best_combination(model_type, combinations, train, test):
  rmses = []
  for feature_comb in combinations:
      _, rmse = train_and_evaluate(model_type, train[feature_comb], train['oss'], test[feature_comb], test['oss'])
      rmses.append(rmse)

  min_rmse_index = np.argmin(rmses)
  best_comb = combinations[min_rmse_index]
  best_rmse = rmses[min_rmse_index]
  return best_comb, best_rmse

def read_and_process(directory):
  files = os.listdir(directory)
  df_list = []
  for file in files:
      if file.endswith('.csv'):
          df = pd.read_csv(os.path.join(directory, file))
          df = df.drop(['timestamp'], axis=1)
          df = df.dropna()
          df_list.append(df)
  df_combined = pd.concat(df_list, ignore_index=True)

  return df_combined

def with_RMSE():
  data_dir = "./dms_data/"
  # Define directory paths
  train_data_path = data_dir + '/train'
  test_data_path = data_dir + '/test'

  # Read and process data
  train = read_and_process(train_data_path)
  test = read_and_process(test_data_path)

  # Define features
  features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
              'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
              'm_jerk_var_480', 'm_jerk_stddev_480']

  # Generate all combinations of features
  feature_combinations = []
  for r in range(1, len(features) + 1):
      for subset in itertools.combinations(features, r):
          feature_combinations.append(list(subset))

  # Define models
  models = ["lgbm", "nn", "linear_regression", "svr", "random_forest"]

  # Find the best combination for each model
  for model in models:
      print('model: ' + model)
      best_comb, best_rmse = find_best_combination(model, feature_combinations, train, test)
      print(f"Best feature combination for {model}: {best_comb}")
      print(f"Best RMSE for {model}: {best_rmse}")
with_RMSE()

model: lgbm


KeyboardInterrupt: 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import itertools
import numpy as np
import pandas as pd
import os
data_dir = './dms_data/'


def train_and_evaluate(model_type, X_train, y_train, X_test, y_test):
  if model_type == "lgbm":
    train_data = lgb.Dataset(X_train, label=y_train)
    model = lgb.train({'verbose': -1}, train_data)
  elif model_type == "nn":
    model = Sequential()
    model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=Adam())
    model.fit(X_train, y_train, epochs=10, verbose=0)
  elif model_type == "linear_regression":
    model = LinearRegression().fit(X_train, y_train)
  elif model_type == "svr":
    model = SVR().fit(X_train, y_train)
  elif model_type == "random_forest":
    model = RandomForestRegressor().fit(X_train, y_train)

  preds = model.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, preds))

  return model, rmse


def find_best_combination(model_type, combinations, train, test):
  rmses = []
  for feature_comb in combinations:
    _, rmse = train_and_evaluate(model_type, train[feature_comb], train['oss'], test[feature_comb], test['oss'])
    rmses.append(rmse)

  min_rmse_index = np.argmin(rmses)
  best_comb = combinations[min_rmse_index]
  best_rmse = rmses[min_rmse_index]
  return best_comb, best_rmse


def read_and_process(directory, features, scaler=None, fit_scaler=False):
  files = os.listdir(directory)
  df_list = []
  for file in files:
    if file.endswith('.csv'):
      df = pd.read_csv(os.path.join(directory, file))
      df = df.drop(['timestamp'], axis=1)
      df = df.dropna()

      if fit_scaler:
        scaler.fit(df[features])
      if scaler is not None:
        df[features] = scaler.transform(df[features])
      df_list.append(df)
  df_combined = pd.concat(df_list, ignore_index=True)

  return df_combined, scaler


# Define features
features = ['m_speed', 'm_speed_var_480', 'm_speed_stddev_480', 'm_acceleration',
            'm_acceleration_var_480', 'm_acceleration_stddev_480', 'm_jerk',
            'm_jerk_var_480', 'm_jerk_stddev_480']

# Define directory paths
train_data_path = data_dir + '/train'
test_data_path = data_dir + '/test'

# Read and process data
scaler = StandardScaler()
train, scaler = read_and_process(train_data_path, features, scaler, fit_scaler=True)
test, _ = read_and_process(test_data_path, features, scaler)



# Generate all combinations of features
feature_combinations = []
for r in range(1, len(features) + 1):
  for subset in itertools.combinations(features, r):
    feature_combinations.append(list(subset))

# Define models
models = ["lgbm", "nn", "linear_regression", "svr", "random_forest"]

# Find the best combination for each model
for model in models:
  print('model: ' + model)
  best_comb, best_rmse = find_best_combination(model, feature_combinations, train, test)
  print(f"Best feature combination for {model}: {best_comb}")
  print(f"Best RMSE for {model}: {best_rmse}")


ValueError: Found array with 0 sample(s) (shape=(0, 9)) while a minimum of 1 is required by StandardScaler.