In [16]:
import numpy as np
import pandas as pd
import json
import os
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

global counter
counter = 0

In [17]:
def load_and_preprocess_data(file_path):
  """CSVを読み込んで前処理する関数。"""
  df = pd.read_csv(file_path)
  df = df.drop(["timestamp"], axis=1)
  return df.dropna()


def get_data_from_directory(directory_path):
  """指定ディレクトリのCSV全部を読み込んで結合する関数。"""
  files = [f for f in os.listdir(directory_path) if f.endswith(".csv")]
  data_frames = [
      load_and_preprocess_data(os.path.join(directory_path, file)) for file in files
  ]
  return pd.concat(data_frames, ignore_index=True)


def evaluate_model(model, X, y):
  """モデルの評価。RMSE"""
  predictions = model.predict(X)
  return np.sqrt(mean_squared_error(y, predictions))


def printResults(rmse_results, title, if_print=True):
  mean_rmse = np.mean(rmse_results)
  median_rmse = np.median(rmse_results)
  variance_rmse = np.var(rmse_results)
  std_rmse = np.std(rmse_results)
  min_rmse = np.min(rmse_results)
  max_rmse = np.max(rmse_results)

  out = ""
  out += f"[{title}]RMSEの平均値: {mean_rmse}\n"
  # out += f"[{title}]RMSEの中央値: {median_rmse}\n"
  # out += f"[{title}]RMSEの分散: {variance_rmse}\n"
  # out += f"[{title}]RMSEの標準偏差: {std_rmse}\n"
  out += f"[{title}]RMSEの最小値: {min_rmse}\n"
  out += f"[{title}]RMSEの最大値: {max_rmse}\n"

  if if_print:
    print(out)
  return out

  # print(f"[{title}]RMSEの平均値: {mean_rmse}")
  # # print(f"[{title}]RMSEの中央値: {median_rmse}")
  # # print(f"[{title}]RMSEの分散: {variance_rmse}")
  # # print(f"[{title}]RMSEの標準偏差: {std_rmse}")
  # print(f"[{title}]RMSEの最小値: {min_rmse}")
  # print(f"[{title}]RMSEの最大値: {max_rmse}")

In [18]:
from sklearn.model_selection import KFold


def load_and_preprocess_data_save_timestamp(file_path):
  """CSVを読み込んで前処理する関数。"""
  df = pd.read_csv(file_path)
  return df.dropna()


def extract_image_name(timestamp):
  return (
      timestamp.replace("-", "").replace(" ", "_").replace(":", "").replace(".", "_")
      + ".jpg"
  )


def train_lightgbm_full(X, y):
  """Train LightGBM model on the full training data."""
  train_data = lgb.Dataset(X, label=y)
  params = {"objective": "regression", "metric": "rmse", "verbose": -1}
  model = lgb.train(params, train_data)
  return model


def evaluate_on_test(model, X_test, y_test):
  """Evaluate model on the test data."""
  predictions = model.predict(X_test)
  return np.sqrt(mean_squared_error(y_test, predictions))


def shuffle_dataframe(df):
  return df.sample(frac=1.0).reset_index(drop=True)


def evaluate_with_kfold(X, y, n_splits=5):
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
  rmses = []

  for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = train_lightgbm_full(X_train, y_train)
    rmse = evaluate_on_test(model, X_test, y_test)
    rmses.append(rmse)

  return rmses


def add_eye_area_to_df(df, eye_areas):
  eye_areas_sum = []

  for _, row in df.iterrows():
    image_name = extract_image_name(row["timestamp"])
    eye_area = eye_areas.get(image_name, {})
    left_eye_area = eye_area.get("left_eye_area")
    right_eye_area = eye_area.get("right_eye_area")
    if left_eye_area is not None and right_eye_area is not None:
      try:
        eye_areas_sum.append(float(left_eye_area) + float(right_eye_area))
      except ValueError:
        eye_areas_sum.append(np.nan)
    else:
      eye_areas_sum.append(np.nan)
  df["eye_area_sum"] = eye_areas_sum
  return df

In [19]:
def print_dropped_rows_column_values(df, column_name):
    rows_to_drop = df[df.isna().any(axis=1)]
    column_values = rows_to_drop[column_name]
    # print(f"Values in '{column_name}' column of the rows that would be dropped:")
    # print(column_values)
    print(f"Rows to drop: {len(rows_to_drop)}")
    print(f"image_file_names")
    for i in column_values:
        print(extract_image_name(i))


def solve_one_subject(
    csv_filepath, json_filepath, target, descrive=False, data_name=""
):
    # 特徴量を定義
    global counter
    features = [
        "m_speed",
        "m_speed_var_480",
        "m_speed_stddev_480",
        "m_acceleration",
        "m_acceleration_var_480",
        "m_acceleration_stddev_480",
        "m_jerk",
        "m_jerk_var_480",
        "m_jerk_stddev_480",
        "oss",
        "perclos",
        "Sleepiness",
        "eye_area_sum",
    ]
    outStr = "train,test同時に正規化\n"

    data = load_and_preprocess_data_save_timestamp(csv_filepath)
    with open(json_filepath, "r") as f:
        eye_areas = json.load(f)
    data = add_eye_area_to_df(data, eye_areas)

    # 正規化
    scaler = MinMaxScaler()
    data[features] = scaler.fit_transform(data[features])

    # print_dropped_rows_column_values(data, "timestamp")
    data = data.dropna()
    data = shuffle_dataframe(data)
    counter += len(data)

    # X = data[list(filter(lambda x: x != target, features))]
    X = data[features]
    Y_target = data[target]
    if descrive:
        print(data.describe())

    target_kfold_rmses_norm = evaluate_with_kfold(X, Y_target)
    outStr += printResults(
        target_kfold_rmses_norm, f"lgbm-{target}-normalized-kfold-single-sbj", False
    )
    return {"out": outStr, "t": target_kfold_rmses_norm}

In [20]:
ids = [123, 456, 789]  # 例えば、これが出力するIDのリスト
output = ",".join(str(id) for id in ids)
print(output)

123,456,789


In [21]:
DATA_DIR = "dms_data_single"
EYE_DATA_DIR = "eye_data_json_dir"

file_names = [
    "20201126_1546_0_y",
    "20201127_1432_7_y",
    "20201127_1548_2_y",
    "20201127_1701_7_y",
    "20201127_1840_5_y",
    "20201130_1122_5_y",
    "20201130_1808_6_y",
    "20201201_1230_0_y",
    "20201201_1429_5_y",
    "20201201_1555_0_y",
    "20201203_1022_7_y",
    "20201203_1244_5_y",
    "20201203_1404_6_y",
    "20201210_1112_2_y",
    "20201210_1354_2_y",
    "20201210_1610_6_y",
]

VERBOSE = False

In [22]:
target = "Sleepiness"

target_rmses = []
for data_name in file_names:
  if VERBOSE:
    print(data_name)
  csv_filename = f"{data_name}.csv"
  json_filename = f"{data_name}_eye_areas.json"
  csv_filepath = os.path.join(DATA_DIR, csv_filename)
  json_filepath = os.path.join(EYE_DATA_DIR, json_filename)
  result = solve_one_subject(csv_filepath, json_filepath, target, data_name=data_name)
  if VERBOSE:
    print(result["out"])
  target_rmses.append(result["t"])

print(printResults(target_rmses, f"lgbm-target-normalized-kfold-all-sbj"))

print(counter)

[lgbm-target-normalized-kfold-all-sbj]RMSEの平均値: 0.059309092606576144
[lgbm-target-normalized-kfold-all-sbj]RMSEの最小値: 0.001954400132893415
[lgbm-target-normalized-kfold-all-sbj]RMSEの最大値: 0.20828299473518475

[lgbm-target-normalized-kfold-all-sbj]RMSEの平均値: 0.059309092606576144
[lgbm-target-normalized-kfold-all-sbj]RMSEの最小値: 0.001954400132893415
[lgbm-target-normalized-kfold-all-sbj]RMSEの最大値: 0.20828299473518475

2152


|target|perclos|eye_area_sum|
|---|---|---|
|oss|0.05967|0.06001|
|Sleepiness|0.05814|0.06000|