In [51]:
# ================================================
# 0) Imports
# ================================================
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.utils import to_categorical

In [52]:
# ================================================
# 1) 데이터 로드 & 기본 전처리
# ================================================
cols = ["subject", "label", "timestamp", "x", "y", "z"]
df = pd.read_csv(
    "/content/drive/MyDrive/data/WISDM_ar_v1.1_raw.txt",
    header=None, names=cols, on_bad_lines="skip"
).dropna()

# z 끝 세미콜론 제거 + float 변환
df["z"] = df["z"].astype(str).str.replace(";", "", regex=False).astype(float)
df["x"] = df["x"].astype(float)
df["y"] = df["y"].astype(float)

print("샘플:\n", df.head(), "\n")
print("라벨 분포:", Counter(df["label"]))


샘플:
    subject    label       timestamp         x          y         z
0       33  Jogging  49105962326000 -0.694638  12.680544  0.503953
1       33  Jogging  49106062271000  5.012288  11.264028  0.953424
2       33  Jogging  49106112167000  4.903325  10.882658 -0.081722
3       33  Jogging  49106222305000 -0.612916  18.496431  3.023717
4       33  Jogging  49106332290000 -1.184970  12.108489  7.205164 

라벨 분포: Counter({'Walking': 418393, 'Jogging': 336445, 'Upstairs': 122869, 'Downstairs': 100425, 'Sitting': 59939, 'Standing': 48394})


In [53]:
# ================================================
# 2) Sliding Window
#    최빈 라벨로 프레임 라벨 부여
# ================================================
Fs = 20
frame_size = Fs*4 # 80
hop_size = Fs*2 # 40 -> 50% 오버래핑

N_FEATURES = 3

def get_frames(df, frame_size, hop_size):
    frames, labels = [], []
    lab = df["label"].to_numpy()
    xv = df["x"].to_numpy(); yv = df["y"].to_numpy(); zv = df["z"].to_numpy()

    for i in range(0, len(df) - frame_size, hop_size):
        x = xv[i:i+frame_size]
        y = yv[i:i+frame_size]
        z = zv[i:i+frame_size]

        seg = lab[i:i+frame_size]
        vals, counts = np.unique(seg, return_counts=True)
        label = vals[np.argmax(counts)]  # 최빈값

        frames.append([x, y, z])
        labels.append(label)

    frames = np.asarray(frames).reshape(-1, frame_size, N_FEATURES)
    labels = np.asarray(labels)
    return frames, labels

X, y_raw = get_frames(df, frame_size, hop_size)
print("X shape:", X.shape)
print("프레임 기준 라벨 분포:", Counter(y_raw))

X shape: (27160, 80, 3)
프레임 기준 라벨 분포: Counter({np.str_('Walking'): 10462, np.str_('Jogging'): 8412, np.str_('Upstairs'): 3068, np.str_('Downstairs'): 2513, np.str_('Sitting'): 1494, np.str_('Standing'): 1211})


In [54]:
# ================================================
# 2-1) Subject-wise wrapper (사용자 경계 유지)
# ================================================
def run_subjectwise(get_frames_func, df, group_col, frame_size, hop_size):
    X_list, y_list = [], []
    for _, g in df.groupby(group_col, sort=False):
        # 시계열 안정화를 위해 시간 정렬 권장
        if "timestamp" in g.columns:
            g = g.sort_values("timestamp")
        Xg, yg = get_frames_func(g, frame_size, hop_size)
        if len(yg) == 0:
            continue
        X_list.append(Xg)
        y_list.append(yg)
    if not X_list:
        return np.empty((0, frame_size, N_FEATURES)), np.array([])
    return np.vstack(X_list), np.concatenate(y_list)

In [55]:
# ================================================
# 3) Subject-wise Train/Test Split
#    예: subject <= 30 → train, >30 → test
# ================================================
df_train = df[df["subject"] <= 30].copy()
df_test  = df[df["subject"] >  30].copy()

X_train, y_train_raw = run_subjectwise(get_frames, df_train, "subject", frame_size, hop_size)
X_test,  y_test_raw  = run_subjectwise(get_frames, df_test,  "subject", frame_size, hop_size)

print("X_train:", X_train.shape, " / X_test:", X_test.shape)
print("Train 라벨 분포:", Counter(y_train_raw))
print("Test  라벨 분포:", Counter(y_test_raw))

X_train: (22121, 80, 3)  / X_test: (4987, 80, 3)
Train 라벨 분포: Counter({np.str_('Walking'): 8665, np.str_('Jogging'): 6749, np.str_('Upstairs'): 2555, np.str_('Downstairs'): 2064, np.str_('Sitting'): 1145, np.str_('Standing'): 943})
Test  라벨 분포: Counter({np.str_('Walking'): 1770, np.str_('Jogging'): 1669, np.str_('Upstairs'): 498, np.str_('Downstairs'): 444, np.str_('Sitting'): 354, np.str_('Standing'): 252})


In [56]:
# ================================================
# 4) 라벨 인코딩 (원-핫)
#    클래스 일관성을 위해 전체 df로 fit
# ================================================
le = LabelEncoder()
le.fit(df["label"])
y_train_int = le.transform(y_train_raw)
y_test_int  = le.transform(y_test_raw)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

y_train = to_categorical(y_train_int, num_classes=num_classes)
y_test  = to_categorical(y_test_int,  num_classes=num_classes)

Classes: ['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs', 'Walking']


In [57]:
# ================================================
# 5) 표준화 (채널별, Train으로 fit → Train/Test transform)
# ================================================
scaler = StandardScaler()
X_train_2d = X_train.reshape(-1, X_train.shape[-1])  # (윈도우*타임, 3)
X_test_2d  = X_test.reshape(-1,  X_test.shape[-1])

scaler.fit(X_train_2d)
X_train = scaler.transform(X_train_2d).reshape(X_train.shape)
X_test  = scaler.transform(X_test_2d ).reshape(X_test.shape)

print("Shapes ->",
      "X_train:", X_train.shape,
      "X_test:", X_test.shape,
      "y_train:", y_train.shape,
      "y_test:", y_test.shape)


Shapes -> X_train: (22121, 80, 3) X_test: (4987, 80, 3) y_train: (22121, 6) y_test: (4987, 6)


In [58]:
# ================================================
# 6) LSTM 모델 (Sequential)
#    LSTM(128)->Dropout(0.5)->LSTM(64)->Dropout(0.5)
#    Dense(128)->Dropout(0.5)->Dense(num_classes, softmax)
# ================================================
def build_lstm(input_shape, num_classes):
    model = Sequential()
    model.add(LSTM(128, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="categorical_crossentropy",  # one-hot 라벨
        metrics=["accuracy"]
    )
    return model

model = build_lstm((frame_size, N_FEATURES), num_classes)
model.summary()

  super().__init__(**kwargs)


In [59]:
# ================================================
# 7) 학습
#    주의: validation_split=0.2는 같은 subject의 윈도우가
#    train/val에 섞일 수 있음(빠른 검증용).
#    더 엄밀하게 하려면 train subject 중 일부를 '검증 전용 subject'로 분리하세요.
# ================================================
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=128,   # LSTM은 연산량↑ → 128 권장
    verbose=1
)

Epoch 1/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.4737 - loss: 1.3625 - val_accuracy: 0.7772 - val_loss: 0.7667
Epoch 2/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.7062 - loss: 0.8452 - val_accuracy: 0.8104 - val_loss: 0.6767
Epoch 3/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7569 - loss: 0.6876 - val_accuracy: 0.7417 - val_loss: 0.8071
Epoch 4/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7618 - loss: 0.6550 - val_accuracy: 0.7275 - val_loss: 0.8222
Epoch 5/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.7858 - loss: 0.5881 - val_accuracy: 0.7912 - val_loss: 0.7401
Epoch 6/30
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8007 - loss: 0.5456 - val_accuracy: 0.7708 - val_loss: 0.7624
Epoch 7/30
[1m139/139

In [60]:

# ================================================
# 8) 평가
# ================================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"[Subject-wise Test] loss={test_loss:.4f}  acc={test_acc:.4f}")

y_pred_int = np.argmax(model.predict(X_test, verbose=0), axis=1)

print("\nClassification Report (Subject-wise)")
print(classification_report(y_test_int, y_pred_int, target_names=list(le.classes_)))

cm = confusion_matrix(y_test_int, y_pred_int)
print("\nConfusion Matrix (rows=true, cols=pred):\n", cm)

[Subject-wise Test] loss=0.4516  acc=0.8821

Classification Report (Subject-wise)
              precision    recall  f1-score   support

  Downstairs       0.58      0.69      0.63       444
     Jogging       1.00      0.88      0.94      1669
     Sitting       0.90      0.99      0.95       354
    Standing       0.96      0.86      0.91       252
    Upstairs       0.67      0.82      0.73       498
     Walking       0.93      0.93      0.93      1770

    accuracy                           0.88      4987
   macro avg       0.84      0.86      0.85      4987
weighted avg       0.90      0.88      0.89      4987


Confusion Matrix (rows=true, cols=pred):
 [[ 307    3    3    5   88   38]
 [  92 1472    0    0   57   48]
 [   0    0  352    2    0    0]
 [   0    0   34  216    2    0]
 [  60    0    0    2  406   30]
 [  68    1    0    0   55 1646]]
