In [1]:
# ================================================
# 0) Imports
# ================================================
import os, re, random
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

In [2]:
# ================================================
# 1) 데이터 로드 & 기본 전처리
# ================================================
cols = ["subject", "label", "timestamp", "x", "y", "z"]
df = pd.read_csv(
    "/content/drive/MyDrive/data/WISDM_ar_v1.1_raw.txt",
    header=None, names=cols, on_bad_lines="skip"
).dropna()

df["z"] = df["z"].astype(str).str.replace(";", "", regex=False).astype(float)
df["x"] = df["x"].astype(float)
df["y"] = df["y"].astype(float)

df["subject"] = pd.to_numeric(df["subject"], errors="coerce").astype("Int64")
df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")
for c in ["x","y","z"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.dropna().sort_values(["subject","label","timestamp"]).reset_index(drop=True)

print("샘플:\n", df.head(), "\n")
print("고유 subject:", df["subject"].nunique())
print("라벨 분포:", Counter(df["label"]))

샘플:
    subject       label      timestamp     x      y     z
0        1  Downstairs  6552942304000 -0.15   9.15 -0.34
1        1  Downstairs  6552992292000  0.11   9.19  2.76
2        1  Downstairs  6553042310000 -4.06   7.40  4.02
3        1  Downstairs  6553092298000 -2.87   7.93  3.21
4        1  Downstairs  6553142347000 -0.19  10.04  4.82 

고유 subject: 36
라벨 분포: Counter({'Walking': 418393, 'Jogging': 336445, 'Upstairs': 122869, 'Downstairs': 100425, 'Sitting': 59939, 'Standing': 48394})


In [3]:
# ================================================
# 2) get_frames (문자 라벨 대응: np.unique로 최빈값)
# ================================================
Fs = 20
frame_size = 200           # 200 timestep (10초 분량)
hop_size   = frame_size//2 # 50% overlap = 100

def get_frames(df, frame_size, hop_size):
    N_FEATURES = 3
    frames, labels = [], []
    xv = df["x"].to_numpy(); yv = df["y"].to_numpy(); zv = df["z"].to_numpy()
    lv = df["label"].to_numpy()
    for i in range(0, len(df) - frame_size, hop_size):
        x = xv[i:i+frame_size]; y = yv[i:i+frame_size]; z = zv[i:i+frame_size]
        seg = lv[i:i+frame_size]
        vals, counts = np.unique(seg, return_counts=True)
        label = vals[np.argmax(counts)]
        frames.append([x,y,z])
        labels.append(label)
    frames = np.asarray(frames).reshape(-1, frame_size, N_FEATURES)
    labels = np.asarray(labels)
    return frames, labels


In [4]:
# ================================================
# 2-1) Subject-wise wrapper
# ================================================
def run_subjectwise(get_frames_func, df, group_col, frame_size, hop_size):
    X_list, y_list = [], []
    for _, g in df.groupby(group_col, sort=False):
        # 이미 위에서 timestamp 정렬했지만 안전하게 한 번 더 보장하려면 주석 해제
        # g = g.sort_values("timestamp")
        Xg, yg = get_frames_func(g, frame_size, hop_size)
        if len(yg) == 0:
            continue
        X_list.append(Xg)
        y_list.append(yg)
    if not X_list:
        return np.empty((0, frame_size, N_FEATURES)), np.array([])
    return np.vstack(X_list), np.concatenate(y_list)

In [5]:
# ================================================
# 3) Subject-wise Train/Test Split
#    예: subject <= 30 → train, >30 → test
# ================================================
df_train = df[df["subject"] <= 30].copy()
df_test  = df[df["subject"] >  30].copy()

X_train_raw, y_train_raw = run_subjectwise(get_frames, df_train, "subject", frame_size, hop_size)
X_test_raw,  y_test_raw  = run_subjectwise(get_frames, df_test,  "subject", frame_size, hop_size)

print("X_train_raw:", X_train_raw.shape, " / X_test_raw:", X_test_raw.shape)
print("Train 라벨 분포:", Counter(y_train_raw))
print("Test  라벨 분포:", Counter(y_test_raw))

X_train_raw: (8821, 200, 3)  / X_test_raw: (1989, 200, 3)
Train 라벨 분포: Counter({np.str_('Walking'): 3446, np.str_('Jogging'): 2696, np.str_('Upstairs'): 1027, np.str_('Downstairs'): 812, np.str_('Sitting'): 457, np.str_('Standing'): 383})
Test  라벨 분포: Counter({np.str_('Walking'): 702, np.str_('Jogging'): 667, np.str_('Upstairs'): 200, np.str_('Downstairs'): 175, np.str_('Sitting'): 143, np.str_('Standing'): 102})


In [6]:
# ================================================
# 3-1) magnitude 채널 추가 (각 세트별로)
# ================================================
def add_magnitude_channel(X):
    # X: (N,T,3) -> cat ||v||: (N,T,1) -> (N,T,4)
    mag = np.linalg.norm(X, axis=2, keepdims=True)
    return np.concatenate([X, mag], axis=2)

X_train_raw = add_magnitude_channel(X_train_raw)   # (N,T,4)
X_test_raw  = add_magnitude_channel(X_test_raw)    # (N,T,4)

In [7]:
# ================================================
# 3-2) 라벨 인코딩 (정수)
# ================================================
le = LabelEncoder()
le.fit(df["label"])
y_train = le.transform(y_train_raw)
y_test  = le.transform(y_test_raw)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

Classes: ['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs', 'Walking']


In [8]:
# ================================================
# 4) 표준화 (Train 기준으로 fit → Train/Test transform)
#     채널별 정규화: (N*T, C)
# ================================================
scaler = StandardScaler()
X_train_2d = X_train_raw.reshape(-1, X_train_raw.shape[-1])  # (N*T, 4)
X_test_2d  = X_test_raw.reshape(-1,  X_test_raw.shape[-1])

scaler.fit(X_train_2d)
X_train = scaler.transform(X_train_2d).reshape(X_train_raw.shape)
X_test  = scaler.transform(X_test_2d ).reshape(X_test_raw.shape)

print("Shapes ->",
      "X_train:", X_train.shape,
      "X_test:", X_test.shape)


Shapes -> X_train: (8821, 200, 4) X_test: (1989, 200, 4)


In [9]:
# ================================================
# 5) CNN + BiLSTM 모델 정의
# ================================================
def build_cnn_lstm(input_shape, num_classes):
    inp = layers.Input(shape=input_shape)   # (T, 4)

    # CNN block
    x = layers.Conv1D(64, kernel_size=5, activation="relu", padding="same")(inp)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Conv1D(128, kernel_size=5, activation="relu", padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Dropout(0.3)(x)

    # BiLSTM block
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    x = layers.Dropout(0.3)(x)

    # Dense block
    x = layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(num_classes, activation="softmax")(x)

    model = models.Model(inp, out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",  # 정수 라벨
        metrics=["accuracy"]
    )
    return model

model = build_cnn_lstm((frame_size, 4), num_classes)
model.summary()

In [10]:
# ================================================
# 6) class_weight (Train 분포로 계산)
# ================================================
cw = compute_class_weight(class_weight="balanced",
                          classes=np.unique(y_train),
                          y=y_train)
class_weight = {i: w for i, w in enumerate(cw)}
print("class_weight:", class_weight)

class_weight: {0: np.float64(1.8105500821018063), 1: np.float64(0.5453140454995055), 2: np.float64(3.2169948942377826), 3: np.float64(3.838555265448216), 4: np.float64(1.4315157416423239), 5: np.float64(0.4266299090733217)}


In [11]:
# ================================================
# 7) 학습
#   주의: validation_split=0.2는 같은 subject의 윈도우가
#   train/val에 섞일 수 있음(빠른 확인용).
#   더 엄밀히 하려면 train subject 중 일부를 검증 전용 subject로 분리하세요.
# ================================================
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=60,
    batch_size=128,
    class_weight=class_weight,
    verbose=1
)


Epoch 1/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - accuracy: 0.5583 - loss: 1.0851 - val_accuracy: 0.4113 - val_loss: 1.1479
Epoch 2/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7947 - loss: 0.5171 - val_accuracy: 0.3003 - val_loss: 2.1735
Epoch 3/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8963 - loss: 0.2965 - val_accuracy: 0.2686 - val_loss: 2.2780
Epoch 4/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9280 - loss: 0.2349 - val_accuracy: 0.3892 - val_loss: 2.0087
Epoch 5/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9496 - loss: 0.1775 - val_accuracy: 0.4125 - val_loss: 1.9518
Epoch 6/60
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.9559 - loss: 0.1506 - val_accuracy: 0.5224 - val_loss: 1.9055
Epoch 7/60
[1m56/56[0m [32m━━━

In [12]:
# ================================================
# 8) 평가
# ================================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"[Subject-wise Test] loss={test_loss:.4f}  acc={test_acc:.4f}")

y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)

print("\nClassification Report (Subject-wise)")
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=true, cols=pred):\n", cm)

[Subject-wise Test] loss=0.4500  acc=0.9140

Classification Report (Subject-wise)
              precision    recall  f1-score   support

  Downstairs       0.67      0.92      0.78       175
     Jogging       1.00      0.96      0.98       667
     Sitting       0.75      0.99      0.86       143
    Standing       0.98      0.57      0.72       102
    Upstairs       0.91      0.88      0.89       200
     Walking       0.96      0.91      0.94       702

    accuracy                           0.91      1989
   macro avg       0.88      0.87      0.86      1989
weighted avg       0.93      0.91      0.92      1989


Confusion Matrix (rows=true, cols=pred):
 [[161   1   0   0   8   5]
 [  5 642   2   0   0  18]
 [  0   0 142   1   0   0]
 [  0   0  44  58   0   0]
 [ 21   0   1   0 175   3]
 [ 53   0   0   0   9 640]]
