In [31]:
# ================================================
# 0) Imports
# ================================================
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization, Dropout, Flatten, Dense
from tensorflow.keras.utils import to_categorical

In [32]:
# ================================================
# 1) 데이터 로드 & 기본 전처리
# ================================================
cols = ["subject", "label", "timestamp", "x", "y", "z"]

df = pd.read_csv(
    "/content/drive/MyDrive/data/WISDM_ar_v1.1_raw.txt",
    header=None,
    names=cols,
    on_bad_lines="skip"
).dropna()

df["z"] = df["z"].astype(str).str.replace(";", "", regex=False).astype(float)
df["x"] = df["x"].astype(float)
df["y"] = df["y"].astype(float)

print("샘플:\n", df.head(), "\n")
print("라벨 분포:", Counter(df["label"]))

샘플:
    subject    label       timestamp         x          y         z
0       33  Jogging  49105962326000 -0.694638  12.680544  0.503953
1       33  Jogging  49106062271000  5.012288  11.264028  0.953424
2       33  Jogging  49106112167000  4.903325  10.882658 -0.081722
3       33  Jogging  49106222305000 -0.612916  18.496431  3.023717
4       33  Jogging  49106332290000 -1.184970  12.108489  7.205164 

라벨 분포: Counter({'Walking': 418393, 'Jogging': 336445, 'Upstairs': 122869, 'Downstairs': 100425, 'Sitting': 59939, 'Standing': 48394})


In [33]:
df

Unnamed: 0,subject,label,timestamp,x,y,z
0,33,Jogging,49105962326000,-0.694638,12.680544,0.503953
1,33,Jogging,49106062271000,5.012288,11.264028,0.953424
2,33,Jogging,49106112167000,4.903325,10.882658,-0.081722
3,33,Jogging,49106222305000,-0.612916,18.496431,3.023717
4,33,Jogging,49106332290000,-1.184970,12.108489,7.205164
...,...,...,...,...,...,...
1086461,19,Sitting,131623331483000,9.000000,-1.570000,1.690000
1086462,19,Sitting,131623371431000,9.040000,-1.460000,1.730000
1086463,19,Sitting,131623411592000,9.080000,-1.380000,1.690000
1086464,19,Sitting,131623491487000,9.000000,-1.460000,1.730000


In [34]:
# ================================================
# 2) Sliding Window
# ================================================
Fs = 20
frame_size = Fs*4 # 80
hop_size = Fs*2 # 40 -> 50% 오버래핑

N_FEATURES = 3

def get_frames(df, frame_size, hop_size):
    frames, labels = [], []
    lab = df['label'].to_numpy()
    xv = df['x'].to_numpy(); yv = df['y'].to_numpy(); zv = df['z'].to_numpy()

    for i in range(0, len(df) - frame_size, hop_size):
        x = xv[i: i + frame_size]
        y = yv[i: i + frame_size]
        z = zv[i: i + frame_size]

        seg = lab[i: i + frame_size]
        vals, counts = np.unique(seg, return_counts=True)
        label = vals[np.argmax(counts)]  # 최빈값

        frames.append([x, y, z])
        labels.append(label)

    frames = np.asarray(frames).reshape(-1, frame_size, N_FEATURES)
    labels = np.asarray(labels)
    return frames, labels

X, y_raw = get_frames(df, frame_size, hop_size)
print("X shape:", X.shape)
print("라벨 분포(프레임 기준):", Counter(y_raw))

X shape: (27160, 80, 3)
라벨 분포(프레임 기준): Counter({np.str_('Walking'): 10462, np.str_('Jogging'): 8412, np.str_('Upstairs'): 3068, np.str_('Downstairs'): 2513, np.str_('Sitting'): 1494, np.str_('Standing'): 1211})


In [35]:
# ================================================
# 2-1) Subject-wise wrapper (추가)
# ================================================
def run_subjectwise(get_frames_func, df, group_col, frame_size, hop_size):
    X_list, y_list = [], []
    for _, g in df.groupby(group_col, sort=False):
        # subject 내부를 시간 순으로 정렬(권장)
        g = g.sort_values("timestamp")  # FIX: 안정적 시계열
        Xg, yg = get_frames_func(g, frame_size, hop_size)
        if len(yg) == 0:
            continue
        X_list.append(Xg)
        y_list.append(yg)
    if not X_list:
        return np.empty((0, frame_size, N_FEATURES)), np.array([])
    return np.vstack(X_list), np.concatenate(y_list)

In [36]:
# ================================================
# 3) Subject-wise Train/Test Split
# ================================================
# FIX: 컬럼명 subject 사용
df_train = df[df["subject"] <= 30].copy()   # FIX
df_test  = df[df["subject"] >  30].copy()   # FIX

X_train, y_train_raw = run_subjectwise(get_frames, df_train, "subject", frame_size, hop_size)  # FIX
X_test,  y_test_raw  = run_subjectwise(get_frames, df_test,  "subject", frame_size, hop_size)  # FIX

print("X_train:", X_train.shape, " / X_test:", X_test.shape)
print("Train 라벨 분포:", Counter(y_train_raw))
print("Test  라벨 분포:", Counter(y_test_raw))

X_train: (22121, 80, 3)  / X_test: (4987, 80, 3)
Train 라벨 분포: Counter({np.str_('Walking'): 8665, np.str_('Jogging'): 6749, np.str_('Upstairs'): 2555, np.str_('Downstairs'): 2064, np.str_('Sitting'): 1145, np.str_('Standing'): 943})
Test  라벨 분포: Counter({np.str_('Walking'): 1770, np.str_('Jogging'): 1669, np.str_('Upstairs'): 498, np.str_('Downstairs'): 444, np.str_('Sitting'): 354, np.str_('Standing'): 252})


In [37]:
# ================================================
# 4) 라벨 인코딩 (전체 라벨 집합 기준)
# ================================================
le = LabelEncoder()
le.fit(df["label"])
y_train = le.transform(y_train_raw)
y_test  = le.transform(y_test_raw)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat  = to_categorical(y_test,  num_classes=num_classes)

Classes: ['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs', 'Walking']


In [38]:
# ================================================
# 5) 표준화 (train으로만 fit)
# ================================================
scaler = StandardScaler()
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d  = X_test.reshape(-1,  X_test.shape[-1])

scaler.fit(X_train_2d)
X_train = scaler.transform(X_train_2d).reshape(X_train.shape)
X_test  = scaler.transform(X_test_2d ).reshape(X_test.shape)

print("Shapes ->",
      "X_train:", X_train.shape,
      "X_test:", X_test.shape,
      "y_train:", y_train_cat.shape,
      "y_test:", y_test_cat.shape)

Shapes -> X_train: (22121, 80, 3) X_test: (4987, 80, 3) y_train: (22121, 6) y_test: (4987, 6)


In [39]:
# ================================================
# 6) CNN 모델 정의
# ================================================
def build_cnn(input_shape, num_classes):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=5, activation='relu', input_shape=input_shape))
    model.add(BatchNormalization()); model.add(Dropout(0.5)); model.add(MaxPooling1D(pool_size=2))

    model.add(Conv1D(128, kernel_size=5, activation='relu'))
    model.add(BatchNormalization()); model.add(Dropout(0.5)); model.add(MaxPooling1D(pool_size=2))

    model.add(Flatten())
    model.add(Dense(128, activation='relu')); model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

model = build_cnn((frame_size, N_FEATURES), num_classes)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [40]:
# ================================================
# 7) 학습
# ================================================
# one-hot 라벨 사용 (y_train_cat)
history = model.fit(
    X_train, y_train_cat,
    validation_split=0.2,
    epochs=30,
    batch_size=256,
    verbose=1
)

Epoch 1/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 64ms/step - accuracy: 0.5949 - loss: 1.5242 - val_accuracy: 0.1903 - val_loss: 4.6130
Epoch 2/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7823 - loss: 0.5942 - val_accuracy: 0.1577 - val_loss: 7.8487
Epoch 3/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8043 - loss: 0.5195 - val_accuracy: 0.1826 - val_loss: 7.8798
Epoch 4/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8320 - loss: 0.4532 - val_accuracy: 0.2603 - val_loss: 4.9466
Epoch 5/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8566 - loss: 0.3874 - val_accuracy: 0.4450 - val_loss: 2.7523
Epoch 6/30
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8746 - loss: 0.3414 - val_accuracy: 0.5989 - val_loss: 2.0245
Epoch 7/30
[1m70/70[0m [32m━━━━━━━━

In [41]:
# ================================================
# 8) 평가
# ================================================
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)
print(f"[Subject-wise Test] loss={test_loss:.4f}  acc={test_acc:.4f}")

y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)
y_true = np.argmax(y_test_cat, axis=1)

print("\nClassification Report (Subject-wise)")
print(classification_report(y_true, y_pred, target_names=list(le.classes_)))

cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix (rows=true, cols=pred):\n", cm)

[Subject-wise Test] loss=0.9328  acc=0.9015

Classification Report (Subject-wise)
              precision    recall  f1-score   support

  Downstairs       0.65      0.84      0.73       444
     Jogging       1.00      0.88      0.93      1669
     Sitting       0.84      0.99      0.91       354
    Standing       0.91      0.74      0.81       252
    Upstairs       0.85      0.78      0.81       498
     Walking       0.93      0.98      0.95      1770

    accuracy                           0.90      4987
   macro avg       0.86      0.87      0.86      4987
weighted avg       0.91      0.90      0.90      4987


Confusion Matrix (rows=true, cols=pred):
 [[ 371    2    1    1   25   44]
 [ 118 1463    0    0   38   50]
 [   0    0  352    1    1    0]
 [   0    0   66  186    0    0]
 [  52    0    1   17  388   40]
 [  30    1    0    0    3 1736]]
