In [1]:
!pip install numpy pandas scikit-learn tensorflow scipy



In [7]:
import os, re, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [8]:
# =========================================================
# 1) TXT 로드 (깨진 줄은 skip) → 기본 전처리
# =========================================================

cols = ["subject", "label", "timestamp", "x", "y", "z"]

df = pd.read_csv(
    "/content/drive/MyDrive/data/WISDM_ar_v1.1_raw.txt",
    header=None,
    names=cols,
    on_bad_lines="skip"   # 깨진 줄은 무시
).dropna()

# z 컬럼 끝 세미콜론 제거 + float 변환
df["z"] = df["z"].astype(str).str.replace(";", "", regex=False).astype(float)

# x, y도 혹시 모르니 float 변환
df["x"] = df["x"].astype(float)
df["y"] = df["y"].astype(float)


df

Unnamed: 0,subject,label,timestamp,x,y,z
0,33,Jogging,49105962326000,-0.694638,12.680544,0.503953
1,33,Jogging,49106062271000,5.012288,11.264028,0.953424
2,33,Jogging,49106112167000,4.903325,10.882658,-0.081722
3,33,Jogging,49106222305000,-0.612916,18.496431,3.023717
4,33,Jogging,49106332290000,-1.184970,12.108489,7.205164
...,...,...,...,...,...,...
1086461,19,Sitting,131623331483000,9.000000,-1.570000,1.690000
1086462,19,Sitting,131623371431000,9.040000,-1.460000,1.730000
1086463,19,Sitting,131623411592000,9.080000,-1.380000,1.690000
1086464,19,Sitting,131623491487000,9.000000,-1.460000,1.730000


In [9]:
# 타입 캐스팅 + 정렬
df["subject"] = pd.to_numeric(df["subject"], errors="coerce").astype("Int64")
df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")
for c in ["x","y","z"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna().sort_values(["subject","label","timestamp"]).reset_index(drop=True)

print("샘플:\n", df.head(), "\n")
print("고유 subject:", df["subject"].nunique())
print("라벨 분포:", Counter(df["label"]))

샘플:
    subject       label      timestamp     x      y     z
0        1  Downstairs  6552942304000 -0.15   9.15 -0.34
1        1  Downstairs  6552992292000  0.11   9.19  2.76
2        1  Downstairs  6553042310000 -4.06   7.40  4.02
3        1  Downstairs  6553092298000 -2.87   7.93  3.21
4        1  Downstairs  6553142347000 -0.19  10.04  4.82 

고유 subject: 36
라벨 분포: Counter({'Walking': 418393, 'Jogging': 336445, 'Upstairs': 122869, 'Downstairs': 100425, 'Sitting': 59939, 'Standing': 48394})


In [10]:
# =========================================================
# 2) 슬라이딩 윈도우로 프레임 생성 (subject별로 끊어서)
# =========================================================
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras import layers, models

Fs = 20
frame_size = Fs * 4   # 80 (4초)
hop_size   = Fs * 2   # 40 (50% 겹침)

def make_windows_subjectwise(df, frame_size, hop_size):
    """
    df: ['subject','label','timestamp','x','y','z'] 포함
    return:
      X: (N, T, 3) float32
      y: (N,)       원본 라벨 문자열
      groups: (N,)  subject ID (윈도우 단위)
    """
    X_list, y_list, g_list = [], [], []
    for subj, subdf in df.groupby("subject"):
        xv = subdf["x"].to_numpy()
        yv = subdf["y"].to_numpy()
        zv = subdf["z"].to_numpy()
        lv = subdf["label"].to_numpy()

        n = len(subdf)
        for i in range(0, n - frame_size, hop_size):
            x = xv[i:i+frame_size]
            y = yv[i:i+frame_size]
            z = zv[i:i+frame_size]
            seg_labels = lv[i:i+frame_size]

            # 프레임 대표 라벨 = 최빈값
            label = pd.Series(seg_labels).value_counts().idxmax()

            X_list.append(np.stack([x, y, z], axis=1))  # (T, 3)
            y_list.append(label)
            g_list.append(int(subj))

    X = np.asarray(X_list, dtype=np.float32)       # (N, T, 3)
    y = np.asarray(y_list)
    groups = np.asarray(g_list)
    return X, y, groups

X, y_raw, groups = make_windows_subjectwise(df, frame_size, hop_size)
print("X shape:", X.shape)  # (N, 80, 3)
print("라벨 분포(프레임 기준):", Counter(y_raw))

X shape: (27108, 80, 3)
라벨 분포(프레임 기준): Counter({np.str_('Walking'): 10420, np.str_('Jogging'): 8412, np.str_('Upstairs'): 3075, np.str_('Downstairs'): 2495, np.str_('Sitting'): 1492, np.str_('Standing'): 1214})


In [11]:
# =========================================================
# 3) 라벨 인코딩 및 그룹 기반 Train/Test 분할(누수 방지)
# =========================================================
le = LabelEncoder()
y = le.fit_transform(y_raw)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))

# 사람(그룹) 단위로 분할
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

Classes: [np.str_('Downstairs'), np.str_('Jogging'), np.str_('Sitting'), np.str_('Standing'), np.str_('Upstairs'), np.str_('Walking')]


In [12]:
# =========================================================
# 4) 스케일링(Train으로 fit → Train/Test transform)
#    채널(x,y,z)별 표준화: (N,T,3) -> (N*T,3)로 펴서 진행
# =========================================================
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_2d = X_train.reshape(-1, X_train.shape[-1])  # (N*T, 3)
X_test_2d  = X_test.reshape(-1,  X_test.shape[-1])

scaler.fit(X_train_2d)
X_train = scaler.transform(X_train_2d).reshape(X_train.shape)
X_test  = scaler.transform(X_test_2d ).reshape(X_test.shape)

In [13]:
# =========================================================
# 5) 1D-CNN 모델 정의
# =========================================================
def build_1d_cnn(input_shape, num_classes):
    inp = layers.Input(shape=input_shape)  # (T, 3)

    x = layers.Conv1D(64, 5, padding='same', activation='relu')(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Conv1D(128, 5, padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Dropout(0.2)(x)

    x = layers.Conv1D(128, 3, padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv1D(256, 3, padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)

    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)

    out = layers.Dense(num_classes, activation='softmax')(x)

    model = models.Model(inp, out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

model = build_1d_cnn(input_shape=(frame_size, 3), num_classes=num_classes)
model.summary()

In [19]:
# =========================================================
# 6) 학습
# =========================================================

history = model.fit(
    X_train, y_train,
    validation_split=0.2,  # train 안에서 검증 분리
    epochs=50,
    batch_size=256,
    verbose=1
)

Epoch 1/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9987 - loss: 0.0052 - val_accuracy: 0.7971 - val_loss: 1.9793
Epoch 2/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9990 - loss: 0.0038 - val_accuracy: 0.8074 - val_loss: 2.2954
Epoch 3/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9991 - loss: 0.0043 - val_accuracy: 0.8113 - val_loss: 2.1916
Epoch 4/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9990 - loss: 0.0052 - val_accuracy: 0.7878 - val_loss: 2.1316
Epoch 5/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9989 - loss: 0.0039 - val_accuracy: 0.7829 - val_loss: 2.1767
Epoch 6/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9991 - loss: 0.0036 - val_accuracy: 0.8074 - val_loss: 1.8434
Epoch 7/50
[1m64/64[0m [32m━━━━

In [20]:

# =========================================================
# 7) 평가
# =========================================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"[Test] loss={test_loss:.4f}  acc={test_acc:.4f}")

y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)

print("\nClassification Report")
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=true, cols=pred):\n", cm)

[Test] loss=1.3829  acc=0.8746

Classification Report
              precision    recall  f1-score   support

  Downstairs       0.72      0.79      0.76       706
     Jogging       0.97      0.92      0.94      2128
     Sitting       0.98      0.81      0.89       273
    Standing       0.73      1.00      0.84       238
    Upstairs       0.71      0.83      0.76      1030
     Walking       0.95      0.88      0.91      2278

    accuracy                           0.87      6653
   macro avg       0.84      0.87      0.85      6653
weighted avg       0.89      0.87      0.88      6653


Confusion Matrix (rows=true, cols=pred):
 [[ 560    5    0    2   99   40]
 [  75 1951    3    0   53   46]
 [   0    0  221   52    0    0]
 [   0    0    0  237    1    0]
 [  64   52    0   31  854   29]
 [  78    2    2    2  198 1996]]
