In [43]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import cv2
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# MoveNet 모델 로드
model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
movenet = model.signatures['serving_default']

Exception ignored in: <function tqdm.__del__ at 0x2de3c19e0>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/capstone/lib/python3.12/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/opt/anaconda3/envs/capstone/lib/python3.12/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [81]:
def run_inference(movenet, image):
    """이미지에서 포즈 키포인트를 추출하는 함수"""
    input_image = tf.image.resize_with_pad(tf.expand_dims(image, axis=0), 192, 192)
    input_image = tf.cast(input_image, dtype=tf.int32)
    results = movenet(input_image)
    keypoints_with_scores = results['output_0'].numpy()
    return keypoints_with_scores

def extract_keypoints_from_video(video_path):
    """동영상에서 프레임별로 포즈 키포인트를 추출하는 함수"""
    cap = cv2.VideoCapture(video_path)
    keypoints_list = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        keypoints = run_inference(movenet, frame_rgb)
        keypoints_list.append(keypoints.flatten())
    cap.release()
    return np.mean(keypoints_list, axis=0)

def calculate_distance(keypoint1, keypoint2):
    """두 키포인트 간의 거리를 계산합니다."""
    return np.linalg.norm(keypoint1 - keypoint2)

def calculate_angle(keypoint1, keypoint2, keypoint3):
    """세 키포인트를 사용하여 각도를 계산합니다."""
    vector1 = keypoint1 - keypoint2
    vector2 = keypoint3 - keypoint2
    unit_vector1 = vector1 / np.linalg.norm(vector1)
    unit_vector2 = vector2 / np.linalg.norm(vector2)
    dot_product = np.dot(unit_vector1, unit_vector2)
    angle = np.arccos(dot_product)
    return np.degrees(angle)


In [125]:
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler

# 동영상이 저장된 디렉토리 경로
video_root = '/Users/diana/Downloads/BabyPose-main/data'
video_dirs = ['arching_back', 'head_banging', 'kicking_legs', 'rubbing_eye', 'stretching', 'sucking_fingers']

# 특성 및 레이블 데이터 준비
features = []
labels = []

# 동영상 파일 처리
for label, class_dir in enumerate(video_dirs):
    class_path = os.path.join(video_root, class_dir)
    video_files = [f for f in os.listdir(class_path) if f.endswith('.mp4')]
    for video_file in tqdm.tqdm(video_files, desc=f'Processing {class_dir}'):
        video_path = os.path.join(class_path, video_file)
        # 여기에서 `extract_keypoints_from_video` 함수를 사용하여 동영상에서 키포인트를 추출
        keypoints = extract_keypoints_from_video(video_path)  # 이 함수의 구현은 제공되지 않았으니, 가정하에 추가하세요.
        
        # # 추가 특성 계산
        # if len(keypoints) >= 3:  # 적어도 3개의 키포인트가 필요
        #     distance = calculate_distance(keypoints[0], keypoints[1])
        #     angle = calculate_angle(keypoints[0], keypoints[1], keypoints[2])
        #     extended_features = np.append(keypoints.flatten(), [distance, angle])
        # else:
        #     extended_features = keypoints.flatten()
        
        features.append(keypoints)
        labels.append(label)

Processing arching_back: 100%|██████████| 19/19 [00:21<00:00,  1.14s/it]
Processing head_banging: 100%|██████████| 22/22 [00:49<00:00,  2.24s/it]
Processing kicking_legs: 100%|██████████| 23/23 [00:54<00:00,  2.37s/it]
Processing rubbing_eye: 100%|██████████| 26/26 [00:59<00:00,  2.31s/it]
Processing stretching: 100%|██████████| 23/23 [00:48<00:00,  2.11s/it]
Processing sucking_fingers: 100%|██████████| 32/32 [01:33<00:00,  2.93s/it]


LSTM 전용 키포인트 추출

In [112]:
def extract_keypoints_sequences_from_video(video_path, num_frames=None):
    """동영상에서 프레임별로 포즈 키포인트를 추출하여 시퀀스 형태로 반환하는 함수"""
    cap = cv2.VideoCapture(video_path)
    keypoints_sequences = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or (num_frames is not None and frame_count >= num_frames):
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        keypoints = run_inference(movenet, frame_rgb)
        # 여기서는 flatten 대신 원래 형태를 유지합니다.
        keypoints_sequences.append(keypoints[0][0][:, :2])  # 스코어 제외, x와 y 위치만 사용
        frame_count += 1
    cap.release()
    # 시퀀스의 길이를 맞추기 위해 필요한 경우 여기에서 패딩을 추가할 수 있습니다.
    # num_frames 매개변수를 사용하여 모든 비디오에서 동일한 프레임 수를 추출하도록 할 수 있습니다.
    return np.array(keypoints_sequences)

In [113]:
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler

# 동영상이 저장된 디렉토리 경로
video_root = '/Users/diana/Downloads/BabyPose-main/data'
video_dirs = ['arching_back', 'head_banging', 'kicking_legs', 'rubbing_eye', 'stretching', 'sucking_fingers']

# 데이터 준비
features = []  # 이제 각 항목은 비디오당 한 시퀀스의 키포인트를 포함하는 3차원 배열입니다.
labels = []

# 동영상 파일 처리
for label, class_dir in enumerate(video_dirs):
    class_path = os.path.join(video_root, class_dir)
    video_files = [f for f in os.listdir(class_path) if f.endswith('.mp4')]
    for video_file in tqdm.tqdm(video_files, desc=f'Processing {class_dir}'):
        video_path = os.path.join(class_path, video_file)
        keypoints_sequences = extract_keypoints_sequences_from_video(video_path)  # 수정된 함수 사용
        features.append(keypoints_sequences)
        labels.append(label)

Processing arching_back: 100%|██████████| 19/19 [00:21<00:00,  1.14s/it]
Processing head_banging: 100%|██████████| 22/22 [00:51<00:00,  2.32s/it]
Processing kicking_legs: 100%|██████████| 23/23 [00:54<00:00,  2.35s/it]
Processing rubbing_eye: 100%|██████████| 26/26 [00:59<00:00,  2.31s/it]
Processing stretching: 100%|██████████| 23/23 [00:48<00:00,  2.13s/it]
Processing sucking_fingers: 100%|██████████| 32/32 [01:32<00:00,  2.88s/it]


In [114]:
print(features)

[array([[[0.44272885, 0.33554015],
        [0.41049346, 0.32631755],
        [0.45102617, 0.30742887],
        ...,
        [0.6320247 , 0.5685481 ],
        [0.6248791 , 0.7681148 ],
        [0.6585482 , 0.75793326]],

       [[0.43831962, 0.33644816],
        [0.40898395, 0.3267644 ],
        [0.45022783, 0.30898643],
        ...,
        [0.6283904 , 0.5703335 ],
        [0.6207313 , 0.77002156],
        [0.6532402 , 0.7583997 ]],

       [[0.4439035 , 0.3352128 ],
        [0.41243824, 0.32592237],
        [0.45460635, 0.30802187],
        ...,
        [0.6378695 , 0.5715963 ],
        [0.62588763, 0.77807236],
        [0.6563987 , 0.75291985]],

       ...,

       [[0.39331004, 0.34591186],
        [0.36040407, 0.35241956],
        [0.3943363 , 0.3050124 ],
        ...,
        [0.74767435, 0.7889773 ],
        [0.675476  , 0.9546432 ],
        [0.74071926, 0.80021566]],

       [[0.3953006 , 0.3456365 ],
        [0.36239573, 0.3507833 ],
        [0.39698303, 0.30468398],
        

In [124]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리 과정
# features와 labels가 정의되어 있음을 가정

# 각 샘플의 (x, y) 좌표만 사용하여 특성을 재구성하고 필요한 경우 패딩 적용
features_padded = pad_sequences(features, padding='post', dtype='float32', value=0.0)

# 패딩된 결과 확인 및 데이터 차원 변경
nsamples, nx, ny, _ = features_padded.shape
features_reshaped = features_padded.reshape(nsamples, nx, -1)  # ny*2 차원으로 변경 (34 features per timestep)

# 데이터 스케일링
features_flattened = features_reshaped.reshape(nsamples, nx*ny*2)  # Flatten for scaling
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_flattened)
features_scaled = features_scaled.reshape(nsamples, nx, ny*2)  # Reshape back to 3D

# 레이블 배열로 변환
labels = np.array(labels)

# KFold 크로스 밸리데이션 설정
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []

# KFold 크로스 밸리데이션 실행
for train_index, test_index in kfold.split(features_scaled):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # LSTM 모델 구성
    model = Sequential([
    # 첫 번째 LSTM 레이어; return_sequences=False이므로 마지막 타임스텝의 출력만 반환
    LSTM(32, input_shape=(nx, ny*2)),  # nx와 ny*2는 실제 데이터 차원에 맞게 설정
    # Dense 레이어
    Dense(32, activation='relu'),
    # 두 번째 Dense 레이어
    Dense(32, activation='relu'),
    # 출력 레이어, num_classes는 분류하고자 하는 클래스의 수
    Dense(6, activation='softmax')
    ])

    # 모델 컴파일 및 학습
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=40, batch_size=8, verbose=0)

    # 모델 평가
    _, accuracy = model.evaluate(X_test, y_test, verbose=0)
    scores.append(accuracy)

# 평균 정확도 출력
average_accuracy = np.mean(scores)
print(f'K-Fold Cross-Validation Average Accuracy: {average_accuracy}')


  super().__init__(**kwargs)


K-Fold Cross-Validation Average Accuracy: 0.220689657330513


레이어 쌓아서 해보기~

In [171]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features = scaler.fit_transform(features)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# 모델 구성
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(len(np.unique(labels)), activation='softmax')
])

optimizer = Adam(learning_rate=0.0001)

model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 모델 학습
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=2)

# 모델 평가
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# 성능 지표 계산
accuracy = accuracy_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes, average='weighted')

print(f'Test set accuracy: {accuracy}')
print(f'Test set F1 Score: {f1}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


3/3 - 1s - 455ms/step - accuracy: 0.1630 - loss: 2.5803 - val_accuracy: 0.1250 - val_loss: 1.8103
Epoch 2/100
3/3 - 0s - 9ms/step - accuracy: 0.1848 - loss: 2.5190 - val_accuracy: 0.2083 - val_loss: 1.8006
Epoch 3/100
3/3 - 0s - 12ms/step - accuracy: 0.2500 - loss: 2.3515 - val_accuracy: 0.2083 - val_loss: 1.7919
Epoch 4/100
3/3 - 0s - 7ms/step - accuracy: 0.2935 - loss: 2.2449 - val_accuracy: 0.1250 - val_loss: 1.7851
Epoch 5/100
3/3 - 0s - 8ms/step - accuracy: 0.2391 - loss: 2.4470 - val_accuracy: 0.0833 - val_loss: 1.7783
Epoch 6/100
3/3 - 0s - 8ms/step - accuracy: 0.3478 - loss: 1.9435 - val_accuracy: 0.1667 - val_loss: 1.7718
Epoch 7/100
3/3 - 0s - 11ms/step - accuracy: 0.2717 - loss: 2.1272 - val_accuracy: 0.1667 - val_loss: 1.7668
Epoch 8/100
3/3 - 0s - 8ms/step - accuracy: 0.3478 - loss: 2.1397 - val_accuracy: 0.1667 - val_loss: 1.7592
Epoch 9/100
3/3 - 0s - 8ms/step - accuracy: 0.3587 - loss: 1.9350 - val_accuracy: 0.1667 - val_loss: 1.7536
Epoch 10/100
3/3 - 0s - 8ms/step - a

In [173]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=42)

# 모델 구성
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(len(np.unique(labels)), activation='softmax')
])

# SGD 최적화 알고리즘 사용
optimizer = SGD(learning_rate=0.01, momentum=0.9)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early Stopping과 Learning Rate Scheduler 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

# 모델 학습
history = model.fit(
    X_train, y_train, 
    epochs=100, 
    batch_size=64, 
    validation_split=0.2, 
    callbacks=[early_stopping, reduce_lr], 
    verbose=2
)

# 모델 평가
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test set accuracy: {accuracy}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


2/2 - 1s - 278ms/step - accuracy: 0.1522 - loss: 2.8259 - val_accuracy: 0.2083 - val_loss: 1.9390 - learning_rate: 0.0100
Epoch 2/100
2/2 - 0s - 10ms/step - accuracy: 0.2174 - loss: 2.2437 - val_accuracy: 0.2500 - val_loss: 1.8358 - learning_rate: 0.0100
Epoch 3/100
2/2 - 0s - 12ms/step - accuracy: 0.3370 - loss: 2.0301 - val_accuracy: 0.2500 - val_loss: 1.7253 - learning_rate: 0.0100
Epoch 4/100
2/2 - 0s - 11ms/step - accuracy: 0.4348 - loss: 1.6201 - val_accuracy: 0.3750 - val_loss: 1.6299 - learning_rate: 0.0100
Epoch 5/100
2/2 - 0s - 10ms/step - accuracy: 0.5652 - loss: 1.4256 - val_accuracy: 0.5417 - val_loss: 1.5452 - learning_rate: 0.0100
Epoch 6/100
2/2 - 0s - 11ms/step - accuracy: 0.7500 - loss: 1.0131 - val_accuracy: 0.5000 - val_loss: 1.4813 - learning_rate: 0.0100
Epoch 7/100
2/2 - 0s - 10ms/step - accuracy: 0.6630 - loss: 1.0921 - val_accuracy: 0.5417 - val_loss: 1.4285 - learning_rate: 0.0100
Epoch 8/100
2/2 - 0s - 10ms/step - accuracy: 0.7500 - loss: 1.0623 - val_accurac

SVM

In [194]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'C': [0.1, 1, 10, 100],  # C: 규제 매개변수
    'gamma': [1, 0.1, 0.01, 0.001],  # gamma: 커널의 계수
    'kernel': ['rbf', 'poly', 'sigmoid']  # kernel: 사용할 커널
}

# SVM 모델 초기화
svm = SVC(random_state=42)

# 그리드 서치와 스트래티파이드 k-폴드 교차 검증 설정
cv = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=cv, verbose=2, n_jobs=-1)

# 그리드 서치 실행
grid_search.fit(features_scaled, labels)

# 최적의 파라미터와 그 때의 점수 출력
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# 최적의 모델로 평가
best_model = grid_search.best_estimator_


Fitting 9 folds for each of 48 candidates, totalling 432 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1

CatBoost

랜덤서치

In [206]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
import numpy as np

# 데이터 준비 및 전처리
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 하이퍼파라미터 분포 설정 (다중 클래스 분류 설정 포함)
param_distributions = {
    'iterations': randint(100, 500),
    'learning_rate': uniform(0.01, 0.1),
    'depth': randint(3, 6),
    'l2_leaf_reg': randint(1, 5),
}

# CatBoost 모델 초기화 (다중 클래스 분류 설정)
catboost_model = CatBoostClassifier(
    loss_function='MultiClass',  # 다중 클래스 분류 문제 설정
    verbose=False,  # 랜덤 서치 중 메시지 출력 억제
    random_state=42
)

# StratifiedKFold 설정
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# 랜덤 서치 설정
random_search = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_distributions,
    n_iter=10,  # 평가할 하이퍼파라미터 조합 수
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# 랜덤 서치 실행
random_search.fit(features_scaled, labels)

# 최적의 파라미터와 그 때의 점수 출력
print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation score: {random_search.best_score_}')

# 최적의 모델로 평가
best_model = random_search.best_estimator_


Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] END depth=3, iterations=120, l2_leaf_reg=3, learning_rate=0.05458327528535912; total time=   0.6s
[CV] END depth=3, iterations=120, l2_leaf_reg=3, learning_rate=0.05458327528535912; total time=   0.6s
[CV] END depth=3, iterations=120, l2_leaf_reg=3, learning_rate=0.05458327528535912; total time=   0.6s
[CV] END depth=3, iterations=120, l2_leaf_reg=3, learning_rate=0.05458327528535912; total time=   0.7s
[CV] END depth=5, iterations=448, l2_leaf_reg=3, learning_rate=0.0831993941811405; total time=   6.6s
[CV] END depth=5, iterations=448, l2_leaf_reg=3, learning_rate=0.0831993941811405; total time=   6.7s
[CV] END depth=5, iterations=448, l2_leaf_reg=3, learning_rate=0.0831993941811405; total time=   6.7s
[CV] END depth=5, iterations=448, l2_leaf_reg=3, learning_rate=0.0831993941811405; total time=   6.9s
[CV] END depth=5, iterations=430, l2_leaf_reg=3, learning_rate=0.09661761457749352; total time=   6.6s
[CV] END depth=5

그리드서치

In [198]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 하이퍼파라미터 그리드 설정 (다중 클래스 분류 설정 포함)
param_grid = {
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 4, 6],
    'l2_leaf_reg': [1, 3, 5],
    # 'loss_function': ['MultiClass']  # CatBoost의 파라미터로 직접 설정하는 대신 CatBoostClassifier에서 설정
}

# CatBoost 모델 초기화 (다중 클래스 분류 설정)
catboost_model = CatBoostClassifier(
    loss_function='MultiClass',  # 다중 클래스 분류 문제 설정
    verbose=False,  # 그리드 서치 중 메시지 출력 억제
    random_state=42
)

# StratifiedKFold 설정
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# 그리드 서치 설정
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

# 그리드 서치 실행
grid_search.fit(features_scaled, labels)

# 최적의 파라미터와 그 때의 점수 출력
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# 최적의 모델로 평가
best_model = grid_search.best_estimator_


Fitting 10 folds for each of 81 candidates, totalling 810 fits
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.4s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.6s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.01; total time=   0.5s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning_rate=0.05; total time=   0.4s
[CV] END depth=3, iterations=100, l2_leaf_reg=1, learning

H2O

In [210]:
import h2o
from h2o.automl import H2OAutoML
import numpy as np
import pandas as pd

h2o.init()

# 데이터 준비 및 전처리
df = pd.DataFrame(features)
df['label'] = labels  # labels 열을 DataFrame에 추가합니다.

# H2OFrame으로 변환
hf = h2o.H2OFrame(df)

# 목표 변수 및 독립 변수 설정
y = 'label'
x = hf.columns
x.remove(y)

# 훈련 데이터셋과 검증 데이터셋 분할
train, valid = hf.split_frame(ratios=[.8], seed=1)

# AutoML 실행
aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=300, exclude_algos = ["StackedEnsemble"], nfolds=0)
aml.train(x=x, y=y, training_frame=train, validation_frame=valid)

# 리더보드 출력
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

# 최고 성능 모델 저장
model_path = h2o.save_model(model=aml.leader, path="/Users/diana/Desktop/capstone2_SEDA", force=True)
print("Model saved to: " + model_path)

# 테스트 데이터셋에 대한 예측 및 정확도 계산 (검증 데이터셋 사용)
preds = aml.leader.predict(valid)
accuracy = h2o.accuracy(y=valid[y], preds=preds["predict"])
print(f'Accuracy: {accuracy}')

# H2O 종료 (선택적)
# h2o.shutdown(prompt=False)


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,10 mins 14 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,2 months and 19 days
H2O_cluster_name:,H2O_from_python_diana_45zf1a
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.954 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
16:27:18.933: AutoML: XGBoost is not available; skipping it.
16:27:19.42: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 122.0.

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                  rmse       mse       mae     rmsle    mean_residual_deviance
DeepLearning_grid_2_AutoML_2_20240311_162718_model_1  0.970062  0.94102   0.638553  0.335088                  0.94102
GBM_3_AutoML_2_20240311_162718                        0.978639  0.957734  0.713454  0.372414                  0.957734
GBM_grid_1_AutoML_2_20240311_162718_model_4           0.986851  0.973875  0.799203  0.360765                  0.973875
GBM_grid_1_AutoML_2_20240311_162718_model_1           0.991751  0.98357   0.671134  0.382348                  0.98357
GB

AttributeError: module 'h2o' has no attribute 'accuracy'

랜덤포레스트 활용

In [154]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# k-폴드 크로스 밸리데이션 설정
kfold = KFold(n_splits=12, shuffle=True, random_state=42)

# 점수 기록
scores = []

# k-폴드 크로스 밸리데이션 실행
for train_index, test_index in kfold.split(features_scaled, labels):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # 랜덤 포레스트 모델 구성 및 학습
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # 모델 평가
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test accuracy: {accuracy}')
    scores.append(accuracy)

# 평균 정확도 계산
average_accuracy = np.mean(scores)
print(f'\nK-Fold Cross-Validation Average Accuracy: {average_accuracy}')

Test accuracy: 0.7692307692307693
Test accuracy: 0.75
Test accuracy: 0.75
Test accuracy: 0.75
Test accuracy: 0.75
Test accuracy: 0.9166666666666666
Test accuracy: 0.9166666666666666
Test accuracy: 0.6666666666666666
Test accuracy: 0.75
Test accuracy: 0.8333333333333334
Test accuracy: 0.75
Test accuracy: 0.6666666666666666

K-Fold Cross-Validation Average Accuracy: 0.7724358974358974


In [157]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
# features와 labels가 이미 정의되어 있다고 가정

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 랜덤 포레스트 모델 초기화
rf = RandomForestClassifier(random_state=42)

# 그리드 서치와 k-폴드 교차 검증 설정
cv = KFold(n_splits=12, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, verbose=2, n_jobs=-1)

# 그리드 서치 실행
grid_search.fit(features_scaled, labels)

# 최적의 파라미터와 그 때의 점수 출력
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# 최적의 모델로 평가
best_model = grid_search.best_estimator_


Fitting 12 folds for each of 108 candidates, totalling 1296 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_dept

In [168]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
# features와 labels가 이미 정의되어 있다고 가정

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# 하이퍼파라미터 그리드 설정
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 수를 늘려 정확도 개선
    'max_depth': [None, 20, 40, 60],  # 깊이 추가
    'min_samples_split': [2, 5],  # 작은 조정으로 테스트
    'min_samples_leaf': [1, 2, 4],  # 기존 범위 유지
    'max_features': ['sqrt', 'log2', None]  # 특성의 수 제한 옵션 추가
}


# 랜덤 포레스트 모델 초기화
rf = RandomForestClassifier(random_state=42)

# 그리드 서치와 스트래티파이드 k-폴드 교차 검증 설정
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, verbose=2, n_jobs=-1)

# 그리드 서치 실행
grid_search.fit(features_scaled, labels)

# 최적의 파라미터와 그 때의 점수 출력
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')

# 최적의 모델로 평가
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=

Xgboost

In [155]:
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# k-폴드 크로스 밸리데이션 설정
kfold = KFold(n_splits=12, shuffle=True, random_state=42)

# 점수 기록
scores = []

# k-폴드 크로스 밸리데이션 실행
for train_index, test_index in kfold.split(features_scaled, labels):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # XGBoost 모델 구성 및 학습
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)

    # 모델 평가
    y_pred = xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test accuracy: {accuracy}')
    scores.append(accuracy)

# 평균 정확도 계산
average_accuracy = np.mean(scores)
print(f'\nK-Fold Cross-Validation Average Accuracy: {average_accuracy}')


Test accuracy: 0.7692307692307693
Test accuracy: 0.5833333333333334
Test accuracy: 0.6666666666666666
Test accuracy: 0.5833333333333334
Test accuracy: 0.75
Test accuracy: 0.75
Test accuracy: 0.5833333333333334
Test accuracy: 0.5833333333333334
Test accuracy: 0.75
Test accuracy: 0.6666666666666666
Test accuracy: 0.75
Test accuracy: 0.5

K-Fold Cross-Validation Average Accuracy: 0.6613247863247864


LightGBM

In [156]:
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# 데이터 준비 및 전처리
features = np.array(features)  # features는 이전 단계에서 추출한 특성 데이터
labels = np.array(labels)  # labels는 해당 특성 데이터의 레이블

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# k-폴드 크로스 밸리데이션 설정
kfold = KFold(n_splits=12, shuffle=True, random_state=42)

# 점수 기록
scores = []

# k-폴드 크로스 밸리데이션 실행
for train_index, test_index in kfold.split(features_scaled, labels):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # LightGBM 모델 구성 및 학습
    lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
    lgb_model.fit(X_train, y_train)

    # 모델 평가
    y_pred = lgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test accuracy: {accuracy}')
    scores.append(accuracy)

# 평균 정확도 계산
average_accuracy = np.mean(scores)
print(f'\nK-Fold Cross-Validation Average Accuracy: {average_accuracy}')


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2330
[LightGBM] [Info] Number of data points in the train set: 132, number of used features: 51
[LightGBM] [Info] Start training from score -2.049589
[LightGBM] [Info] Start training from score -1.887070
[LightGBM] [Info] Start training from score -1.838279
[LightGBM] [Info] Start training from score -1.704748
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.550597
Test accuracy: 0.6923076923076923
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2346
[LightGBM] [Info] Number of data points in the train set: 133, number of used features: 51
[LightGBM] [Info] Start training from score -1.999977
[LightGBM] [In

In [99]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
from sklearn.metrics import classification_report
import numpy as np

model = Sequential([
    LSTM(256, input_shape=(sequence_length, num_keypoints), return_sequences=True, kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    LSTM(128, kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001, verbose=1)
]

# Assuming X_train, y_train, X_val, y_val are prepared
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, callbacks=callbacks)

# 예측 수행
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)  # 원-핫 인코딩된 레이블을 가정

# 성능 지표 계산
precision = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

precision.update_state(y_test, predictions)
recall.update_state(y_test, predictions)
accuracy.update_state(y_test, predictions)

print(f'Precision: {precision.result().numpy()}')
print(f'Recall: {recall.result().numpy()}')
print(f'Accuracy: {accuracy.result().numpy()}')

# F1 점수 및 기타 지표 계산 (scikit-learn 활용)
print(classification_report(true_classes, predicted_classes))

NameError: name 'sequence_length' is not defined