#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifier


### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - data01_test.csv 를 불러와 'new_data' 이름으로 저장합니다.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 전체 데이터 'data01_train.csv' 를 불러와 'data' 이름으로 저장
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

# data에서 변수 subject는 삭제
data.drop(columns = 'subject', inplace=True)
new_data.drop(columns = 'subject', inplace=True)

#### 2) 기본 정보 조회

In [None]:
display(data.head())
display(new_data.head())

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.284379,-0.021981,-0.116683,-0.99249,-0.97964,-0.963321,-0.992563,-0.977304,-0.958142,-0.93885,...,-0.509523,-0.850065,-0.018043,0.092304,0.07422,-0.714534,-0.671943,-0.018351,-0.185733,SITTING
1,0.27744,-0.028086,-0.118412,-0.99662,-0.927676,-0.972294,-0.997346,-0.931405,-0.971788,-0.939837,...,-0.210792,-0.613367,-0.022456,-0.155414,0.247498,-0.112257,-0.826816,0.184489,-0.068699,STANDING
2,0.305833,-0.041023,-0.087303,0.00688,0.1828,-0.237984,0.005642,0.028616,-0.236474,0.016311,...,0.579587,0.394388,-0.362616,0.171069,0.576349,-0.688314,-0.743234,0.272186,0.053101,WALKING
3,0.276053,-0.016487,-0.108381,-0.995379,-0.983978,-0.975854,-0.995877,-0.98528,-0.974907,-0.941425,...,-0.566291,-0.841455,0.289548,0.079801,-0.020033,0.291898,-0.639435,-0.111998,-0.123298,SITTING
4,0.271998,0.016904,-0.078856,-0.973468,-0.702462,-0.86945,-0.97981,-0.711601,-0.856807,-0.92076,...,0.447577,0.214219,0.010111,0.114179,-0.830776,-0.325098,-0.840817,0.116237,-0.096615,STANDING


In [None]:
print(data.shape)
print(new_data.shape)

(5881, 562)
(1471, 562)


In [None]:
print(data.info())
print(new_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 25.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1471 entries, 0 to 1470
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 6.3+ MB
None


In [None]:
print(data.describe())
print(new_data.describe())

       tBodyAcc-mean()-X  tBodyAcc-mean()-Y  tBodyAcc-mean()-Z  \
count        5881.000000        5881.000000        5881.000000   
mean            0.274811          -0.017799          -0.109396   
std             0.067614           0.039422           0.058373   
min            -0.503823          -0.684893          -1.000000   
25%             0.262919          -0.024877          -0.121051   
50%             0.277154          -0.017221          -0.108781   
75%             0.288526          -0.010920          -0.098163   
max             1.000000           1.000000           1.000000   

       tBodyAcc-std()-X  tBodyAcc-std()-Y  tBodyAcc-std()-Z  tBodyAcc-mad()-X  \
count       5881.000000       5881.000000       5881.000000       5881.000000   
mean          -0.603138         -0.509815         -0.604058         -0.628151   
std            0.448807          0.501815          0.417319          0.424345   
min           -1.000000         -0.999844         -0.999667         -1.000000   


## 2.데이터 전처리

## (1) 상위 N개의 feature 선정

* 세부 요구사항
    - (옵션) 알고리즘 중 1~2개에 대해서, 변수 중요도 상위 N개를 선정하여 모델링하고 타 모델과 성능을 비교.
        * 상위 N개를 선택하는 방법은, 변수를 하나씩 늘려가며 모델링 및 성능 검증을 수행하여 적절한 지점을 찾는 것이다.

In [None]:
import joblib
feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')
feature_importances.columns

Index(['sensor', 'agg', 'axis', 'feature_name', 'importance_mission1',
       'importance_mission2', 'importportance_is_standing',
       'importportance_is_sitting', 'importportance_is_laying',
       'importportance_is_walking', 'importportance_is_walking_up',
       'importportance_is_walking_down'],
      dtype='object')

In [None]:
importances_col = ['importance_mission1',
       'importance_mission2', 'importportance_is_standing',
       'importportance_is_sitting', 'importportance_is_laying',
       'importportance_is_walking', 'importportance_is_walking_up',
       'importportance_is_walking_down']

feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)
feature_importances

Unnamed: 0,sensor,agg,axis,feature_name,importance_mission1,importance_mission2,importportance_is_standing,importportance_is_sitting,importportance_is_laying,importportance_is_walking,importportance_is_walking_up,importportance_is_walking_down,total_importance
0,tBodyAcc,mean(),X,tBodyAcc-mean()-X,0.000213,0.000009,0.000260,0.000489,0.000133,0.000160,0.000221,0.000205,0.001688
1,tBodyAcc,mean(),Y,tBodyAcc-mean()-Y,0.000289,0.000017,0.000431,0.000464,0.000067,0.000304,0.000638,0.000276,0.002485
2,tBodyAcc,mean(),Z,tBodyAcc-mean()-Z,0.000183,0.000034,0.000305,0.000408,0.000119,0.000082,0.000111,0.000274,0.001517
3,tBodyAcc,std(),X,tBodyAcc-std()-X,0.004241,0.000009,0.003341,0.000284,0.000177,0.010715,0.004030,0.027953,0.050750
4,tBodyAcc,std(),Y,tBodyAcc-std()-Y,0.000351,0.000000,0.000071,0.000341,0.000045,0.000204,0.000597,0.000446,0.002056
...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,angle,tBodyGyroMean,gravityMean,"angle(tBodyGyroMean,gravityMean)",0.000549,0.000000,0.000516,0.000999,0.000081,0.000090,0.000302,0.000599,0.003135
557,angle,tBodyGyroJerkMean,gravityMean,"angle(tBodyGyroJerkMean,gravityMean)",0.000753,0.000000,0.001918,0.001284,0.000000,0.000110,0.000247,0.000195,0.004507
558,angle,X,gravityMean,"angle(X,gravityMean)",0.024659,0.000111,0.020128,0.028979,0.059553,0.001284,0.007272,0.002114,0.144100
559,angle,Y,gravityMean,"angle(Y,gravityMean)",0.026835,0.000000,0.057582,0.035853,0.043076,0.004177,0.021457,0.002586,0.191566


In [None]:
# 상위 N개만 뽑았음
topN = feature_importances.sort_values('total_importance', ascending=False).head(10)['feature_name']
topN_ls = topN.to_list()
topN_ls.append('Activity')

In [None]:
# 상위 50개 뽑았을때 센서별로 뭐가 많은지 확인
feature_importances.sort_values('total_importance', ascending=False).head(50).value_counts('sensor')

sensor
tGravityAcc         15
fBodyAcc             7
fBodyAccJerk         6
fBodyAccMag          5
tBodyAccJerk         4
angle                3
tBodyAcc             2
tBodyAccMag          2
tGravityAccMag       2
fBodyGyro            1
tBodyAccJerkMag      1
tBodyGyroJerk        1
tBodyGyroJerkMag     1
dtype: int64

In [None]:
# 전체 데이터 'data01_train.csv' 를 불러와 'data' 이름으로 저장
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')

# y = data['Activity']
# x = data.drop(columns = ['Activity'])

data_topN = data[topN_ls]
data_topN['Activity'].unique()

array(['STANDING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS', 'SITTING'], dtype=object)

In [None]:
# 스케일링 (선택사항)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_topN[topN] = scaler.fit_transform(data_topN[topN])
data_topN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_topN[topN] = scaler.fit_transform(data_topN[topN])


Unnamed: 0,tGravityAcc-max()-Y,tGravityAcc-mean()-X,tGravityAcc-min()-Y,"angle(Y,gravityMean)",tGravityAcc-max()-X,tGravityAcc-min()-X,tGravityAcc-mean()-Y,tGravityAcc-energy()-X,"angle(X,gravityMean)",tGravityAcc-energy()-Y,Activity
0,0.084381,0.939996,0.148733,0.900647,0.902470,0.945608,0.114889,0.847111,0.201725,0.112032,STANDING
1,0.935031,0.418889,0.960101,0.156150,0.397433,0.448509,0.950923,0.053734,0.680309,0.857254,LAYING
2,0.203944,0.986800,0.269332,0.813468,0.945616,0.992753,0.235228,0.964773,0.084172,0.023478,STANDING
3,0.126143,0.966872,0.186498,0.870783,0.928829,0.971781,0.154713,0.913741,0.153833,0.075452,WALKING
4,0.176305,0.953344,0.194289,0.855762,0.929016,0.950630,0.178920,0.879993,0.149176,0.057392,WALKING_DOWNSTAIRS
...,...,...,...,...,...,...,...,...,...,...,...
5876,0.294301,0.990544,0.344852,0.753647,0.949340,0.996172,0.319908,0.974518,0.044320,0.000692,SITTING
5877,0.115250,0.958404,0.176487,0.881634,0.921564,0.960225,0.141807,0.892480,0.156635,0.086519,WALKING_UPSTAIRS
5878,0.775416,0.223075,0.809293,0.380712,0.207527,0.262386,0.794759,0.000728,0.819464,0.476680,LAYING
5879,0.122627,0.963885,0.175178,0.874641,0.926022,0.965958,0.151406,0.906216,0.142611,0.078266,WALKING_UPSTAIRS


* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [None]:
non_dynamic = ['SITTING', 'STANDING', 'LAYING']
data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)
data_topN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)


Unnamed: 0,tGravityAcc-max()-Y,tGravityAcc-mean()-X,tGravityAcc-min()-Y,"angle(Y,gravityMean)",tGravityAcc-max()-X,tGravityAcc-min()-X,tGravityAcc-mean()-Y,tGravityAcc-energy()-X,"angle(X,gravityMean)",tGravityAcc-energy()-Y,Activity,Activity_dynamic
0,0.084381,0.939996,0.148733,0.900647,0.902470,0.945608,0.114889,0.847111,0.201725,0.112032,STANDING,0
1,0.935031,0.418889,0.960101,0.156150,0.397433,0.448509,0.950923,0.053734,0.680309,0.857254,LAYING,0
2,0.203944,0.986800,0.269332,0.813468,0.945616,0.992753,0.235228,0.964773,0.084172,0.023478,STANDING,0
3,0.126143,0.966872,0.186498,0.870783,0.928829,0.971781,0.154713,0.913741,0.153833,0.075452,WALKING,1
4,0.176305,0.953344,0.194289,0.855762,0.929016,0.950630,0.178920,0.879993,0.149176,0.057392,WALKING_DOWNSTAIRS,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5876,0.294301,0.990544,0.344852,0.753647,0.949340,0.996172,0.319908,0.974518,0.044320,0.000692,SITTING,0
5877,0.115250,0.958404,0.176487,0.881634,0.921564,0.960225,0.141807,0.892480,0.156635,0.086519,WALKING_UPSTAIRS,1
5878,0.775416,0.223075,0.809293,0.380712,0.207527,0.262386,0.794759,0.000728,0.819464,0.476680,LAYING,0
5879,0.122627,0.963885,0.175178,0.874641,0.926022,0.965958,0.151406,0.906216,0.142611,0.078266,WALKING_UPSTAIRS,1


In [None]:
# x, y로 나누기
y1 = data_topN['Activity']
y2 = data_topN['Activity_dynamic']
x = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

In [None]:
# train과 validation 분할 수행
from sklearn.model_selection import train_test_split
X_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(x, y2, stratify = y2, random_state=42, test_size=0.25)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) 알고리즘1 :

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest.fit(X_train_dynamic, y_train_dynamic)

predicted_random_forest = clf_random_forest.predict(X_val_dynamic)
macro_f1_score_random_forest = f1_score(y_val_dynamic, predicted_random_forest, average='macro')
print("RandomForestClassifier Accuracy Score:", np.mean(predicted_random_forest == y_val_dynamic))
print("RandomForestClassifier Macro F1 Score:", macro_f1_score_random_forest)

RandomForestClassifier Accuracy Score: 0.9564921821889871
RandomForestClassifier Macro F1 Score: 0.9561654449013648


#### 2) 알고리즘2 :

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

clf_SGD = SGDClassifier(random_state=42)
clf_SGD.fit(X_train_dynamic, y_train_dynamic)

predicted = clf_SGD.predict(X_val_dynamic)
macro_f1_score = f1_score(y_val_dynamic, predicted, average='macro')
print('Accuracy Score :', np.mean(predicted == y_val_dynamic))
print("Macro F1 Score:", macro_f1_score)

Accuracy Score : 0.7940176750509857
Macro F1 Score: 0.7938942311027053


In [None]:
from sklearn.linear_model import LogisticRegression

clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_logistic.fit(X_train_dynamic, y_train_dynamic)

predicted_logistic = clf_logistic.predict(X_val_dynamic)
macro_f1_score_logistic = f1_score(y_val_dynamic, predicted_logistic, average='macro')
print("Accuracy Score:", np.mean(predicted_logistic == y_val_dynamic))
print("Macro F1 Score:", macro_f1_score_logistic)

Accuracy Score: 0.7743031951053705
Macro F1 Score: 0.7738340020524677


In [None]:
from sklearn.svm import SVC

clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train_dynamic, y_train_dynamic)

predicted_svm = clf_svm.predict(X_val_dynamic)
macro_f1_score_svm = f1_score(y_val_dynamic, predicted_svm, average='macro')
print("SVM Accuracy Score:", np.mean(predicted_svm == y_val_dynamic))
print("SVM Macro F1 Score:", macro_f1_score_svm)


SVM Accuracy Score: 0.7933378653976887
SVM Macro F1 Score: 0.7923589829312234


In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train_dynamic, y_train_dynamic)

predicted_knn = clf_knn.predict(X_val_dynamic)
macro_f1_score_knn = f1_score(y_val_dynamic, predicted_knn, average='macro')
accuracy_knn = np.mean(predicted_knn == y_val_dynamic)
print("KNeighborsClassifier Accuracy Score:", accuracy_knn)
print("KNeighborsClassifier Macro F1 Score:", macro_f1_score_knn)


KNeighborsClassifier Accuracy Score: 0.9354180829367778
KNeighborsClassifier Macro F1 Score: 0.934793208918835


### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [None]:
data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 0]
data_Activity_dynamic

Unnamed: 0,tGravityAcc-max()-Y,tGravityAcc-mean()-X,tGravityAcc-min()-Y,"angle(Y,gravityMean)",tGravityAcc-max()-X,tGravityAcc-min()-X,tGravityAcc-mean()-Y,tGravityAcc-energy()-X,"angle(X,gravityMean)",tGravityAcc-energy()-Y,Activity,Activity_dynamic
0,0.084381,0.939996,0.148733,0.900647,0.902470,0.945608,0.114889,0.847111,0.201725,0.112032,STANDING,0
1,0.935031,0.418889,0.960101,0.156150,0.397433,0.448509,0.950923,0.053734,0.680309,0.857254,LAYING,0
2,0.203944,0.986800,0.269332,0.813468,0.945616,0.992753,0.235228,0.964773,0.084172,0.023478,STANDING,0
7,0.367394,0.978168,0.420523,0.702054,0.937574,0.983431,0.392556,0.942498,0.092447,0.007057,STANDING,0
8,0.185446,0.979644,0.208235,0.840799,0.940421,0.978761,0.197008,0.946301,0.110212,0.045169,STANDING,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5874,0.683313,0.105641,0.311401,0.653503,0.155450,0.138916,0.452940,0.033803,0.906757,0.075213,LAYING,0
5875,0.185506,0.985418,0.249911,0.827125,0.944364,0.991141,0.215998,0.961191,0.091910,0.033208,STANDING,0
5876,0.294301,0.990544,0.344852,0.753647,0.949340,0.996172,0.319908,0.974518,0.044320,0.000692,SITTING,0
5878,0.775416,0.223075,0.809293,0.380712,0.207527,0.262386,0.794759,0.000728,0.819464,0.476680,LAYING,0


In [None]:
# x, y로 나누기
y2 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])
# train과 validation 분할 수행
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x, y2, stratify = y2, random_state=42, test_size=0.25)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest.fit(X_train, y_train)

predicted_random_forest = clf_random_forest.predict(X_val)
macro_f1_score_random_forest = f1_score(y_val, predicted_random_forest, average='macro')
print("RandomForestClassifier Accuracy Score:", np.mean(predicted_random_forest == y_val))
print("RandomForestClassifier Macro F1 Score:", macro_f1_score_random_forest)

RandomForestClassifier Accuracy Score: 0.9629171817058096
RandomForestClassifier Macro F1 Score: 0.9622297287666939


In [None]:
from sklearn.linear_model import LogisticRegression

clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_logistic.fit(X_train, y_train)

predicted_logistic = clf_logistic.predict(X_val)
macro_f1_score_logistic = f1_score(y_val, predicted_logistic, average='macro')
print("Accuracy Score:", np.mean(predicted_logistic == y_val))
print("Macro F1 Score:", macro_f1_score_logistic)

Accuracy Score: 0.9258343634116193
Macro F1 Score: 0.9242857219601407


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

clf_SGD = SGDClassifier(random_state=42)
clf_SGD.fit(X_train, y_train)

predicted = clf_SGD.predict(X_val)
macro_f1_score = f1_score(y_val, predicted, average='macro')
print('Accuracy Score :', np.mean(predicted == y_val))
print("Macro F1 Score:", macro_f1_score)

Accuracy Score : 0.9394313967861557
Macro F1 Score: 0.9383645604529267


In [None]:
from sklearn.svm import SVC

clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)

predicted_svm = clf_svm.predict(X_val)
macro_f1_score_svm = f1_score(y_val, predicted_svm, average='macro')
print("SVM Accuracy Score:", np.mean(predicted_svm == y_val))
print("SVM Macro F1 Score:", macro_f1_score_svm)


SVM Accuracy Score: 0.927070457354759
SVM Macro F1 Score: 0.9256206701751256


In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

predicted_knn = clf_knn.predict(X_val)
macro_f1_score_knn = f1_score(y_val, predicted_knn, average='macro')
accuracy_knn = np.mean(predicted_knn == y_val)
print("KNeighborsClassifier Accuracy Score:", accuracy_knn)
print("KNeighborsClassifier Macro F1 Score:", macro_f1_score_knn)

KNeighborsClassifier Accuracy Score: 0.9542645241038319
KNeighborsClassifier Macro F1 Score: 0.9534111872544792


### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [None]:
data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 1]
data_Activity_dynamic

Unnamed: 0,tGravityAcc-max()-Y,tGravityAcc-mean()-X,tGravityAcc-min()-Y,"angle(Y,gravityMean)",tGravityAcc-max()-X,tGravityAcc-min()-X,tGravityAcc-mean()-Y,tGravityAcc-energy()-X,"angle(X,gravityMean)",tGravityAcc-energy()-Y,Activity,Activity_dynamic
3,0.126143,0.966872,0.186498,0.870783,0.928829,0.971781,0.154713,0.913741,0.153833,0.075452,WALKING,1
4,0.176305,0.953344,0.194289,0.855762,0.929016,0.950630,0.178920,0.879993,0.149176,0.057392,WALKING_DOWNSTAIRS,1
5,0.170721,0.914689,0.215525,0.848275,0.879781,0.918971,0.188806,0.786676,0.226198,0.049955,WALKING_UPSTAIRS,1
6,0.239653,0.991293,0.281862,0.793784,0.956631,0.989347,0.263052,0.976514,0.057348,0.012446,WALKING,1
12,0.148264,0.971955,0.207184,0.856965,0.936092,0.976212,0.175226,0.926632,0.124975,0.059418,WALKING,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5868,0.262115,0.955143,0.308172,0.780684,0.924716,0.956091,0.282013,0.884388,0.155935,0.006903,WALKING,1
5869,0.165126,0.964411,0.221149,0.847824,0.929142,0.965642,0.190142,0.907550,0.113527,0.048956,WALKING_UPSTAIRS,1
5873,0.080324,0.937553,0.146224,0.903825,0.900159,0.940771,0.111462,0.841179,0.201480,0.115503,WALKING,1
5877,0.115250,0.958404,0.176487,0.881634,0.921564,0.960225,0.141807,0.892480,0.156635,0.086519,WALKING_UPSTAIRS,1


In [None]:
# x, y로 나누기
y2 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

# train과 validation 분할 수행
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x, y2, stratify = y2, random_state=42, test_size=0.25)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest.fit(X_train, y_train)

predicted_random_forest = clf_random_forest.predict(X_val)
macro_f1_score_random_forest = f1_score(y_val, predicted_random_forest, average='macro')
print("RandomForestClassifier Accuracy Score:", np.mean(predicted_random_forest == y_val))
print("RandomForestClassifier Macro F1 Score:", macro_f1_score_random_forest)

RandomForestClassifier Accuracy Score: 0.8459214501510574
RandomForestClassifier Macro F1 Score: 0.8390055524587074


In [None]:
from sklearn.linear_model import LogisticRegression

clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_logistic.fit(X_train, y_train)

predicted_logistic = clf_logistic.predict(X_val)
macro_f1_score_logistic = f1_score(y_val, predicted_logistic, average='macro')
print("Accuracy Score:", np.mean(predicted_logistic == y_val))
print("Macro F1 Score:", macro_f1_score_logistic)

Accuracy Score: 0.6283987915407855
Macro F1 Score: 0.6287370556444222


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

clf_SGD = SGDClassifier(random_state=42)
clf_SGD.fit(X_train, y_train)

predicted = clf_SGD.predict(X_val)
macro_f1_score = f1_score(y_val, predicted, average='macro')
print('Accuracy Score :', np.mean(predicted == y_val))
print("Macro F1 Score:", macro_f1_score)

Accuracy Score : 0.6435045317220544
Macro F1 Score: 0.6368076129403447


In [None]:
from sklearn.svm import SVC

clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train, y_train)

predicted_svm = clf_svm.predict(X_val)
macro_f1_score_svm = f1_score(y_val, predicted_svm, average='macro')
print("SVM Accuracy Score:", np.mean(predicted_svm == y_val))
print("SVM Macro F1 Score:", macro_f1_score_svm)

SVM Accuracy Score: 0.6268882175226587
SVM Macro F1 Score: 0.600389835971945


In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier()
clf_knn.fit(X_train, y_train)

predicted_knn = clf_knn.predict(X_val)
macro_f1_score_knn = f1_score(y_val, predicted_knn, average='macro')
accuracy_knn = np.mean(predicted_knn == y_val)
print("KNeighborsClassifier Accuracy Score:", accuracy_knn)
print("KNeighborsClassifier Macro F1 Score:", macro_f1_score_knn)

KNeighborsClassifier Accuracy Score: 0.8398791540785498
KNeighborsClassifier Macro F1 Score: 0.8333385001966217


### (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들기

In [None]:
# 학습 및 평가 데이터 불러옴
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

# data에서 변수 subject는 삭제
data.drop(columns = 'subject', inplace=True)
new_data.drop(columns = 'subject', inplace=True)

non_dynamic = ['SITTING', 'STANDING', 'LAYING']
data['Activity_dynamic'] = data['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

# 1번 모델을 위해 x, y로 나누기
y0 = data['Activity']
y2 = data['Activity_dynamic']
x0 = data.drop(columns=['Activity_dynamic', 'Activity'])

# 1번 모델을 위해 train과 validation 분할 수행
X_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(x0, y2, stratify=y2, random_state=42, test_size=0.25)

### 첫번째 모델 학습
clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train_dynamic, y_train_dynamic)

# 1번 모델을 통해 예측 결과 생성
predictions_svm = clf_svm.predict(X_val_dynamic)

# 2-1을 위한 x, y 나누기
data_Activity_dynamic = data[data['Activity_dynamic'] == 0]
y1 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns=['Activity_dynamic', 'Activity'])

X_train_Activity_0, X_val_Activity_0, y_train_Activity_0, y_val_Activity_0 = train_test_split(x, y1, stratify=y1, random_state=42, test_size=0.25)

### 두번째 모델 학습
clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_logistic.fit(X_train_Activity_0, y_train_Activity_0)

# 2-2을 위한 x, y 나누기
data_Activity_dynamic = data[data['Activity_dynamic'] == 1]
y1 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns=['Activity_dynamic', 'Activity'])

X_train_Activity_1, X_val_Activity_1, y_train_Activity_1, y_val_Activity_1 = train_test_split(x, y1, stratify=y1, random_state=42, test_size=0.25)

### 세번째 모델 학습
clf_svm_Act_1 = SVC(kernel='linear', random_state=42)
clf_svm_Act_1.fit(X_train_Activity_1, y_train_Activity_1)

from sklearn.ensemble import VotingClassifier

# 1번 모델의 결과를 활용한 투표 모델 생성
voting_clf = VotingClassifier(estimators=[
    ('svm', clf_svm),
    ('logistic', clf_logistic),
    ('svm_Act_1', clf_svm_Act_1)
], voting='hard')

# 최종 예측
final_predictions = voting_clf.predict(X_val_dynamic)

# 최종 모델 평가
final_accuracy = accuracy_score(y_val_dynamic, final_predictions)
final_macro_f1 = f1_score(y_val_dynamic, final_predictions, average='macro')
print("Final Model Accuracy:", final_accuracy)
print("Final Model Macro F1 Score:", final_macro_f1)


NotFittedError: ignored

In [None]:
# 학습 및 평가 데이터 불러옴
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

# data에서 변수 subject는 삭제
data.drop(columns = 'subject', inplace=True)
new_data.drop(columns = 'subject', inplace=True)

non_dynamic = ['SITTING', 'STANDING', 'LAYING']
data['Activity_dynamic'] = data['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

# 1번 모델을 위해 x, y로 나누기
y0 = data['Activity']
y2 = data['Activity_dynamic']
x0 = data.drop(columns = ['Activity_dynamic', 'Activity'])

# 1번 모델을 위해 train과 validation 분할 수행
X_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(x0, y2, stratify = y2, random_state=42, test_size=0.25)

### 첫번째 모델 학습
clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train_dynamic, y_train_dynamic)

######################################################

# 2-1을 위한 x, y나누기
data_Activity_dynamic = data[data['Activity_dynamic'] == 0]
y1 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

X_train_Activity_0, X_val_Activity_0, y_train_Activity_0, y_val_Activity_0 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

### 두번째 모델 학습
clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_logistic.fit(X_train_Activity_0, y_train_Activity_0)

#######################################################

# 2-2을 위한 x, y나누기
data_Activity_dynamic = data[data['Activity_dynamic'] == 1]
y1 = data_Activity_dynamic['Activity']
x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

X_train_Activity_1, X_val_Activity_1, y_train_Activity_1, y_val_Activity_1 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

##########################################################

### 세번째 모델 학습
clf_svm_Act_1 = SVC(kernel='linear', random_state=42)
clf_svm_Act_1.fit(X_train_Activity_1, y_train_Activity_1)


# 모델 평가
# predictions_svm = clf_svm.predict(X_val_dynamic)
# predictions_logistic = clf_logistic.predict(X_val_Activity_0)
# predictions_svm_Act_1 = clf_svm_Act_1.predict(X_val_dynamic)

from sklearn.ensemble import VotingClassifier

# 투표 모델 생성
voting_clf = VotingClassifier(estimators=[('svm', clf_svm), ('logistic', clf_logistic), ('svm_Act_1', clf_svm_Act_1)], voting='hard')

X_train, X_val, y_train, y_val = train_test_split(x0, y0, stratify = y0, random_state=42, test_size=0.25)

# 투표 모델 학습
voting_clf.fit(X_val, y_val)

# 최종 예측
final_predictions = voting_clf.predict(X_val)

from sklearn.metrics import accuracy_score, f1_score

final_accuracy = accuracy_score(y_val, final_predictions)
final_macro_f1 = f1_score(y_val, final_predictions, average='macro')
print("Final Model Accuracy:", final_accuracy)
print("Final Model Macro F1 Score:", final_macro_f1)


## (1) SVC + LR + SVC

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')
    feature_importances.columns

    importances_col = ['importance_mission1',
        'importance_mission2', 'importportance_is_standing',
        'importportance_is_sitting', 'importportance_is_laying',
        'importportance_is_walking', 'importportance_is_walking_up',
        'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)
    feature_importances

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]

    # 스케일링 (선택사항)
    # from sklearn.preprocessing import MinMaxScaler
    # scaler = MinMaxScaler()
    # data_topN[topN] = scaler.fit_transform(data_topN[topN])
    # data_topN.loc[:, topN] = scaler.fit_transform(data_topN.loc[:, topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    # 1번 모델을 위해 train과 validation 분할 수행
    X_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(x0, y2, stratify = y2, random_state=42, test_size=0.25)

    ### 첫번째 모델 학습
    clf_svm = SVC(kernel='linear', random_state=42)
    clf_svm.fit(X_train_dynamic, y_train_dynamic)

    ######################################################

    # 2-1을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 0]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    X_train_Activity_0, X_val_Activity_0, y_train_Activity_0, y_val_Activity_0 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

    ### 두번째 모델 학습
    clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
    clf_logistic.fit(X_train_Activity_0, y_train_Activity_0)

    #######################################################

    # 2-2을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 1]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    X_train_Activity_1, X_val_Activity_1, y_train_Activity_1, y_val_Activity_1 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

    ##########################################################

    ### 세번째 모델 학습
    clf_svm_Act_1 = SVC(kernel='linear', random_state=42)
    clf_svm_Act_1.fit(X_train_Activity_1, y_train_Activity_1)

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_clf = VotingClassifier(estimators=[('svm', clf_svm), ('logistic', clf_logistic), ('svm_Act_1', clf_svm_Act_1)], voting='hard')

    X_train, X_val, y_train, y_val = train_test_split(x0, y2, stratify = y2, random_state=42, test_size=0.25)

    # 투표 모델 학습
    voting_clf.fit(X_val, y_val)

    # 최종 예측
    final_predictions = voting_clf.predict(X_val)

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(1, 50):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")

## (2) KNN + RF + RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')
    feature_importances.columns

    importances_col = ['importance_mission1',
        'importance_mission2', 'importportance_is_standing',
        'importportance_is_sitting', 'importportance_is_laying',
        'importportance_is_walking', 'importportance_is_walking_up',
        'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)
    feature_importances

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    # 1번 모델을 위해 train과 validation 분할 수행
    X_train_dynamic, X_val_dynamic, y_train_dynamic, y_val_dynamic = train_test_split(x0, y2, stratify = y2, random_state=42, test_size=0.25)

    # 2-1을 위한 x, y나누기
    data_Activity_dynamic_0 = data_topN[data_topN['Activity_dynamic'] == 0]
    y1 = data_Activity_dynamic_0['Activity']
    x = data_Activity_dynamic_0.drop(columns = ['Activity_dynamic', 'Activity'])

    X_train_Activity_0, X_val_Activity_0, y_train_Activity_0, y_val_Activity_0 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

    # 2-2을 위한 x, y나누기
    data_Activity_dynamic_1 = data_topN[data_topN['Activity_dynamic'] == 1]
    y1 = data_Activity_dynamic_1['Activity']
    x = data_Activity_dynamic_1.drop(columns = ['Activity_dynamic', 'Activity'])

    X_train_Activity_1, X_val_Activity_1, y_train_Activity_1, y_val_Activity_1 = train_test_split(x, y1, stratify = y1, random_state=42, test_size=0.25)

    # 첫 번째 모델 학습
    clf_knn = KNeighborsClassifier()
    clf_knn.fit(X_train_dynamic, y_train_dynamic)

    # 두 번째 모델 학습
    clf_random_forest1 = RandomForestClassifier(random_state=42)
    clf_random_forest1.fit(X_train_Activity_0, y_train_Activity_0)

    # 세 번째 모델 학습
    clf_random_forest2 = RandomForestClassifier(random_state=42)
    clf_random_forest2.fit(X_train_Activity_1, y_train_Activity_1)

    # 보팅 분류기 모델 생성
    voting_clf = VotingClassifier(estimators=[('clf_knn', clf_knn), ('clf_random_forest1', clf_random_forest1), ('clf_random_forest2', clf_random_forest2)], voting='hard')

    # 전체 데이터로 모델을 평가
    X_train, X_val, y_train, y_val = train_test_split(x0, y0, stratify = y0, random_state=42, test_size=0.25)

    # 보팅 분류기 모델 학습
    voting_clf.fit(X_train, y_train)

    # 최종 예측
    final_predictions = voting_clf.predict(X_val)

    # 모델 평가
    final_accuracy = accuracy_score(y_val, final_predictions)
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(30, 51):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")



# 4.최종 분류 모델 (전체 데이터 넣어서 모델 학습)

## (1) SVC + LR + SVC

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # voting 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    models = [
        ('clf_svm',  SVC(kernel='linear', probability=True)),
        ('clf_logistic', LogisticRegression(class_weight='balanced', max_iter=1000)),
        ('clf_svm_Act_1',SVC(kernel='linear', probability=True))
    ]

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_soft = VotingClassifier(models, voting='soft')
    voting_hard = VotingClassifier(models, voting='hard')

    # 투표 모델 학습
    voting_soft.fit(x0, y0)
    voting_hard.fit(x0, y0)

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 최종 예측
    final_predictions = voting_soft.predict(X_val)
    final_predictions2 = voting_hard.predict(X_val)

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')

    final_accuracy2 = accuracy_score(y_val, final_predictions2)
    final_macro_f12 = f1_score(y_val, final_predictions2, average='macro')

    return final_accuracy, final_macro_f1, final_accuracy2, final_macro_f12

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(10, 15):
    final_accuracy, final_macro_f1, final_accuracy2, final_macro_f12 = evaluate_models(n)
    print(f"n={n}: Final voting_soft Accuracy={final_accuracy:.4f}, Final voting_soft Macro F1 Score={final_macro_f1:.4f}")
    print(f"n={n}: Final voting_soft Accuracy={final_accuracy2:.4f}, Final voting_soft Macro F1 Score={final_macro_f12:.4f}")

n=10: Final voting_soft Accuracy=0.5948, Final voting_soft Macro F1 Score=0.5570
n=10: Final voting_soft Accuracy=0.5690, Final voting_soft Macro F1 Score=0.4865
n=11: Final voting_soft Accuracy=0.8232, Final voting_soft Macro F1 Score=0.8160
n=11: Final voting_soft Accuracy=0.8260, Final voting_soft Macro F1 Score=0.8182
n=12: Final voting_soft Accuracy=0.8287, Final voting_soft Macro F1 Score=0.8212
n=12: Final voting_soft Accuracy=0.8280, Final voting_soft Macro F1 Score=0.8195
n=13: Final voting_soft Accuracy=0.8328, Final voting_soft Macro F1 Score=0.8256
n=13: Final voting_soft Accuracy=0.8294, Final voting_soft Macro F1 Score=0.8213
n=14: Final voting_soft Accuracy=0.8280, Final voting_soft Macro F1 Score=0.8188
n=14: Final voting_soft Accuracy=0.8266, Final voting_soft Macro F1 Score=0.8167


## (2) KNN + RF + RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    # 첫 번째 모델 학습
    clf_knn = KNeighborsClassifier()
    clf_knn.fit(x0, y2)

    ######################################################

    # 2-1을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 0]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    # 두 번째 모델 학습
    clf_random_forest1 = RandomForestClassifier(random_state=42)
    clf_random_forest1.fit(x, y1)

    #######################################################

    # 2-2을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 1]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    ##########################################################

    ### 세번째 모델 학습
    clf_random_forest2 = RandomForestClassifier(random_state=42)
    clf_random_forest2.fit(x, y1)

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_clf = VotingClassifier(estimators=[('clf_knn', clf_knn), ('clf_random_forest1', clf_random_forest1), ('clf_random_forest2', clf_random_forest2)], voting='hard')

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 투표 모델 학습
    voting_clf.fit(x0, y0)  # 학습 데이터를 사용하여 모델 학습

    # 최종 예측
    final_predictions = voting_clf.predict(X_val)  # 학습 데이터를 사용하여 평가

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)  # 학습 데이터를 사용한 평가
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')  # 학습 데이터를 사용한 평가

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(1, 50):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")

## (3) RF + RF + RF

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    # 첫 번째 모델 학습
    clf_random_forest = RandomForestClassifier(random_state=42)
    clf_random_forest.fit(x0, y2)

    ######################################################

    # 2-1을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 0]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    # 두 번째 모델 학습
    clf_random_forest1 = RandomForestClassifier(random_state=42)
    clf_random_forest1.fit(x, y1)

    #######################################################

    # 2-2을 위한 x, y나누기
    data_Activity_dynamic = data_topN[data_topN['Activity_dynamic'] == 1]
    y1 = data_Activity_dynamic['Activity']
    x = data_Activity_dynamic.drop(columns = ['Activity_dynamic', 'Activity'])

    ##########################################################

    ### 세번째 모델 학습
    clf_random_forest2 = RandomForestClassifier(random_state=42)
    clf_random_forest2.fit(x, y1)

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_clf = VotingClassifier(estimators=[('clf_random_forest', clf_random_forest), ('clf_random_forest1', clf_random_forest1), ('clf_random_forest2', clf_random_forest2)], voting='hard')

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 투표 모델 학습
    voting_clf.fit(x0, y0)  # 학습 데이터를 사용하여 모델 학습

    # 최종 예측
    final_predictions = voting_clf.predict(X_val)  # 학습 데이터를 사용하여 평가

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)  # 학습 데이터를 사용한 평가
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')  # 학습 데이터를 사용한 평가

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(1, 50):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")

## (4) 모델 10개 voting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # voting 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    models = [
        ('ada', AdaBoostClassifier()),
        ('bc', BaggingClassifier()),
        ('etc',ExtraTreesClassifier()),
        ('gbc', GradientBoostingClassifier()),
        ('rfc', RandomForestClassifier()),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True)),
        ('xgb', XGBClassifier()),
        # ('lgbm', LGBMClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('lr', LogisticRegressionCV()),
        # ('ridge', RidgeClassifier()),
    ]

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_soft = VotingClassifier(models, voting='soft')
    voting_hard = VotingClassifier(models, voting='hard')

    # 투표 모델 학습
    voting_soft.fit(x0, y0)
    voting_hard.fit(x0, y0)

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 최종 예측
    final_predictions = voting_soft.predict(X_val)
    final_predictions2 = voting_hard.predict(X_val)

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')

    final_accuracy2 = accuracy_score(y_val, final_predictions2)
    final_macro_f12 = f1_score(y_val, final_predictions2, average='macro')

    return final_accuracy, final_macro_f1, final_accuracy2, final_macro_f12

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(40, 51):
    final_accuracy, final_macro_f1, final_accuracy2, final_macro_f12 = evaluate_models(n)
    print(f"n={n}: Final voting_soft Accuracy={final_accuracy:.4f}, Final voting_soft Macro F1 Score={final_macro_f1:.4f}")
    print(f"n={n}: Final voting_soft Accuracy={final_accuracy2:.4f}, Final voting_soft Macro F1 Score={final_macro_f12:.4f}")

n=40: Final voting_soft Accuracy=0.9470, Final voting_soft Macro F1 Score=0.9459
n=40: Final voting_soft Accuracy=0.9409, Final voting_soft Macro F1 Score=0.9386
n=41: Final voting_soft Accuracy=0.9470, Final voting_soft Macro F1 Score=0.9455
n=41: Final voting_soft Accuracy=0.9415, Final voting_soft Macro F1 Score=0.9389
n=42: Final voting_soft Accuracy=0.9490, Final voting_soft Macro F1 Score=0.9477
n=42: Final voting_soft Accuracy=0.9436, Final voting_soft Macro F1 Score=0.9413
n=43: Final voting_soft Accuracy=0.9490, Final voting_soft Macro F1 Score=0.9480
n=43: Final voting_soft Accuracy=0.9429, Final voting_soft Macro F1 Score=0.9405
n=44: Final voting_soft Accuracy=0.9511, Final voting_soft Macro F1 Score=0.9500
n=44: Final voting_soft Accuracy=0.9415, Final voting_soft Macro F1 Score=0.9392
n=45: Final voting_soft Accuracy=0.9490, Final voting_soft Macro F1 Score=0.9482
n=45: Final voting_soft Accuracy=0.9388, Final voting_soft Macro F1 Score=0.9364
n=46: Final voting_soft Accu

## (5) 모델 10개 stacking

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    models = [
        ('ada', AdaBoostClassifier()),
        ('bc', BaggingClassifier()),
        ('etc',ExtraTreesClassifier()),
        ('gbc', GradientBoostingClassifier()),
        ('rfc', RandomForestClassifier()),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True)),
        ('xgb', XGBClassifier()),
        # ('lgbm', LGBMClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('lr', LogisticRegressionCV()),
        # ('ridge', RidgeClassifier()),
    ]

    # 투표 모델 생성 ('clf_logistic', clf_logistic),
    from sklearn.ensemble import StackingClassifier
    stacking_clf = StackingClassifier(estimators=models)

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 투표 모델 학습
    stacking_clf.fit(x0, y0)  # 학습 데이터를 사용하여 모델 학습

    # 최종 예측
    final_predictions = stacking_clf.predict(X_val)  # 학습 데이터를 사용하여 평가

    # 최종 모델 평가
    final_accuracy = accuracy_score(y_val, final_predictions)
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(40, 51):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")

n=40: Final Model Accuracy=0.9531, Final Model Macro F1 Score=0.9515
n=41: Final Model Accuracy=0.9538, Final Model Macro F1 Score=0.9528
n=42: Final Model Accuracy=0.9517, Final Model Macro F1 Score=0.9505
n=43: Final Model Accuracy=0.9545, Final Model Macro F1 Score=0.9534
n=44: Final Model Accuracy=0.9572, Final Model Macro F1 Score=0.9558
n=45: Final Model Accuracy=0.9579, Final Model Macro F1 Score=0.9566
n=46: Final Model Accuracy=0.9606, Final Model Macro F1 Score=0.9599
n=47: Final Model Accuracy=0.9599, Final Model Macro F1 Score=0.9592
n=48: Final Model Accuracy=0.9619, Final Model Macro F1 Score=0.9615
n=49: Final Model Accuracy=0.9626, Final Model Macro F1 Score=0.9616
n=50: Final Model Accuracy=0.9626, Final Model Macro F1 Score=0.9620


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings("ignore")

def evaluate_models(n):
    # 변수 중요도 선정
    feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

    importances_col = ['importance_mission1','importance_mission2',
                       'importportance_is_standing', 'importportance_is_sitting',
                       'importportance_is_laying', 'importportance_is_walking',
                       'importportance_is_walking_up', 'importportance_is_walking_down']

    feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

    # 상위 N개만 뽑았음
    topN = feature_importances.sort_values('total_importance', ascending=False).head(n)['feature_name']
    topN_ls = topN.to_list()
    topN_ls.append('Activity')

    # 학습 및 평가 데이터 불러옴
    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
    new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

    # data에서 변수 subject는 삭제
    data.drop(columns = 'subject', inplace=True)
    new_data.drop(columns = 'subject', inplace=True)

    data_topN = data[topN_ls]
    new_data_topN = new_data[topN_ls]

    # 스케일링 (선택사항)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data_topN[topN] = scaler.fit_transform(data_topN[topN])
    new_data_topN[topN] = scaler.fit_transform(new_data_topN[topN])

    non_dynamic = ['SITTING', 'STANDING', 'LAYING']
    data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

    # 1번 모델을 위해 x, y로 나누기
    y0 = data_topN['Activity']
    y2 = data_topN['Activity_dynamic']
    x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

    # 첫 번째 모델 학습
    clf_logistic = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

    # 두 번째 모델 학습
    clf_random_forest1 = RandomForestClassifier(random_state=42)

    ### 세번째 모델 학습
    clf_random_forest2 = RandomForestClassifier(random_state=42)

    from sklearn.ensemble import VotingClassifier

    # 투표 모델 생성
    voting_clf = VotingClassifier(estimators=[('clf_logistic', clf_logistic), ('clf_random_forest1', clf_random_forest1), ('clf_random_forest2', clf_random_forest2)], voting='hard')

    y_val = new_data_topN['Activity']
    X_val = new_data_topN.drop(columns = ['Activity'])

    # 투표 모델 학습
    voting_clf.fit(x0, y0)  # 학습 데이터를 사용하여 모델 학습

    # 최종 예측
    final_predictions = voting_clf.predict(X_val)  # 학습 데이터를 사용하여 평가

    from sklearn.metrics import accuracy_score, f1_score

    final_accuracy = accuracy_score(y_val, final_predictions)  # 학습 데이터를 사용한 평가
    final_macro_f1 = f1_score(y_val, final_predictions, average='macro')  # 학습 데이터를 사용한 평가

    return final_accuracy, final_macro_f1

# n 값을 1부터 50까지 변화시키면서 결과 출력
for n in range(9, 31):
    final_accuracy, final_macro_f1 = evaluate_models(n)
    print(f"n={n}: Final Model Accuracy={final_accuracy:.4f}, Final Model Macro F1 Score={final_macro_f1:.4f}")

In [None]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import keras

schedule = CosineDecayRestarts(initial_learning_rate=1e-3,
                               first_decay_steps=50,
                               t_mul=2.0,
                               m_mul=0.6,
                               alpha=1e-6
                               )

# Define the optimizer with the initial learning rate
optimizer = Adam(learning_rate=schedule)

es = EarlyStopping(monitor='val_loss',
                   min_delta=0,
                   patience=10,
                   verbose=1,
                   restore_best_weights=True)

# 세션 클리어
clear_session()
# 레이어 연결

feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

importances_col = ['importance_mission1','importance_mission2',
                'importportance_is_standing', 'importportance_is_sitting',
                'importportance_is_laying', 'importportance_is_walking',
                'importportance_is_walking_up', 'importportance_is_walking_down']

feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

# 상위 N개만 뽑았음
topN = feature_importances.sort_values('total_importance', ascending=False).head(50)['feature_name']
topN_ls = topN.to_list()
topN_ls.append('Activity')

# 학습 및 평가 데이터 불러옴
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

# data에서 변수 subject는 삭제
data.drop(columns = 'subject', inplace=True)
new_data.drop(columns = 'subject', inplace=True)

data_topN = data[topN_ls]
new_data_topN = new_data[topN_ls]

# 스케일링 (선택사항)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_topN[topN] = scaler.fit_transform(data_topN[topN])
new_data_topN[topN] = scaler.transform(new_data_topN[topN])

non_dynamic = ['SITTING', 'STANDING', 'LAYING']
data_topN['Activity_dynamic'] = data_topN['Activity'].map(lambda x: 0 if x in non_dynamic else 1)

# 1번 모델을 위해 x, y로 나누기
y0 = data_topN['Activity']
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y0 = label_encoder.fit_transform(y0)
x0 = data_topN.drop(columns = ['Activity_dynamic', 'Activity'])

il = Input(shape=(len(x0 .columns),))
hl = Dense(256, activation='relu')(il)
hl = Dense(128, activation='relu')(hl)
hl = Dense(256, activation='relu')(hl)
hl = Dropout(0.2)(hl)
hl = Dense(512, activation='relu')(hl)
hl = Dropout(0.2)(hl)
hl = Dense(1024, activation='relu')(hl)
hl = Dropout(0.2)(hl)
hl = Dense(512, activation='relu')(hl)
hl = Dropout(0.2)(hl)
hl = Dense(256, activation='relu')(hl)
hl = Dropout(0.2)(hl)
ol = Dense(6, activation='softmax')(hl)

# 모델 선언
model = Model(il, ol)
# 컴파일
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'],
              optimizer=optimizer)
# 요약
model.summary()

model.fit(x0 , y0, epochs=1000, verbose=1, validation_split=0.2, callbacks=[es])

new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')

new_data.drop(columns = 'subject', inplace=True)

new_data_topN = new_data[topN_ls]

y_val = new_data_topN['Activity']
y_val = label_encoder.transform(y_val)  # 정답 레이블을 숫자로 변환
X_val = new_data_topN.drop(columns = ['Activity'])


# 모델을 사용하여 테스트 데이터에 대한 예측 수행
y_pred = model.predict(X_val)
y_pred_labels = y_pred.argmax(axis=1)  # 확률 분포에서 가장 높은 확률을 갖는 클래스 선택

from sklearn.metrics import accuracy_score, classification_report

# 정확도 계산
accuracy = accuracy_score(y_val, y_pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# 클래스별 성능 평가
class_names = label_encoder.classes_
report = classification_report(y_val, y_pred_labels, target_names=class_names)
print("Classification Report:")
print(report)


In [None]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import keras
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

# Clear the session
clear_session()

# Load and preprocess data
feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

importances_col = ['importance_mission1','importance_mission2',
                'importportance_is_standing', 'importportance_is_sitting',
                'importportance_is_laying', 'importportance_is_walking',
                'importportance_is_walking_up', 'importportance_is_walking_down']

feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

# Select the top N features
topN = feature_importances.sort_values('total_importance', ascending=False).head(500)['feature_name']
topN_ls = topN.to_list()
topN_ls.append('Activity')

# Load training data
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
data.drop(columns='subject', inplace=True)
data_topN = data[topN_ls]

# Apply scaling
scaler = MinMaxScaler()
data_topN[topN] = scaler.fit_transform(data_topN[topN])

# Encode labels
label_encoder = LabelEncoder()
y = data_topN['Activity']
y = y.map({'LAYING':0, 'SITTING':1, 'STANDING':2, 'WALKING_DOWNSTAIRS':3, 'WALKING_UPSTAIRS':4, 'WALKING':5}.get)
x = data_topN.drop(columns=['Activity'])

# Define the learning rate schedule
schedule = CosineDecayRestarts(initial_learning_rate=1e-3,
                               first_decay_steps=50,
                               t_mul=2.0,
                               m_mul=0.6,
                               alpha=1e-6
                               )

# Define the optimizer with the initial learning rate
optimizer = Adam(learning_rate=schedule)

# Early stopping callback
es = EarlyStopping(monitor='val_loss',
                   min_delta=0,
                   patience=10,
                   verbose=1,
                   restore_best_weights=True)

# Model architecture
il = Input(shape=(len(x.columns),))
hl = Dense(1024, activation='relu')(il)
hl = Dropout(0.2)(hl)
hl = Dense(256, activation='relu')(hl)
hl = Dropout(0.2)(hl)
hl = Dense(128, activation='relu')(hl)
hl = Dropout(0.2)(hl)
ol = Dense(6, activation='softmax')(hl)

# Model declaration
model = Model(il, ol)

# Compile the model
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'],
              optimizer=optimizer)

# Model summary
model.summary()

# Train the model
model.fit(x, y, epochs=3, verbose=1, validation_split=0.25, callbacks=[es])

In [None]:
# Load test data
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')
new_data.drop(columns='subject', inplace=True)
new_data_topN = new_data[topN_ls]
new_data_topN
X_val = new_data_topN.iloc[:,:-1]
y_val = new_data_topN.iloc[:,-1:]
y_val = y_val.applymap({'LAYING':0, 'SITTING':1, 'STANDING':2, 'WALKING_DOWNSTAIRS':3, 'WALKING_UPSTAIRS':4, 'WALKING':5}.get).values

# Make predictions on the test data
y_pred = model.predict(X_val)
y_pred_labels = y_pred.argmax(axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# Generate classification report
report = classification_report(y_val, y_pred_labels)
print("Classification Report:")
print(report)

## 딥러닝 1D CNN

In [None]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

# Clear the session
clear_session()

# Load and preprocess data
feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

importances_col = ['importance_mission1','importance_mission2',
                'importportance_is_standing', 'importportance_is_sitting',
                'importportance_is_laying', 'importportance_is_walking',
                'importportance_is_walking_up', 'importportance_is_walking_down']

feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

# Select the top N features
topN = feature_importances.sort_values('total_importance', ascending=False)['feature_name']
topN_ls = topN.to_list()
topN_ls.append('Activity')

# Load training data
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
data.drop(columns='subject', inplace=True)
data_topN = data[topN_ls]

# Apply scaling
scaler = MinMaxScaler()
data_topN[topN] = scaler.fit_transform(data_topN[topN])

# Encode labels
label_encoder = LabelEncoder()
y = data_topN['Activity']
y = label_encoder.fit_transform(y)
x = data_topN.drop(columns=['Activity'])

# Model architecture (CNN)
il = Input(shape=(len(x.columns), 1))  # Input shape for 1D convolution
cl = Conv1D(64, 3, activation='relu')(il)
cl = MaxPooling1D(2)(cl)
cl = Flatten()(cl)
hl = Dense(128, activation='relu')(cl)
hl = Dropout(0.2)(hl)
ol = Dense(6, activation='softmax')(hl)

# Model declaration
model = Model(il, ol)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Model summary
model.summary()

# Train the model
model.fit(x.values[:, :, np.newaxis], y, epochs=10, batch_size=64, verbose=1, validation_split=0.25, callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

# Load test data
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')
new_data.drop(columns='subject', inplace=True)
new_data_topN = new_data[topN_ls]

X_val = new_data_topN.iloc[:,:-1]
y_val = new_data_topN.iloc[:,-1:]
y_val = label_encoder.transform(y_val)

# Make predictions on the test data
y_pred = model.predict(X_val.values[:, :, np.newaxis])
y_pred_labels = y_pred.argmax(axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# Generate classification report
report = classification_report(y_val, y_pred_labels, target_names=label_encoder.classes_)
print("Classification Report:")
print(report)


## 딥러닝 RNN

In [None]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

# Clear the session
clear_session()

# Load and preprocess data
feature_importances = joblib.load('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/feature1.pkl')

importances_col = ['importance_mission1','importance_mission2',
                'importportance_is_standing', 'importportance_is_sitting',
                'importportance_is_laying', 'importportance_is_walking',
                'importportance_is_walking_up', 'importportance_is_walking_down']

feature_importances['total_importance'] = feature_importances[importances_col].sum(axis=1)

# Select the top N features
topN = feature_importances.sort_values('total_importance', ascending=False)['feature_name']
topN_ls = topN.to_list()
topN_ls.append('Activity')

# Load training data
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_train.csv')
data.drop(columns='subject', inplace = True)
data_topN = data[topN_ls]

# Apply scaling
scaler = MinMaxScaler()
data_topN[topN] = scaler.fit_transform(data_topN[topN])

# Encode labels
label_encoder = LabelEncoder()
y = data_topN['Activity']
y = label_encoder.fit_transform(y)
x = data_topN.drop(columns='Activity')

# Model architecture (RNN)
il = Input(shape=(len(x.columns), 1) ) # Input shape for RNN
rnn = SimpleRNN(128, activation='relu', return_sequences=True)(il)
rnn = SimpleRNN(128, activation='relu')(rnn)
hl = Dense(128, activation='relu')(rnn)
hl = Dropout(0.2)(hl)
ol = Dense(6, activation='softmax')(hl)

# Model declaration
model = Model(il, ol)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', metrics='accuracy', optimizer='adam')

# Model summary
model.summary()

# Train the model
model.fit(x.values[:, :, np.newaxis], y, epochs =10, batch_size =64, verbose =1, validation_split =0.25, callbacks=[EarlyStopping(monitor='val_loss', patience =5)])

# Load test data
new_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/KT AIVLE/5차 미프/실습파일/데이터/data01_test.csv')
new_data.drop(columns='subject', inplace = True)
new_data_topN = new_data[topN_ls]

X_val = new_data_topN.iloc[:,:-1]
y_val = new_data_topN.iloc[:,-1:]
y_val = label_encoder.transform(y_val)

# Make predictions on the test data
y_pred = model.predict(X_val.values[:, :, np.newaxis])
y_pred_labels = (y_pred.argmax(axis=1))

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_labels)
print(f'Accuracy: {accuracy:.4f}')

# Generate classification report
report = classification_report(y_val, y_pred_labels, target_names=label_encoder.classes_)
print(classification_report)
print(report)
