# 미니 프로젝트: 센서 정보 기반 사람 행동 분류 모델 개발
> 스마트폰 센서 정보에 따라 사람 행동을 분류하는 모델을 개발하여 가정 내 위급 상황 식별

<img src = "https://www.snubh.org/upload/ce3/namoimage/images/000073/202111_03_04.png"/>

# 단계 3. 모델링
- 행동 분류 모델의 정확도를 향상시키기위해 모델을 2단계로 구분하여 모델링
  - 정적 / 동적 행동 분류
  - 각 행동 분류

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.
from sklearn.model_selection import train_test_split


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [None]:
# 데이터셋 경로
path = 'drive/MyDrive/AIVLE/Mini_project/'

train_data = pd.read_csv(path + 'data01_train.csv')
test_data = pd.read_csv(path + 'data01_train.csv')

In [None]:
# 불필요한 Features 제거
train_data.drop('subject', axis = 1, inplace = True)
test_data.drop('subject', axis = 1, inplace = True)

# 2.데이터 전처리
- 행동 분류 모델을 모델링하기전, 동적 / 정적 분류 모델을 학습하기 위한 Feature 생성

In [None]:
train_data['Activity'].unique()

array(['STANDING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS', 'SITTING'], dtype=object)

In [None]:
# 동적 / 정적 데이터 Features 추가
train_data['Activity_dynamic'] = np.where(train_data['Activity'].isin(['STANDING', 'LAYING', 'SITTING']), 0, 1)
test_data['Activity_dynamic'] = np.where(test_data['Activity'].isin(['STANDING', 'LAYING', 'SITTING']), 0, 1)

In [None]:
target_1 = 'Activity'
target_2 = 'Activity_dynamic'

# Features와 두 Targets
X = train_data.drop([target_1, target_2], axis = 1)
y1 = train_data.loc[:, target_1]
y2 = train_data.loc[:, target_2]

X_train, X_valid, y_train, y_valid = train_test_split(X, y2, test_size = 0.3, random_state=1, stratify = y2)

# **3.단계별 모델링**

## 정적 / 동적 분류 모델

### **모든 Features**

- Validation Data 성능

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_valid)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[970   1]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



- Test Data 성능

In [None]:
target_1 = 'Activity'
target_2 = 'Activity_dynamic'

# Features와 두 Targets
X_test = test_data.drop([target_1, target_2], axis = 1)
y1 = test_data.loc[:, target_1]
y2 = test_data.loc[:, target_2]

y_pred = model.predict(X_test)

print(confusion_matrix(y2, y_pred))
print('='*60)
print(classification_report(y2, y_pred))

[[3233    1]
 [   0 2647]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3234
           1       1.00      1.00      1.00      2647

    accuracy                           1.00      5881
   macro avg       1.00      1.00      1.00      5881
weighted avg       1.00      1.00      1.00      5881



### **주요 Features**

- 중요 Features 추출

In [None]:
feature_importance = np.array(model.feature_importances_)
feature_name = np.array(list(X))

temp_dic = {
    'feature_name':feature_name,
    'feature_importance':feature_importance
      }

temp = pd.DataFrame(temp_dic)

# 변수의 특성 중요도 순으로 정렬하기
temp.sort_values(by=['feature_importance'], ascending=False, inplace=True)
temp.reset_index(drop=True, inplace = True)

important = temp

- Validation Data 성능

In [None]:
important = important.loc[:50, 'feature_name']

X_train_if = X_train[important]
X_valid_if = X_valid[important]

model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train_if, y_train)

y_pred = model.predict(X_valid_if)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[970   1]
 [  0 794]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1765
   macro avg       1.00      1.00      1.00      1765
weighted avg       1.00      1.00      1.00      1765



- Test Data 성능

In [None]:
target_1 = 'Activity'
target_2 = 'Activity_dynamic'

# Features와 두 Targets
X_test = test_data.drop([target_1, target_2], axis = 1)
y1 = test_data.loc[:, target_1]
y2 = test_data.loc[:, target_2]

X_test_if = X_test[important]
y_pred = model.predict(X_test_if)

print(confusion_matrix(y2, y_pred))
print('='*60)
print(classification_report(y2, y_pred))

[[3233    1]
 [   0 2647]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3234
           1       1.00      1.00      1.00      2647

    accuracy                           1.00      5881
   macro avg       1.00      1.00      1.00      5881
weighted avg       1.00      1.00      1.00      5881



## 정적 행동 세부 분류 모델

### 모든 Features

- Validation Data 성능

In [None]:
# 정적 데이터
train_data2 = train_data.loc[train_data['Activity_dynamic'] == 0]

target = 'Activity'

X = train_data2.drop([target,'Activity_dynamic'], axis = 1)
y = train_data2[target]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)


model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_valid)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[223   0   0]
 [  0 188  18]
 [  0   8 210]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       223
     SITTING       0.96      0.91      0.94       206
    STANDING       0.92      0.96      0.94       218

    accuracy                           0.96       647
   macro avg       0.96      0.96      0.96       647
weighted avg       0.96      0.96      0.96       647



- Test Data 성능

In [None]:
# 정적 데이터
test_data2 = test_data.loc[test_data['Activity_dynamic'] == 0]

target = 'Activity'

X_test = test_data2.drop([target,'Activity_dynamic'], axis = 1)
y_test = test_data2[target]

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('='*60)
print(classification_report(y_test, y_pred))

[[1115    0    0]
 [   0  983   49]
 [   0   17 1070]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00      1115
     SITTING       0.98      0.95      0.97      1032
    STANDING       0.96      0.98      0.97      1087

    accuracy                           0.98      3234
   macro avg       0.98      0.98      0.98      3234
weighted avg       0.98      0.98      0.98      3234



### **주요 Features**

- 중요 Features 추출

In [None]:
feature_importance = np.array(model.feature_importances_)
feature_name = np.array(list(X))

temp_dic = {
    'feature_name':feature_name,
    'feature_importance':feature_importance
      }

temp = pd.DataFrame(temp_dic)

# 변수의 특성 중요도 순으로 정렬하기
temp.sort_values(by=['feature_importance'], ascending=False, inplace=True)
temp.reset_index(drop=True, inplace = True)

important = temp

- Validation Data 성능

In [None]:
important = important.loc[:50, 'feature_name']

X_train_if = X_train[important]
X_valid_if = X_valid[important]

model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train_if, y_train)

y_pred = model.predict(X_valid_if)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[223   0   0]
 [  0 187  19]
 [  0   9 209]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       223
     SITTING       0.95      0.91      0.93       206
    STANDING       0.92      0.96      0.94       218

    accuracy                           0.96       647
   macro avg       0.96      0.96      0.96       647
weighted avg       0.96      0.96      0.96       647



- Test Data 성능

In [None]:
test_data2 = test_data.loc[test_data['Activity_dynamic'] == 0]

target_1 = 'Activity'
target_2 = 'Activity_dynamic'

# Features와 두 Targets
X_test = test_data2.drop([target_1, target_2], axis = 1)
y1 = test_data2.loc[:, target_1]
y2 = test_data2.loc[:, target_2]

X_test_if = X_test[important]
y_pred = model.predict(X_test_if)

print(confusion_matrix(y1, y_pred))
print('='*60)
print(classification_report(y1, y_pred))

[[1115    0    0]
 [   0  984   48]
 [   0   36 1051]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00      1115
     SITTING       0.96      0.95      0.96      1032
    STANDING       0.96      0.97      0.96      1087

    accuracy                           0.97      3234
   macro avg       0.97      0.97      0.97      3234
weighted avg       0.97      0.97      0.97      3234



## 동적 동작 세부 분류 모델

### 모든 Features

- Validation Data 성능

In [None]:
# 정적 데이터
train_data2 = train_data.loc[train_data['Activity_dynamic'] == 1]

target = 'Activity'

X = train_data2.drop([target,'Activity_dynamic'], axis = 1)
y = train_data2.loc[:, target]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)


model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_valid)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[192   5   3]
 [  3 151   4]
 [  0   1 171]]
                    precision    recall  f1-score   support

           WALKING       0.98      0.96      0.97       200
WALKING_DOWNSTAIRS       0.96      0.96      0.96       158
  WALKING_UPSTAIRS       0.96      0.99      0.98       172

          accuracy                           0.97       530
         macro avg       0.97      0.97      0.97       530
      weighted avg       0.97      0.97      0.97       530



- Test Data 성능

In [None]:
# 정적 데이터
test_data2 = test_data.loc[test_data['Activity_dynamic'] == 1]

target = 'Activity'

X_test = test_data2.drop([target,'Activity_dynamic'], axis = 1)
y_test = test_data2[target]

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('='*60)
print(classification_report(y_test, y_pred))

[[988   5   5]
 [  4 783   4]
 [  0   1 857]]
                    precision    recall  f1-score   support

           WALKING       1.00      0.99      0.99       998
WALKING_DOWNSTAIRS       0.99      0.99      0.99       791
  WALKING_UPSTAIRS       0.99      1.00      0.99       858

          accuracy                           0.99      2647
         macro avg       0.99      0.99      0.99      2647
      weighted avg       0.99      0.99      0.99      2647



### **주요 Features**

- 중요 Features 추출

In [None]:
feature_importance = np.array(model.feature_importances_)
feature_name = np.array(list(X))

temp_dic = {
    'feature_name':feature_name,
    'feature_importance':feature_importance
      }

temp = pd.DataFrame(temp_dic)

# 변수의 특성 중요도 순으로 정렬하기
temp.sort_values(by=['feature_importance'], ascending=False, inplace=True)
temp.reset_index(drop=True, inplace = True)

important = temp

- Validation Data 성능

In [None]:
important = important.loc[:50, 'feature_name']

X_train_if = X_train[important]
X_valid_if = X_valid[important]

model = RandomForestClassifier(max_depth = 7, n_estimators=100, random_state=42)
model.fit(X_train_if, y_train)

y_pred = model.predict(X_valid_if)

print(confusion_matrix(y_valid, y_pred))
print('='*60)
print(classification_report(y_valid, y_pred))

[[192   3   5]
 [  5 149   4]
 [  0   1 171]]
                    precision    recall  f1-score   support

           WALKING       0.97      0.96      0.97       200
WALKING_DOWNSTAIRS       0.97      0.94      0.96       158
  WALKING_UPSTAIRS       0.95      0.99      0.97       172

          accuracy                           0.97       530
         macro avg       0.97      0.97      0.97       530
      weighted avg       0.97      0.97      0.97       530



- Test Data 성능

In [None]:
test_data2 = test_data.loc[test_data['Activity_dynamic'] == 1]

target_1 = 'Activity'
target_2 = 'Activity_dynamic'

# Features와 두 Targets
X_test = test_data2.drop([target_1, target_2], axis = 1)
y1 = test_data2.loc[:, target_1]
y2 = test_data2.loc[:, target_2]

X_test_if = X_test[important]
y_pred = model.predict(X_test_if)

print(confusion_matrix(y1, y_pred))
print('='*60)
print(classification_report(y1, y_pred))

[[983   6   9]
 [ 10 775   6]
 [  2   1 855]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.98      0.99       998
WALKING_DOWNSTAIRS       0.99      0.98      0.99       791
  WALKING_UPSTAIRS       0.98      1.00      0.99       858

          accuracy                           0.99      2647
         macro avg       0.99      0.99      0.99      2647
      weighted avg       0.99      0.99      0.99      2647



## [추가] 분류 모델 결합 (함수화)


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류 모델 합치기

In [None]:
# 데이터셋 경로
path = 'drive/MyDrive/AIVLE/Mini_project/'

train_data = pd.read_csv(path + 'data01_train.csv')
test_data = pd.read_csv(path + 'data01_test.csv')

# Test Data는 실제로는 y 값이 주어지지 않는다.
X_test = test_data.drop(['subject','Activity'], axis = 1)
y_test = test_data.loc[:, 'Activity']

In [None]:
train_data.drop('subject',axis = 1, inplace = True)

In [None]:
# 전체 데이터셋 전처리
def add_feature(train_data):

  # (동적 / 정적) Feature 추가
  train_data['Activity_dynamic'] = np.where(train_data['Activity'].isin(['STANDING', 'LAYING', 'SITTING']), 0, 1)

  return train_data


# (동적 / 정적) 분류 모델 전처리
def model1_preprocessing(train_data):

  # Train 데이터
  X = train_data.drop(['Activity', 'Activity_dynamic'], axis = 1)
  y = train_data.loc[:, 'Activity_dynamic']

  return X, y


# 정적 분류 모델 전처리
def static_model_preprocssing(train_data):

  # 정적 Train 데이터
  static_train_data = train_data.loc[train_data['Activity_dynamic'] == 0]

  X = static_train_data.drop(['Activity'], axis = 1)
  y = static_train_data.loc[:, 'Activity']

  return X, y

# 동적 분류 모델 전처리
def dynamic_model_preprocssing(train_data):

  # 동적 Train 데이터
  dynamic_train_data = train_data.loc[train_data['Activity_dynamic'] == 1]

  X = dynamic_train_data.drop(['Activity'], axis = 1)
  y = dynamic_train_data.loc[:, 'Activity']

  return X, y

def multi_model(train_data, X_test):


  # Feature 추가 (동적/정적 여부)
  train_data = add_feature(train_data)



  # (동적/정적) 분류 모델 데이터 전처리
  X, y = model1_preprocessing(train_data)

  # Train, Valid 분할
  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state = 1, stratify = y)

  # 첫번째 모델 - (동적/정적) 분류
  model1 = RandomForestClassifier(max_depth=7, n_estimators=100, random_state = 1)
  model1.fit(X_train, y_train)




  # 정적 분류 모델 데이터 전처리
  X, y = static_model_preprocssing(train_data)

  # Train, Valid 분할
  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state = 1, stratify = y)

  # 두번째 모델 - 정적 행동 분류
  model2 = RandomForestClassifier(max_depth=7, n_estimators=100, random_state = 1)
  model2.fit(X_train, y_train)




  # 동적 분류 모델 데이터 전처리
  X, y = dynamic_model_preprocssing(train_data)

  # Train, Valid 분할
  X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state = 1, stratify = y)

  # 세번째 모델 - 동적 행동 분류
  model3 = RandomForestClassifier(max_depth=7, n_estimators=100, random_state = 1)
  model3.fit(X_train, y_train)




  # Test 예측
  y_pred = model1.predict(X_test)
  X_test['Activity_dynamic'] = y_pred

  X_test_static = X_test.loc[X_test['Activity_dynamic'] == 0]
  X_test_dynamic = X_test.loc[X_test['Activity_dynamic'] == 1]

  y_pred2 = model2.predict(X_test_static)
  X_test_static['Activity'] = y_pred2

  y_pred3 = model3.predict(X_test_dynamic)
  X_test_dynamic['Activity'] = y_pred3

  # 두 결과 결합
  result = pd.concat([X_test_static, X_test_dynamic], axis = 0)
  result = result.sort_index()


  return result

In [None]:
result = multi_model(train_data, X_test)

In [None]:
print(confusion_matrix(y_test, result['Activity']))
print(classification_report(y_test, result['Activity']))
print(accuracy_score(y_test, result['Activity']))

[[291   1   0   0   0   0]
 [  0 235  19   0   0   0]
 [  0  15 272   0   0   0]
 [  0   0   0 223   0   5]
 [  0   0   0   4 188   3]
 [  0   0   0   2   2 211]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       292
           SITTING       0.94      0.93      0.93       254
          STANDING       0.93      0.95      0.94       287
           WALKING       0.97      0.98      0.98       228
WALKING_DOWNSTAIRS       0.99      0.96      0.98       195
  WALKING_UPSTAIRS       0.96      0.98      0.97       215

          accuracy                           0.97      1471
         macro avg       0.97      0.97      0.97      1471
      weighted avg       0.97      0.97      0.97      1471

0.9653297076818491
