# 데이터 준비

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import *
from sklearn.impute import *
from sklearn.metrics import *
import pandas as pd
import numpy as np
import random
import warnings
import copy
import matplotlib.pyplot as plt
import datetime
import re
import math
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('./complete_data(2020 추가).csv')

# 피처정리

In [2]:
# action 데이터의 결측은 모두 0으로 생각합니다.
for col in df.columns:
    if col.startswith('action'):
        df[col].fillna(0, inplace=True)

In [3]:
# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["('Post_sleepProblem',)"] = df["('Post_sleepProblem',)"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# 4는 0, 나머지는 1로 바꿔줍니다.
df["('Post_dream',)"] = df["('Post_dream',)"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["sleep", "amCondition", "amEmotion"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["sleepProblem"] = df["sleepProblem"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# "('Post_dream',)"은 4는 0, 나머지는 1로 바꿔줍니다.
df["dream"] = df["dream"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["pmEmotion", "pmStress", "pmFatigue"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# bmi 피처를 생성합니다.
df['bmi'] = df['weight'] / (df['height']/100)**2
df.drop(columns=['height', 'weight'], inplace=True)

In [4]:
# 비슷한 의미를 가진 피처를 통합합니다.
df['action_recreation_media'] = df['action_recreation_media'] + df['action_entertainment'] + df['action_hobby'] + df['action_recreation_etc']
df.drop(columns=['action_entertainment', 'action_hobby', 'action_recreation_etc'], inplace=True)

df['action_community_interaction'] = df['action_community_interaction'] + df['action_socialising']
df.drop(columns=['action_socialising'], inplace=True)

df['place_other_indoor'] = df['place_other_indoor'] + df['place_restaurant']

In [5]:
# 큰 범주 데이터만 포함시키고 Sub는 제거합니다.
df.drop(columns=['actionOption_751', 'actionOption_793', 'actionSubOption_1',
       'actionSubOption_2', 'actionSubOption_3', 'actionSubOption_4',
       'actionSubOption_5', 'conditionSub1Option_1', 'conditionSub1Option_2',
       'conditionSub1Option_3', 'conditionSub1Option_4',
       'conditionSub1Option_5'], inplace=True)

df.drop(columns=['activity_3', 'activity_4', 'activity_5', 'activity_7', 'activity_8'], inplace=True)

In [6]:
# 날짜가 주기성을 가지도록 변경합니다.

# date 칼럼을 datetime 형식으로 변환합니다.
df['date'] = pd.to_datetime(df['date'])
df["('Pre_startDt',)"] = pd.to_datetime(df["('Pre_startDt',)"])
df["('Pre_endDt',)"] = pd.to_datetime(df["('Pre_endDt',)"])

# month 칼럼에 달 정보를 저장합니다.
df['month'] = df['date'].apply(lambda x: x.month)

start_time = df["('Pre_startDt',)"].dt.hour
start_time = df["('Pre_startDt',)"].dt.minute
end_time = df["('Pre_endDt',)"].dt.hour
end_time = df["('Pre_endDt',)"].dt.minute

for i in range(len(start_time)):
    if pd.notnull(start_time[i]):
        hour = start_time[i]
        minute = start_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 객체로 변환하여 저장
        start_time[i] = hour

    if pd.notnull(end_time[i]):
        hour = end_time[i]
        minute = end_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 
        end_time[i] = hour
        
# sin, cos 변환
start_sin = np.sin(2 * np.pi * start_time / 24)
start_cos = np.cos(2 * np.pi * start_time / 24)

end_sin = np.sin(2 * np.pi * end_time / 24)
end_cos = np.cos(2 * np.pi * end_time / 24)

df['pre_start_sin']  = start_sin
df['pre_start_cos']  = start_cos

df['pre_end_sin']  = end_sin
df['pre_end_cos']  = end_cos

# sin, cos 변환
month_sin = np.sin(2 * np.pi * start_time / 24)
month_cos = np.cos(2 * np.pi * start_time / 24)

df['month_sin']  = month_sin
df['month_cos']  = month_cos

df = df.drop(columns=["('Pre_startDt',)", "('Pre_endDt',)", 'month'])

In [7]:
# 피처 형식을 맞춥니다.
df.loc[df['gender'] == 'F', 'gender'] = 0
df.loc[df['gender'] == 'M', 'gender'] = 1
df['gender'] = df['gender'].astype(float)
df['age'] = df['age'].astype(float)

# 데이터셋을 나눕니다

In [8]:
# 타겟 데이터가 결측인 경우는 제거합니다.
target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df.dropna(subset=target, inplace=True)
df.drop_duplicates(keep='last', inplace=True)
df.reset_index(inplace=True, drop=True)

In [9]:
# 사용할 피처를 선별하고, target을 정합니다.
now_common = ['userId', 'gender', 'age', 'action_personal_care', 'action_sleep',
          'action_work', 'action_study', 'action_household', 'action_recreation_media',
          'action_outdoor_act','action_community_interaction', 'action_travel', 'action_meal',
          'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor', 'avg_emotionPositive',
          'avg_emotionTension', 'activity_0', 'activity_1', 'activity_2', 'pre_start_sin', 'pre_start_cos', 'pre_end_sin', 'pre_end_cos', 'sleep', 'sleepProblem', 
          'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'condition_ALONE', 'condition_NOT_ALONE', "('Pre_wakeupcount',)"]
now_2018_2019 = ['action_communitiy_interaction', "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)"]
now_2020 = ['bmi', 'action_care_housemem', 'action_shop', "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)", "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)", "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)", "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)", 'pmFatigue']

next_common = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df = df[now_common + now_2018_2019 + now_2020 + next_common]

In [10]:
# 데이터셋을 2018, 2019, 2020으로 분리합니다.
df2018_2019 = df[df['userId'].str.startswith('2018')]
df2018 = df2018_2019[df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2018.dropna(axis=1, how='all', inplace=True)

df2019 = df2018_2019[~df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2019.dropna(axis=1, how='all', inplace=True)

df2020 = df[df['userId'].str.startswith('2020')]
df2020.dropna(axis=1, how='all', inplace=True)

# 향후 사용을 위해 참여 년도를 피처로 추가해둡니다.
df['pat_year_2018'] = [1] * len(df2018) + [0] * len(df2019) + [0] * len(df2020)
df['pat_year_2019'] = [0] * len(df2018) + [1] * len(df2019) + [0] * len(df2020)
df['pat_year_2020'] = [0] * len(df2018) + [0] * len(df2019) + [1] * len(df2020)

In [11]:
# 전체 데이터셋을 저장해둡니다.
total_df = copy.deepcopy(df)

In [12]:
# 범주형 피처와 수치형 피처를 구분합니다.
category = ['gender', 'sleep', 'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'pmFatigue']
numeric = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media',
           'action_care_housemem', 'action_shop', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', 'bmi']

# 데이터 탐색

In [13]:
len(df)

1340

In [14]:
df['sleep'].value_counts()

2.0    633
1.0    378
0.0    329
Name: sleep, dtype: int64

In [15]:
df["sleepProblem"].value_counts()

0.0    720
1.0    620
Name: sleepProblem, dtype: int64

In [16]:
df["dream"].value_counts()

0.0    832
1.0    508
Name: dream, dtype: int64

In [17]:
df["amCondition"].value_counts()

0.0    633
2.0    361
1.0    346
Name: amCondition, dtype: int64

In [18]:
df["amEmotion"].value_counts()

1.0    619
2.0    406
0.0    315
Name: amEmotion, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 73 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   userId                                     1340 non-null   object 
 1   gender                                     1340 non-null   float64
 2   age                                        1340 non-null   float64
 3   action_personal_care                       1340 non-null   float64
 4   action_sleep                               1340 non-null   float64
 5   action_work                                1340 non-null   float64
 6   action_study                               1340 non-null   float64
 7   action_household                           1340 non-null   float64
 8   action_recreation_media                    1340 non-null   float64
 9   action_outdoor_act                         1340 non-null   float64
 10  action_community_interac

In [20]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2018[col].fillna(df2018[col].mode()[0], inplace=True)
    except:
        continue

df2018_imputed = mice_imputer.fit_transform(df2018.drop(columns=['userId']).values)
df2018_imputed = pd.DataFrame(df2018_imputed, columns=df2018.columns[1:])

In [21]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2019[col].fillna(df2019[col].mode()[0], inplace=True)
    except:
        continue

df2019_imputed = mice_imputer.fit_transform(df2019.drop(columns=['userId']).values)
df2019_imputed = pd.DataFrame(df2019_imputed, columns=df2019.columns[1:])

In [22]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2020[col].fillna(df2020[col].mode()[0], inplace=True)
    except:
        continue

df2020_imputed = mice_imputer.fit_transform(df2020.drop(columns=['userId']).values)
df2020_imputed = pd.DataFrame(df2020_imputed, columns=df2020.columns[1:])

# 단순통합(최대)

In [23]:
# 데이터를 균일하게 나누기 위해 year 컬럼을 생성합니다.
year = []
for id in total_df['userId']:
    if id in df2018['userId'].unique(): year.append('2018')
    elif id in df2019['userId'].unique(): year.append('2019')
    elif id in df2020['userId'].unique(): year.append('2020')
total_df['year'] = year

In [24]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 데이터 준비
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()


    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]
    
    train_total.drop(columns=['userId', 'year'], inplace=True)
    year = test_total['year'].reset_index(drop=True)
    test_total.drop(columns=['userId', 'year'], inplace=True)

    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
           'action_work', 'action_study', 'action_household',
           'action_recreation_media', 'action_care_housemem', 'action_shop',
           'action_outdoor_act', 'action_community_interaction', 'action_travel',
           'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
           'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
           'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
            "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)",
           'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
            "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
            "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
            "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]
    
    test_total_imputed.reset_index(inplace=True, drop=True)
    test_2018 = test_total_imputed[year =='2018']
    test_2019 = test_total_imputed[year =='2019']
    test_2020 = test_total_imputed[year =='2020']
    
    # 2018
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2018]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]

        X_2018_test = test_2018.drop(target, axis=1)
        y_2018_test = test_2018[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_2018_test[numeric_cols] = (X_2018_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2018_test)
        accuracy = accuracy_score(y_2018_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2019
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2019]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]
        
        X_2019_test = test_2019.drop(target, axis=1)
        y_2019_test = test_2019[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_2019_test[numeric_cols] = (X_2019_test[numeric_cols] - train_mean) / train_std        

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2019_test)
        accuracy = accuracy_score(y_2019_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2020
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2020]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]
        
        X_2020_test = test_2020.drop(target, axis=1)
        y_2020_test = test_2020[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std       
        X_2020_test[numeric_cols] = (X_2020_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2020_test)
        accuracy = accuracy_score(y_2020_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [25]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [26]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [27]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 55.45, 표준편차: 9.30
Post_sleepProblem 평균: 60.65, 표준편차: 7.70
Post_dream 평균: 67.65, 표준편차: 10.12
Post_amCondition 평균: 48.17, 표준편차: 9.33
Post_amEmotion 평균: 54.54, 표준편차: 8.86

2019년도 데이터
Post_sleep 평균: 39.59, 표준편차: 8.69
Post_sleepProblem 평균: 58.97, 표준편차: 8.94
Post_dream 평균: 64.47, 표준편차: 10.32
Post_amCondition 평균: 50.30, 표준편차: 9.41
Post_amEmotion 평균: 44.65, 표준편차: 8.09

2020년도 데이터
Post_sleep 평균: 43.05, 표준편차: 5.92
Post_sleepProblem 평균: 52.61, 표준편차: 5.67
Post_dream 평균: 60.14, 표준편차: 6.90
Post_amCondition 평균: 43.13, 표준편차: 6.50
Post_amEmotion 평균: 46.00, 표준편차: 6.82


# 단순통합(최소)

In [28]:
# 컬럼들의 교집합을 구하기 위해 df를 복사합니다.
total_df_copy = copy.deepcopy(total_df)

In [29]:
control = list(set(df2018_imputed.columns) & set(df2019_imputed.columns) & set(df2020_imputed.columns))
control.append('userId')
control.append('year')

In [30]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 데이터 준비
    total_df = copy.deepcopy(total_df_copy)
    total_df = total_df[control]
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()

    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]
    
    train_total.drop(columns=['userId', 'year'], inplace=True)
    year = test_total['year'].reset_index(drop=True)
    test_total.drop(columns=['userId', 'year'], inplace=True)

    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols_tmp = ['action_personal_care', 'avg_emotionTension'
                     'action_sleep', 'place_other_indoor', 'action_recreation_media', 'place_outdoor',
                     'action_meal', 'place_workplace', 'action_community_interaction', 'activity_0',
                     'condition_NOT_ALONE', 'condition_ALONE', 'action_travel', 'avg_emotionPositive', 'activity_2',
                     'place_home', 'action_study', 'age', 'action_household', 'action_outdoor_act', 'action_work', 'activity_1']
    numeric_cols = []
    for cols_tmp in numeric_cols_tmp:
        if cols_tmp in control: numeric_cols.append(cols_tmp)
        
    test_total_imputed.reset_index(inplace=True, drop=True)
    test_2018 = test_total_imputed[year =='2018']
    test_2019 = test_total_imputed[year =='2019']
    test_2020 = test_total_imputed[year =='2020']
    
    # 2018
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2018]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]

        X_2018_test = test_2018.drop(target, axis=1)
        y_2018_test = test_2018[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_2018_test[numeric_cols] = (X_2018_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2018_test)
        accuracy = accuracy_score(y_2018_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2019
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2019]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]
        
        X_2019_test = test_2019.drop(target, axis=1)
        y_2019_test = test_2019[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_2019_test[numeric_cols] = (X_2019_test[numeric_cols] - train_mean) / train_std        

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2019_test)
        accuracy = accuracy_score(y_2019_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2020
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2020]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]
        
        X_2020_test = test_2020.drop(target, axis=1)
        y_2020_test = test_2020[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std       
        X_2020_test[numeric_cols] = (X_2020_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_2020_test)
        accuracy = accuracy_score(y_2020_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [31]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [32]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [33]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 56.37, 표준편차: 10.45
Post_sleepProblem 평균: 59.67, 표준편차: 7.64
Post_dream 평균: 67.71, 표준편차: 10.15
Post_amCondition 평균: 51.39, 표준편차: 10.32
Post_amEmotion 평균: 56.58, 표준편차: 9.14

2019년도 데이터
Post_sleep 평균: 40.33, 표준편차: 7.98
Post_sleepProblem 평균: 60.70, 표준편차: 8.28
Post_dream 평균: 68.21, 표준편차: 9.44
Post_amCondition 평균: 47.30, 표준편차: 9.21
Post_amEmotion 평균: 44.45, 표준편차: 8.36

2020년도 데이터
Post_sleep 평균: 44.07, 표준편차: 6.97
Post_sleepProblem 평균: 59.59, 표준편차: 4.72
Post_dream 평균: 66.09, 표준편차: 6.08
Post_amCondition 평균: 41.89, 표준편차: 7.09
Post_amEmotion 평균: 46.02, 표준편차: 6.76


# 데이터 통합 준비

In [34]:
def stack(x):
    stack_train1 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m1_3", "m2_1", "m2_2", "m2_3", "m3_1", "m3_2", "m3_3", 'label'])
    stack_test1 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m1_3", "m2_1", "m2_2", "m2_3", "m3_1", "m3_2", "m3_3", 'label'])

    stack_train2 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m2_1", "m2_2", "m3_1", "m3_2", 'label'])
    stack_test2 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m2_1", "m2_2", "m3_1", "m3_2", 'label'])

    # 데이터 준비
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()


    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]

    train_pat = train_total[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].values
    train_total.drop(columns=['userId', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', 'year'], inplace=True)
    test_pat = test_total[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].values
    test_total.drop(columns=['userId', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', 'year'], inplace=True)
    
    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)
    train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']] = train_pat

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']] = test_pat
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
       'action_work', 'action_study', 'action_household',
       'action_recreation_media', 'action_care_housemem', 'action_shop',
       'action_outdoor_act', 'action_community_interaction', 'action_travel',
       'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
       'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
       'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
        "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)",
       'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
        "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
        "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
        "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]
    
    # target 변수에 대해 각각 lr 모델을 학습시키고 stack을 쌓습니다.
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var].values
        
        X_train_2018 = train_total_imputed[train_total_imputed['pat_year_2018']==1].drop(target, axis=1)
        train_mean = X_train_2018[numeric_cols].mean()
        train_std = X_train_2018[numeric_cols].std() + 0.000001
        X_train_2018[numeric_cols] = (X_train_2018[numeric_cols] - train_mean) / train_std
        y_train_2018 = train_total_imputed[train_total_imputed['pat_year_2018']==1][target_var].values
        
        X_train_2019 = train_total_imputed[train_total_imputed['pat_year_2019']==1].drop(target, axis=1)
        train_mean = X_train_2019[numeric_cols].mean()
        train_std = X_train_2019[numeric_cols].std() + 0.000001
        X_train_2019[numeric_cols] = (X_train_2019[numeric_cols] - train_mean) / train_std
        y_train_2019 = train_total_imputed[train_total_imputed['pat_year_2019']==1][target_var].values
        
        X_train_2020 = train_total_imputed[train_total_imputed['pat_year_2020']==1].drop(target, axis=1)
        train_mean = X_train_2020[numeric_cols].mean()
        train_std = X_train_2020[numeric_cols].std() + 0.000001
        X_train_2020[numeric_cols] = (X_train_2020[numeric_cols] - train_mean) / train_std
        y_train_2020 = train_total_imputed[train_total_imputed['pat_year_2020']==1][target_var].values
        
        X_test = test_total_imputed.drop(target, axis=1)
        y_test = test_total_imputed[target_var].values

        # 데이터 정규화를 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
        
        # 예측이 3개로 이루어지는 경우와 2개로 나누어지는 경우를 나눕니다
        if target_var in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
            lr1 = LogisticRegression()
            lr1.fit(X_train_2018, y_train_2018)
            
            lr4 = LogisticRegression()
            lr4.fit(X_train_2019, y_train_2019) 
            
            lr7 = LogisticRegression()
            lr7.fit(X_train_2020, y_train_2020)   
            
            X_train_stack1 = lr1.predict_proba(X_train[lr1.feature_names_in_.tolist()])
            X_train_stack2 = lr4.predict_proba(X_train[lr4.feature_names_in_.tolist()])
            X_train_stack3 = lr7.predict_proba(X_train[lr7.feature_names_in_.tolist()])
            
            X_test_stack1 = lr1.predict_proba(X_test[lr1.feature_names_in_.tolist()])
            X_test_stack2 = lr4.predict_proba(X_test[lr4.feature_names_in_.tolist()])
            X_test_stack3 = lr7.predict_proba(X_test[lr7.feature_names_in_.tolist()])
            
            X_train_stack_tmp = np.concatenate(([[x, target_var]] * len(X_train_stack1), train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_train_stack1, X_train_stack2, X_train_stack3, y_train.reshape(len(X_train_stack1), -1)), axis=1)
            X_train_stack_tmp = pd.DataFrame(X_train_stack_tmp, columns=stack_train1.columns)
            stack_train1 = pd.concat([stack_train1, X_train_stack_tmp], axis=0, ignore_index = True)
             
            X_test_stack_tmp = np.concatenate(([[x, target_var]] * len(X_test_stack1), test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_test_stack1, X_test_stack2, X_test_stack3, y_test.reshape(len(X_test_stack1), -1)), axis=1)
            X_test_stack_tmp = pd.DataFrame(X_test_stack_tmp, columns=stack_test1.columns)
            stack_test1 = pd.concat([stack_test1, X_test_stack_tmp], axis=0, ignore_index = True)
            
        elif target_var in ["('Post_sleepProblem',)", "('Post_dream',)"]:
            lr2 = LogisticRegression()
            lr2.fit(X_train_2018, y_train_2018)
            
            lr5 = LogisticRegression()
            lr5.fit(X_train_2019, y_train_2019) 
            
            lr8 = LogisticRegression()
            lr8.fit(X_train_2020, y_train_2020)       
        
            X_train_stack1 = lr2.predict_proba(X_train[lr2.feature_names_in_.tolist()])
            X_train_stack2 = lr5.predict_proba(X_train[lr5.feature_names_in_.tolist()])
            X_train_stack3 = lr8.predict_proba(X_train[lr8.feature_names_in_.tolist()])

            X_test_stack1 = lr2.predict_proba(X_test[lr2.feature_names_in_.tolist()])
            X_test_stack2 = lr5.predict_proba(X_test[lr5.feature_names_in_.tolist()])
            X_test_stack3 = lr8.predict_proba(X_test[lr8.feature_names_in_.tolist()])
            
            X_train_stack_tmp = np.concatenate(([[x, target_var]] * len(X_train_stack1), train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_train_stack1, X_train_stack2, X_train_stack3, y_train.reshape(len(X_train_stack1), -1)), axis=1)
            X_train_stack_tmp = pd.DataFrame(X_train_stack_tmp, columns=stack_train2.columns)
            stack_train2 = pd.concat([stack_train2, X_train_stack_tmp], axis=0, ignore_index = True)
                
            X_test_stack_tmp = np.concatenate(([[x, target_var]] * len(X_test_stack1), test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_test_stack1, X_test_stack2, X_test_stack3, y_test.reshape(len(X_test_stack1), -1)), axis=1)
            X_test_stack_tmp = pd.DataFrame(X_test_stack_tmp, columns=stack_test2.columns)
            stack_test2 = pd.concat([stack_test2, X_test_stack_tmp], axis=0, ignore_index = True)
        
    return stack_train1, stack_test1, stack_train2, stack_test2

# 데이터 통합1: 보팅

In [35]:
def test(x):
    global result_df

    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp2018_1 = stack_test1[stack_test1['pat_year_2018'].astype('int') == 1]
    tmp2019_1 = stack_test1[stack_test1['pat_year_2019'].astype('int') == 1]
    tmp2020_1 = stack_test1[stack_test1['pat_year_2020'].astype('int') == 1]
    
    tmp2018_2 = stack_test2[stack_test2['pat_year_2018'].astype('int') == 1]
    tmp2019_2 = stack_test2[stack_test2['pat_year_2019'].astype('int') == 1]
    tmp2020_2 = stack_test2[stack_test2['pat_year_2020'].astype('int') == 1]
    
    tmp_result = [x, 2018]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2018_1_tmp = tmp2018_1[tmp2018_1['target_var']==col].reset_index(drop=True)

        df1 = tmp2018_1_tmp.iloc[:,[5, 8, 11]].astype(float).mean(axis=1)
        df2 = tmp2018_1_tmp.iloc[:,[6, 9, 12]].astype(float).mean(axis=1)
        df3 = tmp2018_1_tmp.iloc[:,[7, 10, 13]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2018_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2018_2_tmp = tmp2018_2[tmp2018_2['target_var']==col].reset_index(drop=True)
        
        df1 = tmp2018_2_tmp.iloc[:,[5, 7, 9]].astype(float).mean(axis=1)
        df2 = tmp2018_2_tmp.iloc[:,[6, 8, 10]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2018_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))
    
    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    tmp_result = [x, 2019]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2019_1_tmp = tmp2019_1[tmp2019_1['target_var']==col].reset_index(drop=True)

        df1 = tmp2019_1_tmp.iloc[:,[5, 8, 11]].astype(float).mean(axis=1)
        df2 = tmp2019_1_tmp.iloc[:,[6, 9, 12]].astype(float).mean(axis=1)
        df3 = tmp2019_1_tmp.iloc[:,[7, 10, 13]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2019_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2019_2_tmp = tmp2019_2[tmp2019_2['target_var']==col].reset_index(drop=True)
        
        df1 = tmp2019_2_tmp.iloc[:,[5, 7, 9]].astype(float).mean(axis=1)
        df2 = tmp2019_2_tmp.iloc[:,[6, 8, 10]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2019_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))
    
    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    tmp_result = [x, 2020]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2020_1_tmp = tmp2020_1[tmp2020_1['target_var']==col].reset_index(drop=True)

        df1 = tmp2020_1_tmp.iloc[:,[5, 8, 11]].astype(float).mean(axis=1)
        df2 = tmp2020_1_tmp.iloc[:,[6, 9, 12]].astype(float).mean(axis=1)
        df3 = tmp2020_1_tmp.iloc[:,[7, 10, 13]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2020_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2020_2_tmp = tmp2020_2[tmp2020_2['target_var']==col].reset_index(drop=True)
        
        df1 = tmp2020_2_tmp.iloc[:,[5, 7, 9]].astype(float).mean(axis=1)
        df2 = tmp2020_2_tmp.iloc[:,[6, 8, 10]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2020_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))
    
    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [39]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns= ['seed', 'year', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [40]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

101에서 오류가 발생했습니다.
101에서 오류가 발생했습니다.


In [41]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 50.90, 표준편차: 8.76
Post_sleepProblem 평균: 61.69, 표준편차: 8.47
Post_dream 평균: 66.15, 표준편차: 10.22
Post_amCondition 평균: 44.65, 표준편차: 8.48
Post_amEmotion 평균: 51.67, 표준편차: 9.54

2019년도 데이터
Post_sleep 평균: 40.42, 표준편차: 8.59
Post_sleepProblem 평균: 57.49, 표준편차: 9.08
Post_dream 평균: 66.45, 표준편차: 10.23
Post_amCondition 평균: 49.30, 표준편차: 10.44
Post_amEmotion 평균: 41.66, 표준편차: 8.93

2020년도 데이터
Post_sleep 평균: 40.08, 표준편차: 6.33
Post_sleepProblem 평균: 52.55, 표준편차: 5.77
Post_dream 평균: 57.83, 표준편차: 8.15
Post_amCondition 평균: 40.53, 표준편차: 7.03
Post_amEmotion 평균: 41.27, 표준편차: 6.35


# 데이터 통합2: 가중치 보팅 

In [42]:
def test(x, m1, m2, m3):
    global result_df
    
    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp2018_1 = stack_test1[stack_test1['pat_year_2018'].astype('int') == 1]
    tmp2019_1 = stack_test1[stack_test1['pat_year_2019'].astype('int') == 1]
    tmp2020_1 = stack_test1[stack_test1['pat_year_2020'].astype('int') == 1]
    
    tmp2018_2 = stack_test2[stack_test2['pat_year_2018'].astype('int') == 1]
    tmp2019_2 = stack_test2[stack_test2['pat_year_2019'].astype('int') == 1]
    tmp2020_2 = stack_test2[stack_test2['pat_year_2020'].astype('int') == 1]
    
    tmp_result = [x, 2018]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2018_1_tmp = tmp2018_1[tmp2018_1['target_var']==col].reset_index(drop=True)
        
        tmp2018_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']] = tmp2018_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']].astype('float')
        tmp2018_1_tmp[['m1_1', 'm1_2', 'm1_3']] *= m1
        
        df1 = (tmp2018_1_tmp['m1_1'] + tmp2018_1_tmp['m2_1'] + tmp2018_1_tmp['m3_1']) / 3
        df2 = (tmp2018_1_tmp['m1_2'] + tmp2018_1_tmp['m2_2'] + tmp2018_1_tmp['m3_2']) / 3
        df3 = (tmp2018_1_tmp['m1_3'] + tmp2018_1_tmp['m2_3'] + tmp2018_1_tmp['m3_3']) / 3

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2018_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2018_2_tmp = tmp2018_2[tmp2018_2['target_var']==col].reset_index(drop=True)

        tmp2018_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']] = tmp2018_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']].astype('float')
        tmp2018_2_tmp[['m1_1', 'm1_2']] *= m1
        
        df1 = (tmp2018_2_tmp['m1_1'] + tmp2018_2_tmp['m2_1'] +tmp2018_2_tmp['m3_1']) / 3
        df2 = (tmp2018_2_tmp['m1_2'] + tmp2018_2_tmp['m2_2'] +tmp2018_2_tmp['m3_2']) / 3

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2018_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    tmp_result = [x, 2019]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2019_1_tmp = tmp2019_1[tmp2019_1['target_var']==col].reset_index(drop=True)
        
        tmp2019_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']] = tmp2019_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']].astype('float')
        tmp2019_1_tmp[['m2_1', 'm2_2', 'm2_3']] *= m2
        
        df1 = (tmp2019_1_tmp['m1_1'] + tmp2019_1_tmp['m2_1'] + tmp2019_1_tmp['m3_1']) / 3
        df2 = (tmp2019_1_tmp['m1_2'] + tmp2019_1_tmp['m2_2'] + tmp2019_1_tmp['m3_2']) / 3
        df3 = (tmp2019_1_tmp['m1_3'] + tmp2019_1_tmp['m2_3'] + tmp2019_1_tmp['m3_3']) / 3

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2019_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2019_2_tmp = tmp2019_2[tmp2019_2['target_var']==col].reset_index(drop=True)

        tmp2019_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']] = tmp2019_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']].astype('float')
        tmp2019_2_tmp[['m2_1', 'm2_2']] *= m2
        
        df1 = (tmp2019_2_tmp['m1_1'] + tmp2019_2_tmp['m2_1'] +tmp2019_2_tmp['m3_1']) / 3
        df2 = (tmp2019_2_tmp['m1_2'] + tmp2019_2_tmp['m2_2'] +tmp2019_2_tmp['m3_2']) / 3

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2019_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    tmp_result = [x, 2020]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2020_1_tmp = tmp2020_1[tmp2020_1['target_var']==col].reset_index(drop=True)
        
        tmp2020_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']] = tmp2020_1_tmp[['m1_1', 'm1_2', 'm1_3', 'm2_1', 'm2_2', 'm2_3', 'm3_1', 'm3_2', 'm3_3']].astype('float')
        tmp2020_1_tmp[['m3_1', 'm3_2', 'm2_3']] *= m3
        
        df1 = (tmp2020_1_tmp['m1_1'] + tmp2020_1_tmp['m2_1'] + tmp2020_1_tmp['m3_1']) / 3
        df2 = (tmp2020_1_tmp['m1_2'] + tmp2020_1_tmp['m2_2'] + tmp2020_1_tmp['m3_2']) / 3
        df3 = (tmp2020_1_tmp['m1_3'] + tmp2020_1_tmp['m2_3'] + tmp2020_1_tmp['m3_3']) / 3

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(tmp2020_1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2020_2_tmp = tmp2020_2[tmp2020_2['target_var']==col].reset_index(drop=True)

        tmp2020_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']] = tmp2020_2_tmp[['m1_1', 'm1_2', 'm2_1', 'm2_2', 'm3_1', 'm3_2']].astype('float')
        tmp2020_2_tmp[['m3_1', 'm3_2']] *= m3
        
        df1 = (tmp2020_2_tmp['m1_1'] + tmp2020_2_tmp['m2_1'] +tmp2020_2_tmp['m3_1']) / 3
        df2 = (tmp2020_2_tmp['m1_2'] + tmp2020_2_tmp['m2_2'] +tmp2020_2_tmp['m3_2']) / 3

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(tmp2020_2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [43]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=['seed', 'year', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [44]:
# 10개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i, 1.5, 1.5, 1.5)

101에서 오류가 발생했습니다.
101에서 오류가 발생했습니다.


In [45]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 50.79, 표준편차: 8.83
Post_sleepProblem 평균: 62.16, 표준편차: 8.44
Post_dream 평균: 66.60, 표준편차: 9.60
Post_amCondition 평균: 44.49, 표준편차: 8.45
Post_amEmotion 평균: 52.75, 표준편차: 9.83

2019년도 데이터
Post_sleep 평균: 38.75, 표준편차: 8.55
Post_sleepProblem 평균: 56.67, 표준편차: 8.73
Post_dream 평균: 67.51, 표준편차: 10.24
Post_amCondition 평균: 49.49, 표준편차: 10.29
Post_amEmotion 평균: 40.70, 표준편차: 9.06

2020년도 데이터
Post_sleep 평균: 40.46, 표준편차: 6.41
Post_sleepProblem 평균: 53.45, 표준편차: 5.92
Post_dream 평균: 59.16, 표준편차: 7.45
Post_amCondition 평균: 41.45, 표준편차: 7.21
Post_amEmotion 평균: 41.26, 표준편차: 6.25


# 데이터 통합3: 스태킹

In [46]:
def test(x):
    global result_df
        
    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp2018_train_1 = stack_train1[stack_train1['pat_year_2018'].astype('int') == 1]
    tmp2019_train_1 = stack_train1[stack_train1['pat_year_2019'].astype('int') == 1]
    tmp2020_train_1 = stack_train1[stack_train1['pat_year_2020'].astype('int') == 1]
    
    tmp2018_train_2 = stack_train2[stack_train2['pat_year_2018'].astype('int') == 1]
    tmp2019_train_2 = stack_train2[stack_train2['pat_year_2019'].astype('int') == 1]
    tmp2020_train_2 = stack_train2[stack_train2['pat_year_2020'].astype('int') == 1]
    
    tmp2018_test_1 = stack_test1[stack_test1['pat_year_2018'].astype('int') == 1]
    tmp2019_test_1 = stack_test1[stack_test1['pat_year_2019'].astype('int') == 1]
    tmp2020_test_1 = stack_test1[stack_test1['pat_year_2020'].astype('int') == 1]
    
    tmp2018_test_2 = stack_test2[stack_test2['pat_year_2018'].astype('int') == 1]
    tmp2019_test_2 = stack_test2[stack_test2['pat_year_2019'].astype('int') == 1]
    tmp2020_test_2 = stack_test2[stack_test2['pat_year_2020'].astype('int') == 1]    

    
    tmp_result = [x, 2018]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2018_train_1_tmp = tmp2018_train_1[tmp2018_train_1['target_var']==col].reset_index(drop=True)
        tmp2018_test_1_tmp = tmp2018_test_1[tmp2018_test_1['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2018_train_1_tmp.iloc[:,5:-1]
        y_train = tmp2018_train_1_tmp.iloc[:,-1]
        
        X_test = tmp2018_test_1_tmp.iloc[:,5:-1]
        y_test = tmp2018_test_1_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2018_train_2_tmp = tmp2018_train_2[tmp2018_train_2['target_var']==col].reset_index(drop=True)
        tmp2018_test_2_tmp = tmp2018_test_2[tmp2018_test_2['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2018_train_2_tmp.iloc[:,5:-1]
        y_train = tmp2018_train_2_tmp.iloc[:,-1]
        
        X_test = tmp2018_test_2_tmp.iloc[:,5:-1]
        y_test = tmp2018_test_2_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))


    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

    tmp_result = [x, 2019]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2019_train_1_tmp = tmp2019_train_1[tmp2019_train_1['target_var']==col].reset_index(drop=True)
        tmp2019_test_1_tmp = tmp2019_test_1[tmp2019_test_1['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2019_train_1_tmp.iloc[:,5:-1]
        y_train = tmp2019_train_1_tmp.iloc[:,-1]
        
        X_test = tmp2019_test_1_tmp.iloc[:,5:-1]
        y_test = tmp2019_test_1_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2019_train_2_tmp = tmp2019_train_2[tmp2019_train_2['target_var']==col].reset_index(drop=True)
        tmp2019_test_2_tmp = tmp2019_test_2[tmp2019_test_2['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2019_train_2_tmp.iloc[:,5:-1]
        y_train = tmp2019_train_2_tmp.iloc[:,-1]
        
        X_test = tmp2019_test_2_tmp.iloc[:,5:-1]
        y_test = tmp2019_test_2_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))


    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    tmp_result = [x, 2020]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        tmp2020_train_1_tmp = tmp2020_train_1[tmp2020_train_1['target_var']==col].reset_index(drop=True)
        tmp2020_test_1_tmp = tmp2020_test_1[tmp2020_test_1['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2020_train_1_tmp.iloc[:,5:-1]
        y_train = tmp2020_train_1_tmp.iloc[:,-1]
        
        X_test = tmp2020_test_1_tmp.iloc[:,5:-1]
        y_test = tmp2020_test_1_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        tmp2020_train_2_tmp = tmp2020_train_2[tmp2020_train_2['target_var']==col].reset_index(drop=True)
        tmp2020_test_2_tmp = tmp2020_test_2[tmp2020_test_2['target_var']==col].reset_index(drop=True)
        
        X_train = tmp2020_train_2_tmp.iloc[:,5:-1]
        y_train = tmp2020_train_2_tmp.iloc[:,-1]
        
        X_test = tmp2020_test_2_tmp.iloc[:,5:-1]
        y_test = tmp2020_test_2_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))


    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [47]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=['seed', 'year', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [48]:
# 10개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

101에서 오류가 발생했습니다.
101에서 오류가 발생했습니다.


In [49]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 50.60, 표준편차: 9.52
Post_sleepProblem 평균: 60.27, 표준편차: 8.11
Post_dream 평균: 67.02, 표준편차: 8.03
Post_amCondition 평균: 41.83, 표준편차: 8.23
Post_amEmotion 평균: 49.42, 표준편차: 9.42

2019년도 데이터
Post_sleep 평균: 35.09, 표준편차: 7.42
Post_sleepProblem 평균: 54.99, 표준편차: 7.85
Post_dream 평균: 67.73, 표준편차: 9.81
Post_amCondition 평균: 47.30, 표준편차: 8.45
Post_amEmotion 평균: 38.04, 표준편차: 8.47

2020년도 데이터
Post_sleep 평균: 42.89, 표준편차: 6.56
Post_sleepProblem 평균: 52.11, 표준편차: 6.86
Post_dream 평균: 58.44, 표준편차: 7.20
Post_amCondition 평균: 41.38, 표준편차: 6.34
Post_amEmotion 평균: 41.62, 표준편차: 6.94
