# 데이터 준비

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import *
from sklearn.impute import *
from sklearn.metrics import *
import pandas as pd
import numpy as np
import random
import warnings
import copy
import matplotlib.pyplot as plt
import datetime
import re
import math
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('./complete_data(2020 추가).csv')

# 피처정리

In [2]:
# action 데이터의 결측은 모두 0으로 생각합니다.
for col in df.columns:
    if col.startswith('action'):
        df[col].fillna(0, inplace=True)

In [3]:
# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["('Post_sleepProblem',)"] = df["('Post_sleepProblem',)"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# 4는 0, 나머지는 1로 바꿔줍니다.
df["('Post_dream',)"] = df["('Post_dream',)"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["sleep", "amCondition", "amEmotion"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["sleepProblem"] = df["sleepProblem"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# "('Post_dream',)"은 4는 0, 나머지는 1로 바꿔줍니다.
df["dream"] = df["dream"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["pmEmotion", "pmStress", "pmFatigue"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# bmi 피처를 생성합니다.
df['bmi'] = df['weight'] / (df['height']/100)**2
df.drop(columns=['height', 'weight'], inplace=True)

In [4]:
# 비슷한 의미를 가진 피처를 통합합니다.
df['action_recreation_media'] = df['action_recreation_media'] + df['action_entertainment'] + df['action_hobby'] + df['action_recreation_etc']
df.drop(columns=['action_entertainment', 'action_hobby', 'action_recreation_etc'], inplace=True)

df['action_community_interaction'] = df['action_community_interaction'] + df['action_socialising']
df.drop(columns=['action_socialising'], inplace=True)

df['place_other_indoor'] = df['place_other_indoor'] + df['place_restaurant']

In [5]:
# 큰 범주 데이터만 포함시키고 Sub는 제거합니다.
df.drop(columns=['actionOption_751', 'actionOption_793', 'actionSubOption_1',
       'actionSubOption_2', 'actionSubOption_3', 'actionSubOption_4',
       'actionSubOption_5', 'conditionSub1Option_1', 'conditionSub1Option_2',
       'conditionSub1Option_3', 'conditionSub1Option_4',
       'conditionSub1Option_5'], inplace=True)

df.drop(columns=['activity_3', 'activity_4', 'activity_5', 'activity_7', 'activity_8'], inplace=True)

In [6]:
# 날짜가 주기성을 가지도록 변경합니다.

# date 칼럼을 datetime 형식으로 변환합니다.
df['date'] = pd.to_datetime(df['date'])
df["('Pre_startDt',)"] = pd.to_datetime(df["('Pre_startDt',)"])
df["('Pre_endDt',)"] = pd.to_datetime(df["('Pre_endDt',)"])

# month 칼럼에 달 정보를 저장합니다.
df['month'] = df['date'].apply(lambda x: x.month)

start_time = df["('Pre_startDt',)"].dt.hour
start_time = df["('Pre_startDt',)"].dt.minute
end_time = df["('Pre_endDt',)"].dt.hour
end_time = df["('Pre_endDt',)"].dt.minute

for i in range(len(start_time)):
    if pd.notnull(start_time[i]):
        hour = start_time[i]
        minute = start_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 객체로 변환하여 저장
        start_time[i] = hour

    if pd.notnull(end_time[i]):
        hour = end_time[i]
        minute = end_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 
        end_time[i] = hour
        
# sin, cos 변환
start_sin = np.sin(2 * np.pi * start_time / 24)
start_cos = np.cos(2 * np.pi * start_time / 24)

end_sin = np.sin(2 * np.pi * end_time / 24)
end_cos = np.cos(2 * np.pi * end_time / 24)

df['pre_start_sin']  = start_sin
df['pre_start_cos']  = start_cos

df['pre_end_sin']  = end_sin
df['pre_end_cos']  = end_cos

# sin, cos 변환
month_sin = np.sin(2 * np.pi * start_time / 24)
month_cos = np.cos(2 * np.pi * start_time / 24)

df['month_sin']  = month_sin
df['month_cos']  = month_cos

df = df.drop(columns=["('Pre_startDt',)", "('Pre_endDt',)", 'month'])

In [7]:
# 피처 형식을 맞춥니다.
df.loc[df['gender'] == 'F', 'gender'] = 0
df.loc[df['gender'] == 'M', 'gender'] = 1
df['gender'] = df['gender'].astype(float)
df['age'] = df['age'].astype(float)

# 데이터셋을 나눕니다

In [8]:
# 타겟 데이터가 결측인 경우는 제거합니다.
target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df.dropna(subset=target, inplace=True)
df.drop_duplicates(keep='last', inplace=True)
df.reset_index(inplace=True, drop=True)

In [9]:
# 사용할 피처를 선별하고, target을 정합니다.
now_common = ['userId', 'gender', 'age', 'action_personal_care', 'action_sleep',
          'action_work', 'action_study', 'action_household', 'action_recreation_media',
          'action_outdoor_act','action_community_interaction', 'action_travel', 'action_meal',
          'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor', 'avg_emotionPositive',
          'avg_emotionTension', 'activity_0', 'activity_1', 'activity_2', 'pre_start_sin', 'pre_start_cos', 'pre_end_sin', 'pre_end_cos', 'sleep', 'sleepProblem', 
          'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'condition_ALONE', 'condition_NOT_ALONE', "('Pre_wakeupcount',)"]
now_2018_2019 = ['action_communitiy_interaction', "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)"]
now_2020 = ['bmi', 'action_care_housemem', 'action_shop', "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)", "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)", "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)", "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)", 'pmFatigue']

next_common = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df = df[now_common + now_2018_2019 + now_2020 + next_common]

In [10]:
# 데이터셋을 2018, 2019, 2020으로 분리합니다.
df2018_2019 = df[df['userId'].str.startswith('2018')]
df2018 = df2018_2019[df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2018.dropna(axis=1, how='all', inplace=True)

df2019 = df2018_2019[~df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2019.dropna(axis=1, how='all', inplace=True)

df2020 = df[df['userId'].str.startswith('2020')]
df2020.dropna(axis=1, how='all', inplace=True)

# 향후 사용을 위해 참여 년도를 피처로 추가해둡니다.
df['pat_year_2018'] = [1] * len(df2018) + [0] * len(df2019) + [0] * len(df2020)
df['pat_year_2019'] = [0] * len(df2018) + [1] * len(df2019) + [0] * len(df2020)
df['pat_year_2020'] = [0] * len(df2018) + [0] * len(df2019) + [1] * len(df2020)

In [11]:
# 전체 데이터셋을 저장해둡니다.
total_df = copy.deepcopy(df)

In [12]:
# 범주형 피처와 수치형 피처를 구분합니다.
category = ['gender', 'sleep', 'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'pmFatigue']
numeric = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media',
           'action_care_housemem', 'action_shop', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', 'bmi']

# 단일모델

In [13]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 2018데이터 준비
    user_ids = df2018["userId"].unique()
    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x)

    train_2018 = df2018[df2018["userId"].isin(train_user_ids)]
    test_2018 = df2018[df2018["userId"].isin(test_user_ids)]
    
    train_2018.drop(columns=['userId'], inplace=True)
    test_2018.drop(columns=['userId'], inplace=True)
    
    # 2019데이터 준비
    user_ids = df2019["userId"].unique()
    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x)

    train_2019 = df2019[df2019["userId"].isin(train_user_ids)]
    test_2019 = df2019[df2019["userId"].isin(test_user_ids)]
    
    train_2019.drop(columns=['userId'], inplace=True)
    test_2019.drop(columns=['userId'], inplace=True)

    # 2020데이터 준비
    user_ids = df2020["userId"].unique()
    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x)

    train_2020 = df2020[df2020["userId"].isin(train_user_ids)]
    test_2020 = df2020[df2020["userId"].isin(test_user_ids)]

    train_2020.drop(columns=['userId'], inplace=True)
    test_2020.drop(columns=['userId'], inplace=True)
    
    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_2018[col].fillna(train_2018[col].mode()[0], inplace=True)
            test_2018[col].fillna(train_2018[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            train_2019[col].fillna(train_2019[col].mode()[0], inplace=True)
            test_2019[col].fillna(train_2019[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            train_2020[col].fillna(train_2020[col].mode()[0], inplace=True)
            test_2020[col].fillna(train_2020[col].mode()[0], inplace=True)
        except:
            continue           
    
    # Mice 모델로 결측치를 대체합니다.
    # 2018
    mice_imputer = IterativeImputer()
    try:
        train_2018_imputed = mice_imputer.fit_transform(train_2018.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2018_imputed = pd.DataFrame(train_2018_imputed, columns=train_2018.columns)
    train_2018_imputed = train_2018_imputed.where(train_2018_imputed >= train_2018.min(), train_2018.min(), axis=1)
    train_2018_imputed = train_2018_imputed.where(train_2018_imputed <= train_2018.max(), train_2018.max(), axis=1)
    
    try:
        test_2018_imputed = mice_imputer.transform(test_2018.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2018_imputed = pd.DataFrame(test_2018_imputed, columns=test_2018.columns)
    test_2018_imputed = test_2018_imputed.where(test_2018_imputed >= train_2018.min(), train_2018.min(), axis=1)
    test_2018_imputed = test_2018_imputed.where(test_2018_imputed <= train_2018.max(), train_2018.max(), axis=1)
    
    # 2019
    mice_imputer = IterativeImputer()
    try:
        train_2019_imputed = mice_imputer.fit_transform(train_2019.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2019_imputed = pd.DataFrame(train_2019_imputed, columns=train_2019.columns)
    train_2019_imputed = train_2019_imputed.where(train_2019_imputed >= train_2019.min(), train_2019.min(), axis=1)
    train_2019_imputed = train_2019_imputed.where(train_2019_imputed <= train_2019.max(), train_2019.max(), axis=1)

    try:
        test_2019_imputed = mice_imputer.transform(test_2019.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2019_imputed = pd.DataFrame(test_2019_imputed, columns=test_2019.columns)
    test_2019_imputed = test_2019_imputed.where(test_2019_imputed >= train_2019.min(), train_2019.min(), axis=1)
    test_2019_imputed = test_2019_imputed.where(test_2019_imputed <= train_2019.max(), train_2019.max(), axis=1)
    
    # 2020
    mice_imputer = IterativeImputer()
    try:
        train_2020_imputed = mice_imputer.fit_transform(train_2020.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2020_imputed = pd.DataFrame(train_2020_imputed, columns=train_2020.columns)
    train_2020_imputed = train_2020_imputed.where(train_2020_imputed >= train_2020.min(), train_2020.min(), axis=1)
    train_2020_imputed = train_2020_imputed.where(train_2020_imputed <= train_2020.max(), train_2020.max(), axis=1)

    try:
        test_2020_imputed = mice_imputer.transform(test_2020.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2020_imputed = pd.DataFrame(test_2020_imputed, columns=test_2020.columns)
    test_2020_imputed = test_2020_imputed.where(test_2020_imputed >= train_2020.min(), train_2020.min(), axis=1)
    test_2020_imputed = test_2020_imputed.where(test_2020_imputed <= train_2020.max(), train_2020.max(), axis=1)
    
    # 2018
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)",
            "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)"]
    
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2018]
    for target_var in target:
        y_pred = []
        
        train_2018 = copy.deepcopy(train_2018_imputed)
        test_2018 = copy.deepcopy(test_2018_imputed)
        
        while len(test_2018) > 0:
            X_train = train_2018.drop(target, axis=1)
            y_train = train_2018[target_var]

            X_test = pd.DataFrame([test_2018.drop(target, axis=1).iloc[0]], columns=X_train.columns)
            y_test = test_2018[target_var].iloc[0]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred.append(lr.predict(X_test).tolist()[0])
            
            train_2018 = train_2018.append(pd.DataFrame([test_2018.iloc[0]], columns=train_2018_imputed.columns))
            test_2018 = test_2018.iloc[1:]
        
        accuracy = accuracy_score(test_2018_imputed[target_var], y_pred)
        accuracy = round(accuracy * 100, 0)
        tmp_result.append(accuracy)

    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2019
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)"]
    
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2019]
    for target_var in target:
        y_pred = []
        
        train_2019 = copy.deepcopy(train_2019_imputed)
        test_2019 = copy.deepcopy(test_2019_imputed)
        
        while len(test_2019) > 0:
            X_train = train_2019.drop(target, axis=1)
            y_train = train_2019[target_var]

            X_test = pd.DataFrame([test_2019.drop(target, axis=1).iloc[0]], columns=X_train.columns)
            y_test = test_2019[target_var].iloc[0]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred.append(lr.predict(X_test).tolist()[0])
            
            train_2019 = train_2019.append(pd.DataFrame([test_2019.iloc[0]], columns=train_2019_imputed.columns))
            test_2019 = test_2019.iloc[1:]

        accuracy = accuracy_score(test_2019_imputed[target_var], y_pred)
        accuracy = round(accuracy * 100, 0)
        tmp_result.append(accuracy)

    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

    # 2020
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
           'action_work', 'action_study', 'action_household',
           'action_recreation_media', 'action_care_housemem', 'action_shop',
           'action_outdoor_act', 'action_community_interaction', 'action_travel',
           'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
           'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
           'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
           'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
            "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
            "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
            "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]

    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2020]
    for target_var in target:
        y_pred = []
        
        train_2020 = copy.deepcopy(train_2020_imputed)
        test_2020 = copy.deepcopy(test_2020_imputed)
        
        while len(test_2020) > 0:
            X_train = train_2020.drop(target, axis=1)
            y_train = train_2020[target_var]

            X_test = pd.DataFrame([test_2020.drop(target, axis=1).iloc[0]], columns=X_train.columns)
            y_test = test_2020[target_var].iloc[0]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred.append(lr.predict(X_test).tolist()[0])
            
            train_2020 = train_2020.append(pd.DataFrame([test_2020.iloc[0]], columns=train_2020_imputed.columns))
            test_2020 = test_2020.iloc[1:]
        
        accuracy = accuracy_score(test_2020_imputed[target_var], y_pred)
        accuracy = round(accuracy * 100, 0)
        tmp_result.append(accuracy)
        
    
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [14]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [15]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [16]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 53.68, 표준편차: 10.16
Post_sleepProblem 평균: 59.11, 표준편차: 7.74
Post_dream 평균: 69.68, 표준편차: 7.43
Post_amCondition 평균: 54.46, 표준편차: 8.59
Post_amEmotion 평균: 57.78, 표준편차: 8.79

2019년도 데이터
Post_sleep 평균: 43.05, 표준편차: 7.79
Post_sleepProblem 평균: 55.02, 표준편차: 7.73
Post_dream 평균: 73.58, 표준편차: 7.17
Post_amCondition 평균: 51.18, 표준편차: 7.58
Post_amEmotion 평균: 43.19, 표준편차: 6.93

2020년도 데이터
Post_sleep 평균: 48.69, 표준편차: 5.59
Post_sleepProblem 평균: 59.40, 표준편차: 4.02
Post_dream 평균: 67.33, 표준편차: 5.97
Post_amCondition 평균: 47.70, 표준편차: 6.94
Post_amEmotion 평균: 49.23, 표준편차: 5.92


In [17]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 2018데이터 준비
    user_ids = df2018["userId"].unique()
    test_user_id = [random.choice(user_ids)]
    user_ids = user_ids[user_ids != test_user_id[0]]

    train_2018 = df2018[df2018["userId"].isin(user_ids)]
    test_2018 = df2018[df2018["userId"].isin(test_user_id)]
    
    train_2018.drop(columns=['userId'], inplace=True)
    test_2018.drop(columns=['userId'], inplace=True)
    
    # 2019데이터 준비
    user_ids = df2019["userId"].unique()
    test_user_id = [random.choice(user_ids)]
    user_ids = user_ids[user_ids != test_user_id[0]]

    train_2019 = df2019[df2019["userId"].isin(user_ids)]
    test_2019 = df2019[df2019["userId"].isin(test_user_id)]
    
    train_2019.drop(columns=['userId'], inplace=True)
    test_2019.drop(columns=['userId'], inplace=True)

    # 2020데이터 준비
    user_ids = df2020["userId"].unique()
    test_user_id = [random.choice(user_ids)]
    user_ids = user_ids[user_ids != test_user_id[0]]

    train_2020 = df2020[df2020["userId"].isin(user_ids)]
    test_2020 = df2020[df2020["userId"].isin(test_user_id)]
    
    train_2020.drop(columns=['userId'], inplace=True)
    test_2020.drop(columns=['userId'], inplace=True)
    
    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_2018[col].fillna(train_2018[col].mode()[0], inplace=True)
            test_2018[col].fillna(train_2018[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            train_2019[col].fillna(train_2019[col].mode()[0], inplace=True)
            test_2019[col].fillna(train_2019[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            train_2020[col].fillna(train_2020[col].mode()[0], inplace=True)
            test_2020[col].fillna(train_2020[col].mode()[0], inplace=True)
        except:
            continue           
    
    # Mice 모델로 결측치를 대체합니다.
    # 2018
    mice_imputer = IterativeImputer()
    try:
        train_2018_imputed = mice_imputer.fit_transform(train_2018.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2018_imputed = pd.DataFrame(train_2018_imputed, columns=train_2018.columns)
    train_2018_imputed = train_2018_imputed.where(train_2018_imputed >= train_2018.min(), train_2018.min(), axis=1)
    train_2018_imputed = train_2018_imputed.where(train_2018_imputed <= train_2018.max(), train_2018.max(), axis=1)
    
    try:
        test_2018_imputed = mice_imputer.transform(test_2018.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2018_imputed = pd.DataFrame(test_2018_imputed, columns=test_2018.columns)
    test_2018_imputed = test_2018_imputed.where(test_2018_imputed >= train_2018.min(), train_2018.min(), axis=1)
    test_2018_imputed = test_2018_imputed.where(test_2018_imputed <= train_2018.max(), train_2018.max(), axis=1)
    
    # 2019
    mice_imputer = IterativeImputer()
    try:
        train_2019_imputed = mice_imputer.fit_transform(train_2019.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2019_imputed = pd.DataFrame(train_2019_imputed, columns=train_2019.columns)
    train_2019_imputed = train_2019_imputed.where(train_2019_imputed >= train_2019.min(), train_2019.min(), axis=1)
    train_2019_imputed = train_2019_imputed.where(train_2019_imputed <= train_2019.max(), train_2019.max(), axis=1)

    try:
        test_2019_imputed = mice_imputer.transform(test_2019.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2019_imputed = pd.DataFrame(test_2019_imputed, columns=test_2019.columns)
    test_2019_imputed = test_2019_imputed.where(test_2019_imputed >= train_2019.min(), train_2019.min(), axis=1)
    test_2019_imputed = test_2019_imputed.where(test_2019_imputed <= train_2019.max(), train_2019.max(), axis=1)
    
    # 2020
    mice_imputer = IterativeImputer()
    try:
        train_2020_imputed = mice_imputer.fit_transform(train_2020.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_2020_imputed = pd.DataFrame(train_2020_imputed, columns=train_2020.columns)
    train_2020_imputed = train_2020_imputed.where(train_2020_imputed >= train_2020.min(), train_2020.min(), axis=1)
    train_2020_imputed = train_2020_imputed.where(train_2020_imputed <= train_2020.max(), train_2020.max(), axis=1)

    try:
        test_2020_imputed = mice_imputer.transform(test_2020.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_2020_imputed = pd.DataFrame(test_2020_imputed, columns=test_2020.columns)
    test_2020_imputed = test_2020_imputed.where(test_2020_imputed >= train_2020.min(), train_2020.min(), axis=1)
    test_2020_imputed = test_2020_imputed.where(test_2020_imputed <= train_2020.max(), train_2020.max(), axis=1)
    
    # 2018
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)",
            "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)"]
    
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2018]
    for target_var in target:
        
        train_2018 = copy.deepcopy(train_2018_imputed)
        test_2018 = copy.deepcopy(test_2018_imputed)
        
        tmp_accuracy = []
        
        while len(test_2018) > 0:
            X_train = train_2018.drop(target, axis=1)
            y_train = train_2018[target_var]

            X_test = test_2018.drop(target, axis=1)
            y_test = test_2018[target_var]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)
        
            accuracy = accuracy_score(y_test, y_pred)
            accuracy = round(accuracy * 100, 0)
            tmp_accuracy.append(accuracy)
            
            train_2018 = train_2018.append(pd.DataFrame([test_2018.iloc[0]], columns=train_2018_imputed.columns))
            test_2018 = test_2018.iloc[1:]
        
        max_index = 0
        for i in range(1, len(tmp_accuracy)):
            if tmp_accuracy[i] > tmp_accuracy[0]:
                max_index = i
                break

        tmp_result.append(max_index)
        
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)
    
    # 2019
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)"]
    
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2019]
    for target_var in target:
        
        train_2019 = copy.deepcopy(train_2019_imputed)
        test_2019 = copy.deepcopy(test_2019_imputed)
        
        tmp_accuracy = []
        
        while len(test_2019) > 0:
            X_train = train_2019.drop(target, axis=1)
            y_train = train_2019[target_var]

            X_test = test_2019.drop(target, axis=1)
            y_test = test_2019[target_var]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)
        
            accuracy = accuracy_score(y_test, y_pred)
            accuracy = round(accuracy * 100, 0)
            tmp_accuracy.append(accuracy)
            
            train_2019 = train_2019.append(pd.DataFrame([test_2019.iloc[0]], columns=train_2019_imputed.columns))
            test_2019 = test_2019.iloc[1:]
        
        max_index = 0
        for i in range(1, len(tmp_accuracy)):
            if tmp_accuracy[i] > tmp_accuracy[0]:
                max_index = i
                break

        tmp_result.append(max_index)
        
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

    # 2020
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
           'action_work', 'action_study', 'action_household',
           'action_recreation_media', 'action_care_housemem', 'action_shop',
           'action_outdoor_act', 'action_community_interaction', 'action_travel',
           'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
           'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
           'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
           'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
            "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
            "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
            "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]

    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x, 2020]
    for target_var in target:
        
        train_2020 = copy.deepcopy(train_2020_imputed)
        test_2020 = copy.deepcopy(test_2020_imputed)
        
        tmp_accuracy = []
        
        while len(test_2020) > 0:
            X_train = train_2020.drop(target, axis=1)
            y_train = train_2020[target_var]

            X_test = test_2020.drop(target, axis=1)
            y_test = test_2020[target_var]

            # 정규화 과정을 진행합니다.
            train_mean = X_train[numeric_cols].mean()
            train_std = X_train[numeric_cols].std() + 0.000001

            X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
            X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
            
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)
        
            accuracy = accuracy_score(y_test, y_pred)
            accuracy = round(accuracy * 100, 0)
            tmp_accuracy.append(accuracy)
            
            train_2020 = train_2020.append(pd.DataFrame([test_2020.iloc[0]], columns=train_2020_imputed.columns))
            test_2020 = test_2020.iloc[1:]
        
        max_index = 0
        for i in range(1, len(tmp_accuracy)):
            if tmp_accuracy[i] > tmp_accuracy[0]:
                max_index = i
                break

        tmp_result.append(max_index)
        
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [18]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [19]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [20]:
# 각 년도에 대한 변수의 평균과 분산 계산
df_2018 = result_df[result_df['year'] == 2018]
df_2019 = result_df[result_df['year'] == 2019]
df_2020 = result_df[result_df['year'] == 2020]

variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']
for df in [df_2018, df_2019, df_2020]:
    year = df['year'].values[0]
    print(f"\n{year}년도 데이터")
    for var in variables:
        mean = np.mean(df[var])
        variance = math.sqrt(np.var(df[var]))
        print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")


2018년도 데이터
Post_sleep 평균: 4.53, 표준편차: 4.36
Post_sleepProblem 평균: 5.13, 표준편차: 4.62
Post_dream 평균: 5.54, 표준편차: 8.17
Post_amCondition 평균: 2.12, 표준편차: 2.00
Post_amEmotion 평균: 6.29, 표준편차: 7.01

2019년도 데이터
Post_sleep 평균: 1.76, 표준편차: 2.11
Post_sleepProblem 평균: 2.27, 표준편차: 2.66
Post_dream 평균: 2.06, 표준편차: 2.44
Post_amCondition 평균: 1.37, 표준편차: 1.52
Post_amEmotion 평균: 1.68, 표준편차: 2.05

2020년도 데이터
Post_sleep 평균: 2.25, 표준편차: 2.55
Post_sleepProblem 평균: 4.82, 표준편차: 6.44
Post_dream 평균: 3.88, 표준편차: 3.97
Post_amCondition 평균: 3.07, 표준편차: 3.84
Post_amEmotion 평균: 5.32, 표준편차: 6.52
