# 데이터 준비

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import *
from sklearn.impute import *
from sklearn.metrics import *
import pandas as pd
import numpy as np
import random
import warnings
import copy
import matplotlib.pyplot as plt
import datetime
import re
import math
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('./complete_data(2020 추가).csv')

# 피처정리

In [2]:
# action 데이터의 결측은 모두 0으로 생각합니다.
for col in df.columns:
    if col.startswith('action'):
        df[col].fillna(0, inplace=True)

In [3]:
# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["('Post_sleepProblem',)"] = df["('Post_sleepProblem',)"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# 4는 0, 나머지는 1로 바꿔줍니다.
df["('Post_dream',)"] = df["('Post_dream',)"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["sleep", "amCondition", "amEmotion"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# 0은 0, 나머지는 1로 바꿔줍니다.
df["sleepProblem"] = df["sleepProblem"].map({0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1})

# "('Post_dream',)"은 4는 0, 나머지는 1로 바꿔줍니다.
df["dream"] = df["dream"].map({4:0, 1:1, 2:1, 3:1})

# 1과 2를 0, 3은 1, 4와 5는 2로 바꿔줍니다.
for var in ["pmEmotion", "pmStress", "pmFatigue"]:
    df[var] = df[var].map({1:0, 2:0, 3:1, 4:2, 5:2})

# bmi 피처를 생성합니다.
df['bmi'] = df['weight'] / (df['height']/100)**2
df.drop(columns=['height', 'weight'], inplace=True)

In [4]:
# 비슷한 의미를 가진 피처를 통합합니다.
df['action_recreation_media'] = df['action_recreation_media'] + df['action_entertainment'] + df['action_hobby'] + df['action_recreation_etc']
df.drop(columns=['action_entertainment', 'action_hobby', 'action_recreation_etc'], inplace=True)

df['action_community_interaction'] = df['action_community_interaction'] + df['action_socialising']
df.drop(columns=['action_socialising'], inplace=True)

df['place_other_indoor'] = df['place_other_indoor'] + df['place_restaurant']

In [5]:
# 큰 범주 데이터만 포함시키고 Sub는 제거합니다.
df.drop(columns=['actionOption_751', 'actionOption_793', 'actionSubOption_1',
       'actionSubOption_2', 'actionSubOption_3', 'actionSubOption_4',
       'actionSubOption_5', 'conditionSub1Option_1', 'conditionSub1Option_2',
       'conditionSub1Option_3', 'conditionSub1Option_4',
       'conditionSub1Option_5'], inplace=True)

df.drop(columns=['activity_3', 'activity_4', 'activity_5', 'activity_7', 'activity_8'], inplace=True)

In [6]:
# 날짜가 주기성을 가지도록 변경합니다.

# date 칼럼을 datetime 형식으로 변환합니다.
df['date'] = pd.to_datetime(df['date'])
df["('Pre_startDt',)"] = pd.to_datetime(df["('Pre_startDt',)"])
df["('Pre_endDt',)"] = pd.to_datetime(df["('Pre_endDt',)"])

# month 칼럼에 달 정보를 저장합니다.
df['month'] = df['date'].apply(lambda x: x.month)

start_time = df["('Pre_startDt',)"].dt.hour
start_time = df["('Pre_startDt',)"].dt.minute
end_time = df["('Pre_endDt',)"].dt.hour
end_time = df["('Pre_endDt',)"].dt.minute

for i in range(len(start_time)):
    if pd.notnull(start_time[i]):
        hour = start_time[i]
        minute = start_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 객체로 변환하여 저장
        start_time[i] = hour

    if pd.notnull(end_time[i]):
        hour = end_time[i]
        minute = end_time[i]

        # 30분 이상이면 1시간 추가, 30분 미만이면 버림
        if minute >= 30:
            hour += 1
        else:
            minute = 0

        # 시간 값이 24 이상이면 24를 뺀 나머지 값으로 변경
        hour = hour % 24

        # datetime 
        end_time[i] = hour
        
# sin, cos 변환
start_sin = np.sin(2 * np.pi * start_time / 24)
start_cos = np.cos(2 * np.pi * start_time / 24)

end_sin = np.sin(2 * np.pi * end_time / 24)
end_cos = np.cos(2 * np.pi * end_time / 24)

df['pre_start_sin']  = start_sin
df['pre_start_cos']  = start_cos

df['pre_end_sin']  = end_sin
df['pre_end_cos']  = end_cos

# sin, cos 변환
month_sin = np.sin(2 * np.pi * start_time / 24)
month_cos = np.cos(2 * np.pi * start_time / 24)

df['month_sin']  = month_sin
df['month_cos']  = month_cos

df = df.drop(columns=["('Pre_startDt',)", "('Pre_endDt',)", 'month'])

In [7]:
# 피처 형식을 맞춥니다.
df.loc[df['gender'] == 'F', 'gender'] = 0
df.loc[df['gender'] == 'M', 'gender'] = 1
df['gender'] = df['gender'].astype(float)
df['age'] = df['age'].astype(float)

# 데이터셋을 나눕니다

In [8]:
# 타겟 데이터가 결측인 경우는 제거합니다.
target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df.dropna(subset=target, inplace=True)
df.drop_duplicates(keep='last', inplace=True)
df.reset_index(inplace=True, drop=True)

In [9]:
# 사용할 피처를 선별하고, target을 정합니다.
now_common = ['userId', 'gender', 'age', 'action_personal_care', 'action_sleep',
          'action_work', 'action_study', 'action_household', 'action_recreation_media',
          'action_outdoor_act','action_community_interaction', 'action_travel', 'action_meal',
          'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor', 'avg_emotionPositive',
          'avg_emotionTension', 'activity_0', 'activity_1', 'activity_2', 'pre_start_sin', 'pre_start_cos', 'pre_end_sin', 'pre_end_cos', 'sleep', 'sleepProblem', 
          'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'condition_ALONE', 'condition_NOT_ALONE', "('Pre_wakeupcount',)"]
now_2018_2019 = ['action_communitiy_interaction', "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)"]
now_2020 = ['bmi', 'action_care_housemem', 'action_shop', "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)", "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)", "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)", "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)", 'pmFatigue']

next_common = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]

df = df[now_common + now_2018_2019 + now_2020 + next_common]

In [10]:
# 데이터셋을 2018, 2019, 2020으로 분리합니다.
df2018_2019 = df[df['userId'].str.startswith('2018')]
df2018 = df2018_2019[df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2018.dropna(axis=1, how='all', inplace=True)

df2019 = df2018_2019[~df2018_2019['userId'].str.contains("[a-zA-Z]{3}\d{3}$")]
df2019.dropna(axis=1, how='all', inplace=True)

df2020 = df[df['userId'].str.startswith('2020')]
df2020.dropna(axis=1, how='all', inplace=True)

# 향후 사용을 위해 참여 년도를 피처로 추가해둡니다.
df['pat_year_2018'] = [1] * len(df2018) + [0] * len(df2019) + [0] * len(df2020)
df['pat_year_2019'] = [0] * len(df2018) + [1] * len(df2019) + [0] * len(df2020)
df['pat_year_2020'] = [0] * len(df2018) + [0] * len(df2019) + [1] * len(df2020)

In [11]:
# 전체 데이터셋을 저장해둡니다.
total_df = copy.deepcopy(df)

In [12]:
# 범주형 피처와 수치형 피처를 구분합니다.
category = ['gender', 'sleep', 'sleepProblem', 'dream', 'amCondition', 'amEmotion', 'pmEmotion', 'pmStress', 'alcohol', 'caffeine', 'pmFatigue']
numeric = ['age', 'action_personal_care',
           'action_sleep', 'action_communitiy_interaction', 'action_work',
           'action_study', 'action_household', 'action_recreation_media',
           'action_care_housemem', 'action_shop', 'action_outdoor_act',
           'action_community_interaction', 'action_travel', 'action_meal',
           'place_home', 'place_workplace', 'place_outdoor', 'place_other_indoor',
           'avg_emotionPositive', 'avg_emotionTension', 'activity_0', 'activity_1',
           'activity_2', 'condition_ALONE', 'condition_NOT_ALONE', 'bmi']

# 데이터 탐색

In [13]:
len(df)

1340

In [14]:
df['sleep'].value_counts()

2.0    633
1.0    378
0.0    329
Name: sleep, dtype: int64

In [15]:
df["sleepProblem"].value_counts()

0.0    720
1.0    620
Name: sleepProblem, dtype: int64

In [16]:
df["dream"].value_counts()

0.0    832
1.0    508
Name: dream, dtype: int64

In [17]:
df["amCondition"].value_counts()

0.0    633
2.0    361
1.0    346
Name: amCondition, dtype: int64

In [18]:
df["amEmotion"].value_counts()

1.0    619
2.0    406
0.0    315
Name: amEmotion, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 73 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   userId                                     1340 non-null   object 
 1   gender                                     1340 non-null   float64
 2   age                                        1340 non-null   float64
 3   action_personal_care                       1340 non-null   float64
 4   action_sleep                               1340 non-null   float64
 5   action_work                                1340 non-null   float64
 6   action_study                               1340 non-null   float64
 7   action_household                           1340 non-null   float64
 8   action_recreation_media                    1340 non-null   float64
 9   action_outdoor_act                         1340 non-null   float64
 10  action_community_interac

In [20]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2018[col].fillna(df2018[col].mode()[0], inplace=True)
    except:
        continue

df2018_imputed = mice_imputer.fit_transform(df2018.drop(columns=['userId']).values)
df2018_imputed = pd.DataFrame(df2018_imputed, columns=df2018.columns[1:])

In [21]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2019[col].fillna(df2019[col].mode()[0], inplace=True)
    except:
        continue

df2019_imputed = mice_imputer.fit_transform(df2019.drop(columns=['userId']).values)
df2019_imputed = pd.DataFrame(df2019_imputed, columns=df2019.columns[1:])

In [22]:
mice_imputer = IterativeImputer()

# category 데이터에서는 최빈값(mode)으로 대체합니다.
for col in category:
    try:
        df2020[col].fillna(df2020[col].mode()[0], inplace=True)
    except:
        continue

df2020_imputed = mice_imputer.fit_transform(df2020.drop(columns=['userId']).values)
df2020_imputed = pd.DataFrame(df2020_imputed, columns=df2020.columns[1:])

In [24]:
import scipy.stats as stats

for col in category:
    try:
        # bmi 카이제곱검정
        tmp_2018 = df2018_imputed[col].value_counts().sort_index().values
        tmp_2019 = df2019_imputed[col].value_counts().sort_index().values
        tmp_2020 = df2020_imputed[col].value_counts().sort_index().values

        _, pvalue_2018_2019, _, _ = stats.chi2_contingency([tmp_2018, tmp_2019])
        _, pvalue_2019_2020, _, _ = stats.chi2_contingency([tmp_2019, tmp_2020])
        _, pvalue_2018_2020, _, _ = stats.chi2_contingency([tmp_2018, tmp_2020])
        print(f"{col} 카이제곱검정 2018-2019 p-value:", pvalue_2018_2019)
        print(f"{col} 카이제곱검정 2019-2020 p-value:", pvalue_2019_2020)
        print(f"{col} 카이제곱검정 2018-2020 p-value:", pvalue_2018_2020)

        # 각 집단의 비율 계산
        total_2018 = len(df2018_imputed)
        total_2019 = len(df2019_imputed)
        total_2020 = len(df2020_imputed)

        tmp_ratio_2018 = tmp_2018 / total_2018
        tmp_ratio_2019 = tmp_2019 / total_2019
        tmp_ratio_2020 = tmp_2020 / total_2020

        print(f"{col} 2018 비율:", tmp_ratio_2018)
        print(f"{col} 2019 비율:", tmp_ratio_2019)
        print(f"{col} 2020 비율:", tmp_ratio_2020)
        print()
    except:
        # 각 집단의 비율 계산
        total_2018 = len(df2018_imputed)
        total_2019 = len(df2019_imputed)
        total_2020 = len(df2020_imputed)
        
        if col in df2018_imputed.columns:
            tmp_2018 = df2018_imputed[col].value_counts().sort_index().values
            tmp_ratio_2018 = tmp_2018 / total_2018
            print(f"{col} 2018 비율:", tmp_ratio_2018)
        
        if col in df2019_imputed.columns:
            tmp_2019 = df2019_imputed[col].value_counts().sort_index().values
            tmp_ratio_2019 = tmp_2019 / total_2019
            print(f"{col} 2019 비율:", tmp_ratio_2019)
        
        if col in df2020_imputed.columns:
            tmp_2020 = df2020_imputed[col].value_counts().sort_index().values
            tmp_ratio_2020 = tmp_2020 / total_2020
            print(f"{col} 2020 비율:", tmp_ratio_2020)
    print()

gender 카이제곱검정 2018-2019 p-value: 1.4382403520487515e-06
gender 카이제곱검정 2019-2020 p-value: 4.30957067118653e-06
gender 카이제곱검정 2018-2020 p-value: 0.5128295691284182
gender 2018 비율: [0.59803922 0.40196078]
gender 2019 비율: [0.41254125 0.58745875]
gender 2020 비율: [0.57551669 0.42448331]


sleep 카이제곱검정 2018-2019 p-value: 2.4075578592946976e-12
sleep 카이제곱검정 2019-2020 p-value: 0.00012642401795358234
sleep 카이제곱검정 2018-2020 p-value: 3.2238591799647595e-06
sleep 2018 비율: [0.18627451 0.20588235 0.60784314]
sleep 2019 비율: [0.35313531 0.31353135 0.33333333]
sleep 2020 비율: [0.23211447 0.3163752  0.45151033]


sleepProblem 카이제곱검정 2018-2019 p-value: 0.000811409202373101
sleepProblem 카이제곱검정 2019-2020 p-value: 0.08656160530805407
sleepProblem 카이제곱검정 2018-2020 p-value: 0.03947308199127261
sleepProblem 2018 비율: [0.59803922 0.40196078]
sleepProblem 2019 비율: [0.46864686 0.53135314]
sleepProblem 2020 비율: [0.53100159 0.46899841]


dream 카이제곱검정 2018-2019 p-value: 0.24055947903093644
dream 카이제곱검정 2019-2020 p-valu

In [25]:
for col in numeric:
    try:
        # tmp t-test
        tmp_2018 = df2018_imputed[col]
        tmp_2019 = df2019_imputed[col]
        tmp_2020 = df2020_imputed[col]

        _, pvalue_2018_2020 = stats.ttest_ind(tmp_2018, tmp_2020)
        _, pvalue_2018_2019 = stats.ttest_ind(tmp_2018, tmp_2019)
        _, pvalue_2019_2020 = stats.ttest_ind(tmp_2019, tmp_2020)
        print(f"{col} t-test p-value (2018 vs 2020):", pvalue_2018_2020)
        print(f"{col} t-test p-value (2018 vs 2019):", pvalue_2018_2019)
        print(f"{col} t-test p-value (2019 vs 2020):", pvalue_2019_2020)

        # 각 집단의 평균과 95% 신뢰구간 계산
        mean_2018 = tmp_2018.mean()
        mean_2019 = tmp_2019.mean()
        mean_2020 = tmp_2020.mean()

        std_2018 = tmp_2018.std()
        std_2019 = tmp_2019.std()
        std_2020 = tmp_2020.std()

        n_2018 = len(tmp_2018)
        n_2019 = len(tmp_2019)
        n_2020 = len(tmp_2020)

        se_2018 = std_2018 / (n_2018**0.5)
        se_2019 = std_2019 / (n_2019**0.5)
        se_2020 = std_2020 / (n_2020**0.5)

        ci_2018 = stats.t.interval(0.95, n_2018-1, loc=mean_2018, scale=se_2018)
        ci_2019 = stats.t.interval(0.95, n_2019-1, loc=mean_2019, scale=se_2019)
        ci_2020 = stats.t.interval(0.95, n_2020-1, loc=mean_2020, scale=se_2020)

        print(f"{col} 2018 평균:", mean_2018)
        print(f"{col} 2019 평균:", mean_2019)
        print(f"{col} 2020 평균:", mean_2020)
        print(f"{col} 2018 95% 신뢰구간:", ci_2018)
        print(f"{col} 2019 95% 신뢰구간:", ci_2019)
        print(f"{col} 2020 95% 신뢰구간:", ci_2020)
        print()
    except:
        # tmp t-test
        tmp_2018 = df2018_imputed[col]
        tmp_2020 = df2020_imputed[col]

        _, pvalue_2018_2020 = stats.ttest_ind(tmp_2018, tmp_2020)
        print(f"{col} t-test p-value (2018 vs 2020):", pvalue_2018_2020)

        # 각 집단의 평균과 95% 신뢰구간 계산
        mean_2018 = tmp_2018.mean()
        mean_2020 = tmp_2020.mean()

        std_2018 = tmp_2018.std()
        std_2020 = tmp_2020.std()

        n_2018 = len(tmp_2018)
        n_2020 = len(tmp_2020)

        se_2018 = std_2018 / (n_2018**0.5)
        se_2020 = std_2020 / (n_2020**0.5)

        ci_2018 = stats.t.interval(0.95, n_2018-1, loc=mean_2018, scale=se_2018)
        ci_2020 = stats.t.interval(0.95, n_2020-1, loc=mean_2020, scale=se_2020)

        print(f"{col} 2018 평균:", mean_2018)
        print(f"{col} 2020 평균:", mean_2020)
        print(f"{col} 2018 95% 신뢰구간:", ci_2018)
        print(f"{col} 2020 95% 신뢰구간:", ci_2020)
        print()

age t-test p-value (2018 vs 2020): 6.0297311298157366e-148
age t-test p-value (2018 vs 2019): 9.303926207734801e-55
age t-test p-value (2019 vs 2020): 0.06373858313179374
age 2018 평균: 22.110294117647058
age 2019 평균: 28.782178217821784
age 2020 평균: 28.125596184419713
age 2018 95% 신뢰구간: (21.87044304855973, 22.350145186734387)
age 2019 95% 신뢰구간: (27.947631179145695, 29.616725256497872)
age 2020 95% 신뢰구간: (27.857961546116755, 28.39323082272267)

action_personal_care t-test p-value (2018 vs 2020): 5.7232120222387805e-09
action_personal_care t-test p-value (2018 vs 2019): 0.14993927045171102
action_personal_care t-test p-value (2019 vs 2020): 2.8873876839319227e-12
action_personal_care 2018 평균: 21.350490196078432
action_personal_care 2019 평균: 17.247524752475247
action_personal_care 2020 평균: 35.209856915739266
action_personal_care 2018 95% 신뢰구간: (17.61395439584554, 25.087025996311326)
action_personal_care 2019 95% 신뢰구간: (13.138109184358918, 21.356940320591576)
action_personal_care 2020 95% 신뢰

# 단순통합(최대)

In [37]:
# 데이터를 균일하게 나누기 위해 year 컬럼을 생성합니다.
year = []
for id in total_df['userId']:
    if id in df2018['userId'].unique(): year.append('2018')
    elif id in df2019['userId'].unique(): year.append('2019')
    elif id in df2020['userId'].unique(): year.append('2020')
total_df['year'] = year

In [35]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 데이터 준비
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()


    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]
    
    train_total.drop(columns=['userId', 'year'], inplace=True)
    test_total.drop(columns=['userId', 'year'], inplace=True)

    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
           'action_work', 'action_study', 'action_household',
           'action_recreation_media', 'action_care_housemem', 'action_shop',
           'action_outdoor_act', 'action_community_interaction', 'action_travel',
           'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
           'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
           'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
            "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)",
           'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
            "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
            "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
            "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]
  
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]

        X_test = test_total_imputed.drop(target, axis=1)
        y_test = test_total_imputed[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std()
        if train_std == 0: train_std += 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [36]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=["seed", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [39]:
# 10개의 시드에 대해 테스트합니다.
for i in range(10):
    test(i)

In [42]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 46.55, 표준편차: 3.23
Post_sleepProblem 평균: 55.82, 표준편차: 3.81
Post_dream 평균: 63.91, 표준편차: 3.50
Post_amCondition 평균: 47.45, 표준편차: 4.21
Post_amEmotion 평균: 48.27, 표준편차: 3.41


In [43]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [44]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [45]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 45.77, 표준편차: 4.24
Post_sleepProblem 평균: 56.19, 표준편차: 3.80
Post_dream 평균: 63.10, 표준편차: 4.84
Post_amCondition 평균: 45.98, 표준편차: 4.80
Post_amEmotion 평균: 47.93, 표준편차: 4.59


# 단순통합(최소)

In [59]:
# 컬럼들의 교집합을 구하기 위해 df를 복사합니다.
total_df_copy = copy.deepcopy(total_df)

In [64]:
control = list(set(df2018_imputed.columns) & set(df2019_imputed.columns) & set(df2020_imputed.columns))
control.append('userId')
control.append('year')

In [65]:
def test(x):
    global IterativeImputer
    global result_df
    
    # 데이터 준비
    total_df = copy.deepcopy(total_df_copy)
    total_df = total_df[control]
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()

    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]
    
    train_total.drop(columns=['userId', 'year'], inplace=True)
    test_total.drop(columns=['userId', 'year'], inplace=True)

    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols_tmp = ['action_personal_care', 'avg_emotionTension'
                     'action_sleep', 'place_other_indoor', 'action_recreation_media', 'place_outdoor',
                     'action_meal', 'place_workplace', 'action_community_interaction', 'activity_0',
                     'condition_NOT_ALONE', 'condition_ALONE', 'action_travel', 'avg_emotionPositive', 'activity_2',
                     'place_home', 'action_study', 'age', 'action_household', 'action_outdoor_act', 'action_work', 'activity_1']
    numeric_cols = []
    for cols_tmp in numeric_cols_tmp:
        if cols_tmp in control: numeric_cols.append(cols_tmp)
        
    # target마다 학습과 평가를 실시합니다.
    tmp_result = [x]
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var]

        X_test = test_total_imputed.drop(target, axis=1)
        y_test = test_total_imputed[target_var]

        # 정규화 과정을 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std()
        if train_std == 0: train_std += 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std

        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))
            
    tmp_df = pd.DataFrame([tmp_result], columns=["seed", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [66]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=["seed", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [67]:
# 10개의 시드에 대해 테스트합니다.
for i in range(10):
    test(i)

In [68]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 45.40, 표준편차: 4.74
Post_sleepProblem 평균: 57.70, 표준편차: 3.58
Post_dream 평균: 68.10, 표준편차: 3.42
Post_amCondition 평균: 45.90, 표준편차: 2.26
Post_amEmotion 평균: 48.10, 표준편차: 4.37


In [69]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=["seed", "year", "Post_sleep", "Post_sleepProblem", "Post_dream", "Post_amCondition", "Post_amEmotion"])

In [70]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

In [71]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_sleepProblem', 'Post_dream', 'Post_amCondition', 'Post_amEmotion']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 46.71, 표준편차: 4.89
Post_sleepProblem 평균: 59.32, 표준편차: 3.66
Post_dream 평균: 66.81, 표준편차: 4.71
Post_amCondition 평균: 45.83, 표준편차: 4.94
Post_amEmotion 평균: 48.36, 표준편차: 4.79


# 데이터 통합 준비

In [38]:
def stack(x):
    stack_train1 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m1_3", "m2_1", "m2_2", "m2_3", "m3_1", "m3_2", "m3_3", 'label'])
    stack_test1 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m1_3", "m2_1", "m2_2", "m2_3", "m3_1", "m3_2", "m3_3", 'label'])

    stack_train2 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m2_1", "m2_2", "m3_1", "m3_2", 'label'])
    stack_test2 = pd.DataFrame(columns=['seed', 'target_var', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', "m1_1", "m1_2", "m2_1", "m2_2", "m3_1", "m3_2", 'label'])

    # 데이터 준비
    user_ids = total_df["userId"].unique()
    user_years = total_df.groupby('userId')['year'].first()


    train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.2, random_state=x, stratify=user_years)
    train_total = total_df[total_df["userId"].isin(train_user_ids)]
    test_total = total_df[total_df["userId"].isin(test_user_ids)]

    train_pat = train_total[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].values
    train_total.drop(columns=['userId', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', 'year'], inplace=True)
    test_pat = test_total[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].values
    test_total.drop(columns=['userId', 'pat_year_2018', 'pat_year_2019', 'pat_year_2020', 'year'], inplace=True)
    
    # 범주형 변수는 먼저 최빈값으로 결측치를 대체합니다.
    for col in category:
        try:
            train_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue
            
    for col in category:
        try:
            test_total[col].fillna(train_total[col].mode()[0], inplace=True)
        except:
            continue

    # Mice 모델로 결측치를 대체합니다.
    mice_imputer = IterativeImputer()
    try:
        train_total_imputed = mice_imputer.fit_transform(train_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    train_total_imputed = pd.DataFrame(train_total_imputed, columns=train_total.columns)
    train_total_imputed = train_total_imputed.where(train_total_imputed >= train_total.min(), train_total.min(), axis=1)
    train_total_imputed = train_total_imputed.where(train_total_imputed <= train_total.max(), train_total.max(), axis=1)
    train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']] = train_pat

    try:
        test_total_imputed = mice_imputer.transform(test_total.values)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    test_total_imputed = pd.DataFrame(test_total_imputed, columns=test_total.columns)
    test_total_imputed = test_total_imputed.where(test_total_imputed >= train_total.min(), train_total.min(), axis=1)
    test_total_imputed = test_total_imputed.where(test_total_imputed <= train_total.max(), train_total.max(), axis=1)
    test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']] = test_pat
    
    # target과 numeric_cols를 구분하여 사용 피처를 확정합니다.
    target = ["('Post_sleep',)", "('Post_sleepProblem',)", "('Post_dream',)", "('Post_amCondition',)", "('Post_amEmotion',)"]
    numeric_cols = ['age', 'action_personal_care', 'action_sleep',
       'action_work', 'action_study', 'action_household',
       'action_recreation_media', 'action_care_housemem', 'action_shop',
       'action_outdoor_act', 'action_community_interaction', 'action_travel',
       'action_meal', 'place_home', 'place_workplace', 'place_outdoor',
       'place_other_indoor', 'avg_emotionPositive', 'avg_emotionTension',
       'activity_0', 'activity_1', 'activity_2', 'condition_ALONE',
        "('Pre_total_sleep_time',)", "('Pre_time_in_bed',)", "('Pre_waso',)", "('Pre_aal',)", "('Pre_movement_index',)", "('Pre_fragmentation_index',)", "('Pre_sleep_frag_index',)",
       'condition_NOT_ALONE', 'bmi', "('Pre_wakeupcount',)", "('Pre_wakeupduration',)", "('Pre_lightsleepduration',)",
        "('Pre_deepsleepduration',)", "('Pre_durationtosleep',)", "('Pre_remsleepduration',)", "('Pre_durationtowakeup',)",
        "('Pre_hr_average',)", "('Pre_hr_min',)", "('Pre_hr_max',)", "('Pre_rr_average',)", "('Pre_rr_min',)",
        "('Pre_rr_max',)", "('Pre_breathing_disturbances_intensity',)", "('Pre_snoring',)", "('Pre_snoringepisodecount',)"]
    
    # target 변수에 대해 각각 lr 모델을 학습시키고 stack을 쌓습니다.
    for target_var in target:
        X_train = train_total_imputed.drop(target, axis=1)
        y_train = train_total_imputed[target_var].values
        
        X_train_2018 = train_total_imputed[train_total_imputed['pat_year_2018']==1].drop(target, axis=1)
        train_mean = X_train_2018[numeric_cols].mean()
        train_std = X_train_2018[numeric_cols].std() + 0.000001
        X_train_2018[numeric_cols] = (X_train_2018[numeric_cols] - train_mean) / train_std
        y_train_2018 = train_total_imputed[train_total_imputed['pat_year_2018']==1][target_var].values
        
        X_train_2019 = train_total_imputed[train_total_imputed['pat_year_2019']==1].drop(target, axis=1)
        train_mean = X_train_2019[numeric_cols].mean()
        train_std = X_train_2019[numeric_cols].std() + 0.000001
        X_train_2019[numeric_cols] = (X_train_2019[numeric_cols] - train_mean) / train_std
        y_train_2019 = train_total_imputed[train_total_imputed['pat_year_2019']==1][target_var].values
        
        X_train_2020 = train_total_imputed[train_total_imputed['pat_year_2020']==1].drop(target, axis=1)
        train_mean = X_train_2020[numeric_cols].mean()
        train_std = X_train_2020[numeric_cols].std() + 0.000001
        X_train_2020[numeric_cols] = (X_train_2020[numeric_cols] - train_mean) / train_std
        y_train_2020 = train_total_imputed[train_total_imputed['pat_year_2020']==1][target_var].values
        
        X_test = test_total_imputed.drop(target, axis=1)
        y_test = test_total_imputed[target_var].values

        # 데이터 정규화를 진행합니다.
        train_mean = X_train[numeric_cols].mean()
        train_std = X_train[numeric_cols].std() + 0.000001

        X_train[numeric_cols] = (X_train[numeric_cols] - train_mean) / train_std
        X_test[numeric_cols] = (X_test[numeric_cols] - train_mean) / train_std
        
        # 예측이 3개로 이루어지는 경우와 2개로 나누어지는 경우를 나눕니다
        if target_var in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
            lr1 = LogisticRegression()
            lr1.fit(X_train_2018, y_train_2018)
            
            lr4 = LogisticRegression()
            lr4.fit(X_train_2019, y_train_2019) 
            
            lr7 = LogisticRegression()
            lr7.fit(X_train_2020, y_train_2020)   
            
            X_train_stack1 = lr1.predict_proba(X_train[lr1.feature_names_in_.tolist()])
            X_train_stack2 = lr4.predict_proba(X_train[lr4.feature_names_in_.tolist()])
            X_train_stack3 = lr7.predict_proba(X_train[lr7.feature_names_in_.tolist()])
            
            X_test_stack1 = lr1.predict_proba(X_test[lr1.feature_names_in_.tolist()])
            X_test_stack2 = lr4.predict_proba(X_test[lr4.feature_names_in_.tolist()])
            X_test_stack3 = lr7.predict_proba(X_test[lr7.feature_names_in_.tolist()])
            
            X_train_stack_tmp = np.concatenate(([[x, target_var]] * len(X_train_stack1), train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_train_stack1, X_train_stack2, X_train_stack3, y_train.reshape(len(X_train_stack1), -1)), axis=1)
            X_train_stack_tmp = pd.DataFrame(X_train_stack_tmp, columns=stack_train1.columns)
            stack_train1 = pd.concat([stack_train1, X_train_stack_tmp], axis=0, ignore_index = True)
             
            X_test_stack_tmp = np.concatenate(([[x, target_var]] * len(X_test_stack1), test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_test_stack1, X_test_stack2, X_test_stack3, y_test.reshape(len(X_test_stack1), -1)), axis=1)
            X_test_stack_tmp = pd.DataFrame(X_test_stack_tmp, columns=stack_test1.columns)
            stack_test1 = pd.concat([stack_test1, X_test_stack_tmp], axis=0, ignore_index = True)
            
        elif target_var in ["('Post_sleepProblem',)", "('Post_dream',)"]:
            lr2 = LogisticRegression()
            lr2.fit(X_train_2018, y_train_2018)
            
            lr5 = LogisticRegression()
            lr5.fit(X_train_2019, y_train_2019) 
            
            lr8 = LogisticRegression()
            lr8.fit(X_train_2020, y_train_2020)       
        
            X_train_stack1 = lr2.predict_proba(X_train[lr2.feature_names_in_.tolist()])
            X_train_stack2 = lr5.predict_proba(X_train[lr5.feature_names_in_.tolist()])
            X_train_stack3 = lr8.predict_proba(X_train[lr8.feature_names_in_.tolist()])

            X_test_stack1 = lr2.predict_proba(X_test[lr2.feature_names_in_.tolist()])
            X_test_stack2 = lr5.predict_proba(X_test[lr5.feature_names_in_.tolist()])
            X_test_stack3 = lr8.predict_proba(X_test[lr8.feature_names_in_.tolist()])
            
            X_train_stack_tmp = np.concatenate(([[x, target_var]] * len(X_train_stack1), train_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_train_stack1, X_train_stack2, X_train_stack3, y_train.reshape(len(X_train_stack1), -1)), axis=1)
            X_train_stack_tmp = pd.DataFrame(X_train_stack_tmp, columns=stack_train2.columns)
            stack_train2 = pd.concat([stack_train2, X_train_stack_tmp], axis=0, ignore_index = True)
                
            X_test_stack_tmp = np.concatenate(([[x, target_var]] * len(X_test_stack1), test_total_imputed[['pat_year_2018', 'pat_year_2019', 'pat_year_2020']].reset_index(drop=True), X_test_stack1, X_test_stack2, X_test_stack3, y_test.reshape(len(X_test_stack1), -1)), axis=1)
            X_test_stack_tmp = pd.DataFrame(X_test_stack_tmp, columns=stack_test2.columns)
            stack_test2 = pd.concat([stack_test2, X_test_stack_tmp], axis=0, ignore_index = True)
        
    return stack_train1, stack_test1, stack_train2, stack_test2

# 데이터 통합1: 보팅

In [39]:
def test(x):
    global result_df

    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp_result = [x]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        stack_test1_tmp = stack_test1[stack_test1['target_var']==col].reset_index(drop=True)

        df1 = stack_test1_tmp.iloc[:,[5, 8, 11]].astype(float).mean(axis=1)
        df2 = stack_test1_tmp.iloc[:,[6, 9, 12]].astype(float).mean(axis=1)
        df3 = stack_test1_tmp.iloc[:,[7, 10, 13]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(stack_test1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        stack_test2_tmp = stack_test2[stack_test2['target_var']==col].reset_index(drop=True)
        
        df1 = stack_test2_tmp.iloc[:,[5, 7, 9]].astype(float).mean(axis=1)
        df2 = stack_test2_tmp.iloc[:,[6, 8, 10]].astype(float).mean(axis=1)

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(stack_test2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))
    
    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [43]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=variables = ['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [44]:
# 10개의 시드에 대해 테스트합니다.
for i in range(10):
    test(i)

In [45]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 43.60, 표준편차: 2.84
Post_sleepProblem 평균: 45.70, 표준편차: 5.83
Post_dream 평균: 45.00, 표준편차: 4.52
Post_amCondition 평균: 56.00, 표준편차: 3.19
Post_amEmotion 평균: 62.60, 표준편차: 2.46


In [46]:
# 결과를 저장할 파데이터 프레임을 초기화합니다.
result_df = pd.DataFrame(columns=variables = ['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [47]:
# 300개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

101에서 오류가 발생했습니다.
3 101에서 오류가 발생했습니다.


In [48]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 43.17, 표준편차: 4.15
Post_sleepProblem 평균: 43.53, 표준편차: 5.03
Post_dream 평균: 44.10, 표준편차: 4.34
Post_amCondition 평균: 56.30, 표준편차: 3.84
Post_amEmotion 평균: 61.85, 표준편차: 5.52


# 데이터 통합2: 가중치 보팅 

In [224]:
def test(x, m1, m2, m3):
    global result_df
    
    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp_result = [x]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        stack_test1_tmp = stack_test1[stack_test1['target_var']==col].reset_index(drop=True)
        
        tmp2018 = stack_test1_tmp[stack_test1_tmp['pat_year_2018'].astype('int') == 1]
        tmp2019 = stack_test1_tmp[stack_test1_tmp['pat_year_2019'].astype('int') == 1]
        tmp2020 = stack_test1_tmp[stack_test1_tmp['pat_year_2020'].astype('int') == 1]
        
        
        tmp2018[['m1_1', 'm1_2', 'm1_3']] = tmp2018[['m1_1', 'm1_2', 'm1_3']].astype('float')
        tmp2019[['m2_1', 'm2_2', 'm2_3']] = tmp2019[['m2_1', 'm2_2', 'm2_3']].astype('float')
        tmp2020[['m3_1', 'm3_2', 'm3_3']] = tmp2020[['m3_1', 'm3_2', 'm3_3']].astype('float')
        
        tmp2018[['m1_1', 'm1_2', 'm1_3']] *= m1
        tmp2019[['m2_1', 'm2_2', 'm2_3']] *= m2
        tmp2020[['m3_1', 'm3_2', 'm3_3']] *= m3
        
        df1 = (tmp2018['m1_1'] + tmp2019['m2_1'] +tmp2020['m3_1']) / 3
        df2 = (tmp2018['m1_2'] + tmp2019['m2_2'] +tmp2020['m3_2']) / 3
        df3 = (tmp2018['m1_3'] + tmp2019['m2_3'] +tmp2020['m3_3']) / 3

        result = pd.DataFrame()

        result[0] = (df1 > df2) & (df1 > df3)
        result[1] = (df2 > df1) & (df2 > df3)
        result[2] = (df3 > df1) & (df3 > df2)

        result = result.idxmax(axis=1)

        accuracy = accuracy_score(stack_test1_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        stack_test2_tmp = stack_test2[stack_test2['target_var']==col].reset_index(drop=True)

        tmp2018 = stack_test2_tmp[stack_test2_tmp['pat_year_2018'].astype('int') == 1]
        tmp2019 = stack_test2_tmp[stack_test2_tmp['pat_year_2019'].astype('int') == 1]
        tmp2020 = stack_test2_tmp[stack_test2_tmp['pat_year_2020'].astype('int') == 1]

        tmp2018[['m1_1', 'm1_2']] = tmp2018[['m1_1', 'm1_2']].astype('float')
        tmp2019[['m2_1', 'm2_2']] = tmp2019[['m2_1', 'm2_2']].astype('float')
        tmp2020[['m3_1', 'm3_2']] = tmp2020[['m3_1', 'm3_2']].astype('float')
        
        tmp2018[['m1_1', 'm1_2']] *= m1
        tmp2019[['m2_1', 'm2_2']] *= m2
        tmp2020[['m3_1', 'm3_2']] *= m3
        
        df1 = (tmp2018['m1_1'] + tmp2019['m2_1'] +tmp2020['m3_1']) / 3
        df2 = (tmp2018['m1_2'] + tmp2019['m2_2'] +tmp2020['m3_2']) / 3

        result = pd.DataFrame()

        result[0] = df1 > df2
        result[1] = df2 > df1

        result = result.idxmax(axis=1)
        accuracy = accuracy_score(stack_test2_tmp.iloc[:,-1].astype(float).values, result.astype(float).values)
        tmp_result.append(round(accuracy * 100, 0))

    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [225]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns= ['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [226]:
# 10개의 시드에 대해 테스트합니다.
for i in range(10):
    test(i, 1.5, 1.5, 1.5)
    
result_df.to_excel('./add_voting_10.xlsx', index=False)

In [227]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 24.20, 표준편차: 5.46
Post_amCondition 평균: 48.00, 표준편차: 6.50
Post_amEmotion 평균: 23.60, 표준편차: 4.54
Post_sleepProblem 평균: 52.30, 표준편차: 5.35
Post_dream 평균: 61.30, 표준편차: 5.06


In [228]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [229]:
# 10개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i, 1.5, 1.5, 1.5)
    
result_df.to_excel('./add_voting_300.xlsx', index=False)

101에서 오류가 발생했습니다.
101에서 오류가 발생했습니다.


In [230]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 23.38, 표준편차: 4.31
Post_amCondition 평균: 47.58, 표준편차: 6.39
Post_amEmotion 평균: 22.66, 표준편차: 5.10
Post_sleepProblem 평균: 54.85, 표준편차: 6.19
Post_dream 평균: 61.65, 표준편차: 7.11


# 데이터 통합3: 스태킹

In [217]:
def test(x):
    global result_df
        
    try:
        stack_train1, stack_test1, stack_train2, stack_test2 = stack(x)
    except:
        print(f'{x}에서 오류가 발생했습니다.')
        return
    
    tmp_result = [x]
    for col in ["('Post_sleep',)", "('Post_amCondition',)", "('Post_amEmotion',)"]:
        stack_train1_tmp = stack_train1[stack_train1['target_var']==col].reset_index(drop=True)
        stack_test1_tmp = stack_test1[stack_test1['target_var']==col].reset_index(drop=True)
        
        X_train = stack_train1_tmp.iloc[:,5:-1]
        y_train = stack_train1_tmp.iloc[:,-1]
        
        X_test = stack_test1_tmp.iloc[:,5:-1]
        y_test = stack_test1_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))

    for col in ["('Post_sleepProblem',)", "('Post_dream',)"]:
        stack_train2_tmp = stack_train2[stack_train2['target_var']==col].reset_index(drop=True)
        stack_test2_tmp = stack_test2[stack_test2['target_var']==col].reset_index(drop=True)
        
        X_train = stack_train2_tmp.iloc[:,5:-1]
        y_train = stack_train2_tmp.iloc[:,-1]
        
        X_test = stack_test2_tmp.iloc[:,5:-1]
        y_test = stack_test2_tmp.iloc[:,-1]

        lr = LogisticRegression()

        # early stopping을 적용합니다.
        lr.fit(X_train, y_train)

        # 모델을 사용하여 test 데이터에 대한 예측 결과를 계산합니다.
        y_pred = lr.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        tmp_result.append(round(accuracy * 100, 0))


    tmp_df = pd.DataFrame([tmp_result], columns=result_df.columns)
    result_df = pd.concat([result_df, tmp_df], axis=0)

In [218]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [219]:
# 10개의 시드에 대해 테스트합니다.
for i in range(10):
    test(i)

In [220]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 46.30, 표준편차: 3.26
Post_amCondition 평균: 45.20, 표준편차: 6.05
Post_amEmotion 평균: 46.10, 표준편차: 3.33
Post_sleepProblem 평균: 57.30, 표준편차: 3.03
Post_dream 평균: 64.10, 표준편차: 2.84


In [221]:
# 결과를 저장할 파데이터 프레임을 선언합니다.
result_df = pd.DataFrame(columns=['seed', 'Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream'])

In [222]:
# 10개의 시드에 대해 테스트합니다.
for i in range(300):
    test(i)

101에서 오류가 발생했습니다.
101에서 오류가 발생했습니다.


In [223]:
# 각 년도에 대한 변수의 평균과 분산 계산
variables = ['Post_sleep', 'Post_amCondition', 'Post_amEmotion', 'Post_sleepProblem', 'Post_dream']

for var in variables:
    mean = np.mean(result_df[var])
    variance = math.sqrt(np.var(result_df[var]))
    print(f"{var} 평균: {mean:.2f}, 표준편차: {variance:.2f}")

Post_sleep 평균: 44.72, 표준편차: 4.26
Post_amCondition 평균: 44.59, 표준편차: 5.11
Post_amEmotion 평균: 45.82, 표준편차: 4.46
Post_sleepProblem 평균: 57.02, 표준편차: 4.11
Post_dream 평균: 62.80, 표준편차: 5.12
