In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np

### 1. 기존 데이터셋 불러오기

In [4]:
# Load the merged dormitory data CSV from the mounted Google Drive folder.
origin_df = pd.read_csv('/content/drive/MyDrive/학교/쿠다 5기/추천시스템/merge_domitory_data.csv')

In [5]:
# @title
# Display the frame that was just loaded.
origin_df

Unnamed: 0,user_id,domitory,age,student_id,gender,major,bedtime,clean_duration,smoke,alcohol,mbti,alarm,activity,birth,student_id.1,major.1,smoke.1
0,1,0,0.0,0,1,2,2,0,0.0,0,ENTJ,2.0,1.0,45355,24,같은과,N
1,2,0,3.0,3,1,2,2,1,0.0,1,ISFP,1.0,2.0,"02, 03, 04, 05, 06",222324,상관X,N
2,3,0,3.0,3,1,2,2,0,1.0,1,ESTJ,2.0,2.0,,,,Y
3,4,0,0.0,0,1,2,2,1,0.0,0,ISFJ,2.0,1.0,9998000102,192021,상관X,N
4,5,0,3.0,3,0,0,2,0,0.0,0,ISFJ,2.0,0.0,0102030405,2021222324,상관X,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,129,0,4.0,4,1,3,3,1,0.0,0,ISFP,1.0,,,,,
129,130,4,3.0,3,0,1,3,0,0.0,1,ISFJ,2.0,,,,,
130,131,0,2.0,3,1,2,3,2,0.0,0,INFP,0.0,,,,,
131,132,0,3.0,3,1,2,3,0,0.0,1,INTJ,1.0,,,,,


##### 데이터셋 원-핫 인코딩 정의

1. 각 feature 별 더미 데이터 생성 필요.
ex) 흡연 0 -> 1, gender 0 -> 1, 1 -> 0, major 2 -> 5 등
2. 이후 노이즈 추가된 데이터 병합

In [6]:
# Allowed integer codes for each categorical feature; the trailing comment on
# each entry gives the mapping from the original answer to its code.
data_one_hot = {
  'domitory' : [0, 1, 2, 3, 4],       # Dormitory No. 2: 0, Woojeongwon: 1, Haengbok Dormitory: 2, Areumwon: 3, Sehwawon: 4
  'age' : [0, 1, 2, 3, 4],            # 01: 0, 02: 1, 03: 2, 04: 3, 05: 4
  'student_id' : [0, 1, 2, 3, 4],     # 20: 0, 21: 1, 22: 2, 23: 3, 24: 4
  'gender' : [0, 1],                  # M: 1, F: 0
  'major' : [0, 1, 2, 3, 4, 5],       # humanities: 0, social sciences: 1, engineering: 2, natural sciences: 3, arts & physical education: 4, medicine & health: 5
  'bedtime' : [0, 1, 2, 3, 4],        # 10-11 o'clock: 0, 11-12: 1, 12-01: 2, 01-02: 3, after 02: 4
  'clean_duration' : [0, 1, 2],       # cleans as needed: 0, in between: 1, all at once: 2
  'smoke' : [0, 1],                   # Y: 1, N: 0
  'alcohol' : [0, 1, 2, 3],                        # Not part of the earlier one-hot definition, so it was added here.
   #  'mbti' : ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ESFJ', 'ENFJ', 'ENTJ'], -> excluded for now because it is unclear how to encode it. NOTE(review): 'ESFJ' is listed twice (17 entries).
  'alarm' : [0, 1, 2]                             # heavy sleeper: 0, medium: 1, hears it well: 2 -> added because it was considered a key feature. (Missing values are assigned randomly between 0 and 2.)
}


In [7]:
# Check one feature's maximum, e.g. the dormitory column. The value is not
# shown because this is not the cell's final expression.
origin_df['domitory'].max()

# Feature columns to process: the keys of data_one_hot, in definition order.
columns = list(data_one_hot.keys())

columns

['domitory',
 'age',
 'student_id',
 'gender',
 'major',
 'bedtime',
 'clean_duration',
 'smoke',
 'alcohol',
 'alarm']

In [8]:
# Put 'user_id' first so the identifier column is carried along with the
# feature columns. Filtering out an existing 'user_id' keeps this cell
# idempotent: re-running it no longer prepends a second 'user_id' entry.
columns = ['user_id'] + [name for name in columns if name != 'user_id']
# Empty placeholder frame sharing origin_df's index, one column per entry above.
new_df = pd.DataFrame(index = origin_df.index, columns = columns)

In [9]:
# Show the column list, which now starts with 'user_id'.
columns

['user_id',
 'domitory',
 'age',
 'student_id',
 'gender',
 'major',
 'bedtime',
 'clean_duration',
 'smoke',
 'alcohol',
 'alarm']

In [10]:
# origin_df: 기존의 DataFrame
# new_df: 새로운 DataFrame을 생성. origin_df와 같은 구조로 초기화.
import random

def create_new_df(df, cols, rng=None):
    """Return a new frame whose columns are upward-perturbed copies of ``df[cols]``.

    For every column, each value ``v`` is replaced by
    ``round(v + u * (max - v))`` with ``u`` drawn uniformly from [0, 1) and
    ``max`` the column maximum of ``df``. The result therefore always lies
    between the original value and the column maximum. Missing values stay
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is read only; it is never modified.
    cols : iterable of str
        Names of the numeric columns of ``df`` to perturb.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source. When omitted, the global ``np.random`` state is used,
        so ``np.random.seed`` still controls reproducibility.

    Returns
    -------
    pandas.DataFrame
        A freshly allocated frame with ``df``'s index and the columns ``cols``.
        Every call returns a distinct object.
    """
    sampler = np.random if rng is None else rng
    cols = list(cols)

    noisy_columns = {}
    for column in cols:
        values = df[column]
        max_value = values.max()
        # Distance from each value to the column maximum.
        headroom = max_value - values

        # Noise is drawn only where there is room to move up. NaN headroom
        # compares False, so missing values receive zero noise and stay NaN.
        mask = (headroom > 0).to_numpy()
        noise = np.zeros(len(df))
        noise[mask] = (
            sampler.uniform(0, 1, size=int(mask.sum())) * headroom.to_numpy()[mask]
        )

        # Round to the nearest code, then clamp so rounding can never exceed the maximum.
        noisy_columns[column] = np.minimum(np.round(values + noise), max_value)

    return pd.DataFrame(noisy_columns, index=df.index, columns=cols)

In [12]:
# Example: build noisy frames by calling create_new_df on the listed columns.
new_df = create_new_df(origin_df, columns)
new_df2 = create_new_df(new_df, columns)
new_df3 = create_new_df(origin_df, columns)

In [13]:
# Inspect one of the generated noisy frames.
new_df3

Unnamed: 0,user_id,domitory,age,student_id,gender,major,bedtime,clean_duration,smoke,alcohol,alarm
0,118.0,0.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0,2.0
1,109.0,4.0,4.0,4.0,1.0,4.0,4.0,1.0,1.0,2.0,2.0
2,60.0,3.0,4.0,3.0,1.0,2.0,3.0,0.0,1.0,2.0,2.0
3,59.0,0.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0
4,86.0,1.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
128,131.0,1.0,4.0,4.0,1.0,4.0,3.0,2.0,0.0,3.0,2.0
129,131.0,4.0,4.0,3.0,1.0,2.0,4.0,0.0,1.0,2.0,2.0
130,133.0,2.0,3.0,4.0,1.0,3.0,4.0,2.0,0.0,1.0,1.0
131,132.0,0.0,4.0,4.0,1.0,2.0,4.0,1.0,0.0,1.0,1.0


#### 랜덤한 노이즈 데이터를 생성해서 기존 데이터셋과 concat 했음.
- 결측치를 어떻게 채울지 논의 필요. (ex. birth, major 등)

In [14]:
def concating_df(origin_, new_):
    """Stack ``new_`` below ``origin_`` and renumber ``user_id`` from the new index.

    Parameters
    ----------
    origin_ : pandas.DataFrame
        Frame whose rows come first. It is not modified.
    new_ : pandas.DataFrame
        Frame whose rows are appended. Columns missing from either side are
        filled with NaN by ``pd.concat``.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation with a fresh 0..n-1 index, without any
        ``level_0`` column, and with ``user_id`` set equal to that index.
    """
    # ignore_index=True already renumbers the rows, so the inputs do not need
    # their indexes reset (and the caller's frame is left untouched).
    merged_df = pd.concat([origin_, new_], axis=0, ignore_index=True)
    # Drop a leftover 'level_0' column if an input carried one.
    merged_df = merged_df.drop(columns=['level_0'], errors='ignore')
    # Reassign identifiers so every row, original or synthetic, has a unique user_id.
    merged_df['user_id'] = merged_df.index
    return merged_df

In [15]:
# Append one noisy frame to the original data; concating_df reassigns user_id from the merged index.
new_df4 = concating_df(origin_df, new_df)

create_new_df, concating_df를 10번 반복.

In [17]:
# Frames to merge: the results of ten create_new_df calls on the original data.
for_merge = [create_new_df(origin_df, columns) for _ in range(10)]

In [18]:
# Append each collected frame to origin_df in turn. origin_df is rebound to
# the growing result, so re-running this cell appends the frames again.
for new_df in for_merge:
  origin_df = merged_df = concating_df(origin_df, new_df)

In [19]:
# Display the accumulated frame after the merge loop.
origin_df

Unnamed: 0,user_id,domitory,age,student_id,gender,major,bedtime,clean_duration,smoke,alcohol,mbti,alarm,activity,birth,student_id.1,major.1,smoke.1
0,0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,ENTJ,2.0,1.0,45355,24,같은과,N
1,1,0.0,3.0,3.0,1.0,2.0,2.0,1.0,0.0,1.0,ISFP,1.0,2.0,"02, 03, 04, 05, 06",222324,상관X,N
2,2,0.0,3.0,3.0,1.0,2.0,2.0,0.0,1.0,1.0,ESTJ,2.0,2.0,,,,Y
3,3,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,0.0,ISFJ,2.0,1.0,9998000102,192021,상관X,N
4,4,0.0,3.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,ISFJ,2.0,0.0,0102030405,2021222324,상관X,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,1458,2.0,4.0,4.0,1.0,4.0,4.0,2.0,1.0,2.0,,2.0,,,,,
1459,1459,4.0,4.0,3.0,0.0,1.0,3.0,1.0,0.0,1.0,,2.0,,,,,
1460,1460,3.0,4.0,4.0,1.0,2.0,4.0,2.0,0.0,1.0,,1.0,,,,,
1461,1461,1.0,4.0,3.0,1.0,4.0,3.0,0.0,1.0,1.0,,2.0,,,,,
