In [1]:
import os
from os.path import join
import math

import numpy as np
import pandas as pd
from sklearn import preprocessing

from time import time

In [2]:
# Path configuration: the input Feather files are resolved relative to the
# current working directory.
ROOT_DIR_PATH = os.getcwd()
TRAIN_FEATHER_PATH, TEST_FEATHER_PATH = (
    join(ROOT_DIR_PATH, relative_path)
    for relative_path in ('fe_data/train_elv_time.ftr', 'fe_data/test_elv_time.ftr')
)

In [3]:
def read_feather():
    """Load the train and test frames from their Feather files.

    Reads TRAIN_FEATHER_PATH and TEST_FEATHER_PATH with ``pd.read_feather``.

    Returns:
        tuple: ``(train_df, test_df)``, in that order.
    """
    return tuple(
        pd.read_feather(feather_path)
        for feather_path in (TRAIN_FEATHER_PATH, TEST_FEATHER_PATH)
    )

In [4]:
# Load both splits once; the cells below reassign and mutate these two frames.
train_df, test_df = read_feather()

### 위경도 통합

In [32]:
# Averaging the GMI and DPR coordinates is something to test later.
# For now the DPR longitude/latitude columns are simply dropped.
train_df = train_df.drop(columns=['long_DPR', 'lat_DPR'])
test_df = test_df.drop(columns=['long_DPR', 'lat_DPR'])

### 연월일 분리

In [27]:
# Leave the year out for now.
# TODO: test encoding the year as 2016-2018 -> 0-2 and 2019 -> 3.

def split_date_month(row):
    """Return characters 4-5 of a date string.

    Presumably the month of a 'YYYYMMDD'-style value -- confirm against the data.
    """
    return row[4:6]


def split_date_day(row):
    """Return everything from character 6 onward of a date string.

    Presumably the day of a 'YYYYMMDD'-style value -- confirm against the data.
    """
    return row[6:]


# Slice the date strings element-wise and store the pieces directly as int8.
test_df['month'] = pd.Series(
    np.vectorize(split_date_month)(test_df.date), index=test_df.index
).astype('int8')
train_df['month'] = pd.Series(
    np.vectorize(split_date_month)(train_df.date), index=train_df.index
).astype('int8')
test_df['day'] = pd.Series(
    np.vectorize(split_date_day)(test_df.date), index=test_df.index
).astype('int8')
train_df['day'] = pd.Series(
    np.vectorize(split_date_day)(train_df.date), index=train_df.index
).astype('int8')

# NOTE: this cell previously dropped the `type` column here (under a comment
# saying it removed `date`). `type` is still read by the type_1 / type_2 cells
# further down, so that drop made a top-to-bottom run fail. The original `date`
# column is removed by the following cell.

In [30]:
# Remove the original date column now that month/day have been extracted.
test_df = test_df.drop(columns=['date'])
train_df = train_df.drop(columns=['date'])

# Store the derived calendar columns compactly as int8.
for calendar_col in ('month', 'day'):
    test_df[calendar_col] = test_df[calendar_col].astype('int8')
    train_df[calendar_col] = train_df[calendar_col].astype('int8')

### 지표타입 분리

In [5]:
# Confirmed that some types occur in train but not in test (301, 314, 109, 318);
# the reverse does not occur.
def split_columns(row):
    """Return the first character of ``str(row)``."""
    as_text = str(row)
    return as_text[0]


_first_char_of = np.vectorize(split_columns)
test_df['type_1'] = _first_char_of(test_df['type'])
train_df['type_1'] = _first_char_of(train_df['type'])

In [6]:
# The type_2 values overlap between different type_1 groups. For now, assume
# that equal numbers share something in common and keep them identical;
# encoding them differently still needs to be tested.
def split_columns(row):
    """Return ``str(row)`` without its first character."""
    as_text = str(row)
    return as_text[1:]


_tail_of = np.vectorize(split_columns)
test_df['type_2'] = _tail_of(test_df['type'])
train_df['type_2'] = _tail_of(train_df['type'])

# Remove the original type column.
test_df = test_df.drop(columns=['type'])
train_df = train_df.drop(columns=['type'])

# Cast the split pieces to int8; type_2 is parsed via float16 first.
test_df['type_1'] = test_df['type_1'].astype('int8')
test_df['type_2'] = test_df['type_2'].astype('float16').astype('int8')
train_df['type_1'] = train_df['type_1'].astype('int8')
train_df['type_2'] = train_df['type_2'].astype('float16').astype('int8')

### temp1~9 normalize

In [7]:
# Scaling every temp column at once ran out of memory, so go one column at a
# time in a for loop.
t0 = time()
for i in range(1, 10):
    temp_col = [f'temp{i}']
    min_max_scaler = preprocessing.MinMaxScaler()

    # Fit on train only, then apply that same train-derived range to test.
    train_df[temp_col] = min_max_scaler.fit_transform(train_df[temp_col])
    test_df[temp_col] = min_max_scaler.transform(test_df[temp_col])
t1 = time()

# Elapsed wall-clock seconds for the whole loop.
print(t1-t0)

248.21009039878845


### elevation normalize

In [8]:
# Train's maximum elevation is higher than test's (4619 vs 3793 when checked),
# so plain min-max scaling fitted on train is used for both splits.
# The bare `train_df.elevation.max(), test_df.elevation.max()` expression that
# used to sit here was removed: it was not the cell's last expression, so it was
# never displayed and only cost two column scans.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(train_df[['elevation']])

# Apply the train-fitted scaler to both frames.
train_df[['elevation']] = min_max_scaler.transform(train_df[['elevation']])
test_df[['elevation']] = min_max_scaler.transform(test_df[['elevation']])

### 데이터 저장

In [36]:
# Persist the intermediate frames next to the inputs under fe_data/.
train_mid_path = join(ROOT_DIR_PATH, 'fe_data/train_mid.ftr')
test_mid_path = join(ROOT_DIR_PATH, 'fe_data/test_mid.ftr')
train_df.to_feather(train_mid_path)
test_df.to_feather(test_mid_path)

In [33]:
# Preview the first rows of the processed train frame.
train_df.head()

Unnamed: 0,temp1,temp2,temp3,temp4,temp5,temp6,temp7,temp8,temp9,long_GMI,lat_GMI,precipitation,orbit,subset,pixel,elevation,type_1,type_2,month,day
0,0.049494,0.03479,0.036629,0.050757,0.419546,0.153418,0.151771,0.86729,0.640426,159.494385,5.641016,0.0,10462,1,1,0.876454,0,0,1,1
1,0.049653,0.034238,0.037082,0.05069,0.416622,0.15415,0.14881,0.861778,0.63194,159.534912,5.609135,0.0,10462,1,2,0.876454,0,0,1,1
2,0.048762,0.035108,0.037101,0.050481,0.42003,0.153041,0.146607,0.86067,0.632753,159.575806,5.577742,0.0,10462,1,3,0.876454,0,0,1,1
3,0.049704,0.034355,0.034933,0.048831,0.40962,0.150611,0.141517,0.85434,0.622288,159.617081,5.54684,0.0,10462,1,4,0.876454,0,0,1,1
4,0.04921,0.034067,0.03529,0.049329,0.409516,0.152017,0.145445,0.854506,0.622727,159.658707,5.516435,0.0,10462,1,5,0.876454,0,0,1,1
