## [Dacon] AI프렌즈 시즌2 강수량 산출 경진대회
## giba.kim (팀명)
## 2020년 5월 29일 (제출날짜)

### Train/Test Feather Conversion Code
https://dacon.io/competitions/official/235591/codeshare/902?page=1&dtype=recent&ptype=pub

## 1. 라이브러리 및 데이터
## Library & Data

In [10]:
import os
from tqdm import tqdm
import tqdm
import numpy as np
import pandas as pd

In [13]:
# Model input columns: temp1..temp9 followed by the type and GMI/DPR
# longitude/latitude columns.
INPUT_COL = [
    *(f"temp{i}" for i in range(1, 10)),
    'type', 'long_GMI', 'lat_GMI', 'long_DPR', 'lat_DPR',
]
# Column holding the prediction target.
TARGET_COL = ['precipitation']
# Full column layouts: inputs (plus the target for train), then the three
# identifier columns appended by add_pad.
TEST_COL = [*INPUT_COL, 'orbit', 'subset', 'pixel']
TRAIN_COL = [*INPUT_COL, *TARGET_COL, 'orbit', 'subset', 'pixel']

In [14]:
# Directories that are scanned for the raw per-file arrays.
TRAIN_DIR_PATH, TEST_DIR_PATH = '../input/train/', '../input/test/'
# Output locations for the combined feather files.
TRAIN_FEATHER_PATH, TEST_FEATHER_PATH = '../input/train.ftr', '../input/test.ftr'

In [15]:
# Precomputed pixel indices 1..1600 as a (1600, 1) column vector; reused by
# add_pad whenever the input has exactly that many rows.
PIXEL_COL = np.arange(1, 1601)[:, None]


def add_pad(nd, file):
    """Append orbit, subset and pixel-index columns to a 2-D array.

    Parameters
    ----------
    nd : np.ndarray
        2-D array of shape (n_rows, n_features).
    file : str
        File name with exactly three ``_``-separated parts, of which the
        second is the orbit number and the third is the subset number,
        optionally followed by an extension (e.g. ``subset_010462_01.npy``).

    Returns
    -------
    np.ndarray
        New array of shape (n_rows, n_features + 3) whose last three columns
        are the orbit number, the subset number, and a 1-based row index.

    Raises
    ------
    ValueError
        If ``file`` does not split into exactly three ``_``-separated parts
        or the orbit/subset parts are not integers.
    """
    orbit, subset = file.split('_')[1:]
    # Strip the extension instead of assuming a two-character subset id, so
    # names such as '..._1.npy' parse as well.
    subset = subset.split('.')[0]

    nd = np.pad(nd, ((0, 0), (0, 1)), constant_values=int(orbit))
    nd = np.pad(nd, ((0, 0), (0, 1)), constant_values=int(subset))

    # Use the precomputed column when the row count matches it; otherwise
    # build an index of the right length so the row count is not hard-coded.
    n_rows = nd.shape[0]
    if n_rows == PIXEL_COL.shape[0]:
        pixel_col = PIXEL_COL
    else:
        pixel_col = np.arange(1, n_rows + 1)[:, None]
    return np.c_[nd, pixel_col]

def make_dataframe(dir_path, is_train):
    """Load every ``.npy`` file in a directory into one sorted DataFrame.

    Each file is loaded, flattened to 2-D on its last axis, extended with
    orbit/subset/pixel columns by ``add_pad``, and stacked with the others.

    Parameters
    ----------
    dir_path : str
        Directory containing the ``.npy`` files. Other entries are ignored.
    is_train : bool
        When truthy the columns are labelled with ``TRAIN_COL`` (which
        includes the target); otherwise with ``TEST_COL``.

    Returns
    -------
    pd.DataFrame
        float32 feature columns plus int32 ``orbit``, ``subset`` and
        ``pixel`` columns, sorted by those three with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``.npy`` file is found in ``dir_path``.
    """
    # The import cell runs `import tqdm` after `from tqdm import tqdm`, which
    # rebinds the module-level name to the module (not callable). Import the
    # progress-bar callable locally so this function works either way.
    from tqdm import tqdm

    feature_col = TRAIN_COL.copy() if is_train else TEST_COL.copy()

    # Skip stray entries (checkpoints, hidden files) that are not arrays.
    file_list = [name for name in os.listdir(dir_path) if name.endswith('.npy')]

    np_list = []
    for file in tqdm(file_list):
        file_path = os.path.join(dir_path, file)
        nd = np.load(file_path).astype(np.float32)  # 40 x 40 x ?
        nd = nd.reshape(-1, nd.shape[-1])           # 1600    x ?
        # add_pad may promote the dtype when it appends the integer pixel
        # column; cast back per file so the stacked array stays float32
        # (the frame is built as float32 below anyway).
        np_list.append(add_pad(nd, file).astype(np.float32, copy=False))

    if not np_list:
        raise ValueError(f"no .npy files found in {dir_path!r}")

    df = pd.DataFrame(np.vstack(np_list), columns=feature_col, dtype=np.float32)
    id_col = ['orbit', 'subset', 'pixel']
    df[id_col] = df[id_col].astype(np.int32)
    # Directory listing order is arbitrary; sort for a deterministic layout.
    df.sort_values(by=id_col, ignore_index=True, inplace=True)
    return df

In [5]:
# Build the training table (target column included) from the raw train files.
train_df = make_dataframe(dir_path=TRAIN_DIR_PATH, is_train=True)

100%|███████████████████████████████████████████████████████████████████████████| 76345/76345 [07:33<00:00, 168.16it/s]


In [16]:
# Build the test table (no target column) from the raw test files.
test_df = make_dataframe(dir_path=TEST_DIR_PATH, is_train=False)

100%|████████████████████████████████████████████████████████████████████████████| 2416/2416 [00:01<00:00, 1378.41it/s]


In [11]:
# Persist both combined tables as feather files at the configured paths.
train_df.to_feather(TRAIN_FEATHER_PATH)
test_df.to_feather(TEST_FEATHER_PATH)

In [12]:
# Show the (rows, columns) size of the combined training table.
train_df.shape

(122152000, 18)

In [13]:
# Preview the first rows of the sorted training table.
train_df.head()

Unnamed: 0,temp1,temp2,temp3,temp4,temp5,temp6,temp7,temp8,temp9,type,long_GMI,lat_GMI,long_DPR,lat_DPR,precipitation,orbit,subset,pixel
0,174.677109,97.509834,203.541229,143.567032,241.500748,222.171906,170.156082,276.060486,259.399963,0.0,159.494385,5.641016,159.641464,5.574192,0.0,10462,1,1
1,174.911652,96.689583,203.99794,143.49678,240.993332,222.62175,169.170502,274.717133,256.79895,0.0,159.534912,5.609135,159.641464,5.574192,0.0,10462,1,2
2,173.597321,97.981293,204.016159,143.279175,241.584793,221.94043,168.437149,274.44696,257.048035,0.0,159.575806,5.577742,159.641464,5.574192,0.0,10462,1,3
3,174.98616,96.862984,201.832352,141.562363,239.77803,220.447968,166.742813,272.904358,253.840561,0.0,159.617081,5.54684,159.625137,5.532823,0.0,10462,1,4
4,174.257904,96.435356,202.192291,142.080582,239.759964,221.311798,168.050186,272.944885,253.975052,0.0,159.658707,5.516435,159.669113,5.51032,0.0,10462,1,5
