In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gc

In [2]:
# Helper that converts a CSV file into a parquet file.

def csv_to_parquet(csv_path, save_name):
    """Read the CSV at ``csv_path`` and write it to ``./<save_name>.parquet``.

    Parameters
    ----------
    csv_path : path of the CSV file handed to ``pd.read_csv``.
    save_name : base name (no extension) of the parquet file written to the
        current working directory.

    The intermediate DataFrame is deleted and a garbage collection is forced
    before returning, then ``save_name`` and 'Done.' are printed.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = ''.join(['./', save_name, '.parquet'])
    frame.to_parquet(parquet_path)
    # Release the frame right away; the caller only needs the file on disk.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [5]:
# Convert the raw CSV files to parquet.
# NOTE(review): the data directory is an absolute, machine-specific path.

raw_data_dir = 'C:/Users/knzss/Documents/ds_study/project/ds5-ml-repo-1/data'
for dataset_name in ('train', 'test'):
    csv_to_parquet(f'{raw_data_dir}/{dataset_name}.csv', dataset_name)

train Done.
test Done.


In [7]:
# Load the converted training data.
# test.csv has no target column at all, so evaluation data is split off from train instead.

train = pd.read_parquet('./train.parquet')
# test = pd.read_parquet('./test.parquet')

In [8]:
# Preview the first five rows of the loaded training frame.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [9]:
# Preprocessing: label-encode the string-valued columns
# [day of week, turn restriction at the start node, turn restriction at the end node]
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
for col_name in str_col:
    # A fresh encoder per column, fitted on that column's own values.
    encoder = LabelEncoder().fit(train[col_name])
    train[col_name] = encoder.transform(train[col_name])
    # Print the learned classes; a value's integer code is its position in this array.
    print(col_name, encoder.classes_)

day_of_week ['금' '목' '수' '월' '일' '토' '화']
start_turn_restricted ['없음' '있음']
end_turn_restricted ['없음' '있음']


In [10]:
# Preview again: day_of_week and the two turn-restriction columns now hold integer codes.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,0,제3교래교,33.427749,126.662335,0,52.0
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.50073,126.529107,1,KAL사거리,33.504811,126.52624,0,30.0
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,0,상창육교,33.280072,126.362147,0,61.0
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,0,서현주택,33.245565,126.566228,0,20.0
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,0,애월입구,33.462677,126.330152,0,38.0


In [11]:
# Add a 0/1 column that flags weekend rows.
# With the label encoding printed above, day_of_week code 4 is '일' (Sunday)
# and code 5 is '토' (Saturday).
weekend_codes = [4, 5]
train['weekend'] = train['day_of_week'].isin(weekend_codes).astype('int64')

In [12]:
# Preview again to check the new weekend column.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,제3교래교,33.427747,126.662612,0,제3교래교,33.427749,126.662335,0,52.0,0
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,광양사거리,33.50073,126.529107,1,KAL사거리,33.504811,126.52624,0,30.0,0
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,창고천교,33.279145,126.368598,0,상창육교,33.280072,126.362147,0,61.0,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,남양리조트,33.246081,126.567204,0,서현주택,33.245565,126.566228,0,20.0,0
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,애월샷시,33.462214,126.326551,0,애월입구,33.462677,126.330152,0,38.0,0


In [13]:
# Add a month column.
# base_date is an integer of the form YYYYMMDD: dropping the last two digits
# (// 100) and keeping the next two (% 100) yields the month. This runs on the
# whole column at once instead of converting every row to a string.
train['month'] = train['base_date'] // 100 % 100

In [14]:
# Preview again to check the new month column.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,33.427747,126.662612,0,제3교래교,33.427749,126.662335,0,52.0,0,6
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,33.50073,126.529107,1,KAL사거리,33.504811,126.52624,0,30.0,0,7
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,33.279145,126.368598,0,상창육교,33.280072,126.362147,0,61.0,1,10
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,33.246081,126.567204,0,서현주택,33.245565,126.566228,0,20.0,0,3
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,33.462214,126.326551,0,애월입구,33.462677,126.330152,0,38.0,0,10


In [15]:
# Save the cleaned-up DataFrame as a parquet checkpoint in the working directory.

train.to_parquet('./train_pwm.parquet')
print('Done.')

Done.


In [16]:
# Reload the checkpoint written above and preview it.
# NOTE(review): pandas is imported again here, presumably so this cell can be the
# starting point on a fresh kernel — confirm before removing the duplicate import.
import pandas as pd

train = pd.read_parquet('./train_pwm.parquet')
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,33.427747,126.662612,0,제3교래교,33.427749,126.662335,0,52.0,0,6
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,33.50073,126.529107,1,KAL사거리,33.504811,126.52624,0,30.0,0,7
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,33.279145,126.368598,0,상창육교,33.280072,126.362147,0,61.0,1,10
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,33.246081,126.567204,0,서현주택,33.245565,126.566228,0,20.0,0,3
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,33.462214,126.326551,0,애월입구,33.462677,126.330152,0,38.0,0,10


In [18]:
from pytimekr import pytimekr


def _to_yyyymmdd(day):
    """Return ``day`` as an integer YYYYMMDD by stripping the dashes from ``str(day)``."""
    return int(str(day).replace('-', ''))


year = [2021, 2022]

# Single-day holidays, in the order they are appended for each year.
# The original notes label this first group "substitute holiday exists"; only the
# holiday date itself is added here, no substitute day is computed.
substitute_holidays = (
    pytimekr.hangul,        # Hangul Day
    pytimekr.children,      # Children's Day
    pytimekr.independence,  # Liberation Day
    pytimekr.samiljeol,     # March 1st Movement Day
)
# The original notes label this second group "no substitute holiday".
# NOTE(review): Constitution Day may not be a non-working day — confirm that it
# belongs in a list used to mark days off.
fixed_holidays = (
    pytimekr.memorial,      # Memorial Day
    pytimekr.buddha,        # Buddha's Birthday
    pytimekr.constitution,  # Constitution Day
    pytimekr.christmas,     # Christmas
    pytimekr.newyear,       # January 1st
)

chuseok = []
lunar_newyear = []
y_list = []
x_list = []

for df_year in year:
    # red_days() turns the festival date into a list of dates (three consecutive
    # days per year in the output below); every one of them is recorded.
    chuseok.extend(
        _to_yyyymmdd(day) for day in pytimekr.red_days(pytimekr.chuseok(year=df_year))
    )
    lunar_newyear.extend(
        _to_yyyymmdd(day) for day in pytimekr.red_days(pytimekr.lunar_newyear(year=df_year))
    )

    y_list.extend(_to_yyyymmdd(holiday(year=df_year)) for holiday in substitute_holidays)
    x_list.extend(_to_yyyymmdd(holiday(year=df_year)) for holiday in fixed_holidays)

holidays_dic = {
    'chuseok': chuseok,
    'lunar_newyear': lunar_newyear,
    'y_list': y_list,
    'x_list': x_list,
}

# Flatten the groups, in dict insertion order, into one list of YYYYMMDD integers.
holidays_list = [day for group in holidays_dic.values() for day in group]

holidays_list

[20210920,
 20210921,
 20210922,
 20220909,
 20220910,
 20220911,
 20210211,
 20210212,
 20210213,
 20220131,
 20220201,
 20220202,
 20211009,
 20210505,
 20210815,
 20210301,
 20221009,
 20220505,
 20220815,
 20220301,
 20210606,
 20210519,
 20210717,
 20211225,
 20210101,
 20220606,
 20220508,
 20220717,
 20221225,
 20220101]

In [19]:
# Rows whose base_date appears in holidays_list get weekend = 1, so the weekend
# column also covers public holidays.
# A single isin() mask replaces one full-column comparison per holiday date.
train.loc[train['base_date'].isin(holidays_list), 'weekend'] = 1

In [20]:
# Save the frame, with holidays folded into the weekend column, as the final parquet file.
# NOTE(review): this is an absolute, machine-specific path, whereas the earlier
# checkpoints are written relative to the working directory.
train.to_parquet('C:/Users/knzss/Documents/ds_study/project/ds5-ml-repo-1/data/train_fin.parquet')
print('Done.')

Done.


In [None]:
# One-hot encode the month and the time-of-day block.
# get_dummies(prefix=...) names the columns m_<month> / h_<block> directly, so no
# follow-up rename keyed on the raw integer labels is needed.
month_dummies = pd.get_dummies(train['month'], prefix='m')

# Hours are grouped into four 6-hour blocks:
#   h_0: 22:00 - 04:00
#   h_1: 04:00 - 10:00
#   h_2: 10:00 - 16:00
#   h_3: 16:00 - 22:00
# (hour + 2) // 6 maps hours 0-21 onto blocks 0-3; hours 22 and above go to block 0.
# Assumes base_hour holds non-negative integer hours — TODO confirm.
base_hour = train['base_hour']
hour_block = ((base_hour + 2) // 6).where(base_hour < 22, 0)
hour_dummies = pd.get_dummies(hour_block, prefix='h')

# Append both sets of indicator columns and drop the columns they replace.
train = pd.concat([train, month_dummies, hour_dummies], axis=1).drop(['base_hour', 'month'], axis=1)