In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gc

In [2]:
# Helper that converts a CSV file into a Parquet file.

def csv_to_parquet(csv_path, save_name):
    """Read a CSV with pandas and write it to ``./<save_name>.parquet``.

    Args:
        csv_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        save_name: Base name (without extension) of the Parquet file that is
            written into the current directory; also echoed in the log line.

    Returns:
        None. The function is used for its side effects only: the Parquet
        file on disk and the ``"<save_name> Done."`` message on stdout.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = ''.join(('./', save_name, '.parquet'))
    frame.to_parquet(parquet_path)
    # Drop the frame and force a collection so the memory is released
    # before the next file is converted.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# Convert the raw CSV files to Parquet (one call per dataset).

for dataset in ('train', 'test'):
    csv_to_parquet(f'./{dataset}.csv', dataset)

train Done.
test Done.


In [4]:
# Load the data.
# The test file has no target column at all, so evaluation data is split off from train instead.

train = pd.read_parquet('./train.parquet')
# test = pd.read_parquet('./test.parquet')

In [5]:
# Preview the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [6]:
# Data preprocessing
# Label-encode the string columns:
# [day of week, turn restriction at the start node, turn restriction at the end node]
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
# Keep every fitted encoder, keyed by column name, so exactly the same mapping
# can later be applied to other data or undone with inverse_transform.
# Previously each encoder was thrown away at the end of its iteration.
label_encoders = {}
for i in str_col:
    le = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one pass.
    train[i] = le.fit_transform(train[i])
    label_encoders[i] = le
    # Print the learned classes; the position in this array is the integer code.
    print(i, le.classes_)

day_of_week ['금' '목' '수' '월' '일' '토' '화']
start_turn_restricted ['없음' '있음']
end_turn_restricted ['없음' '있음']


In [7]:
# Add a column flagging weekend rows.
# With the label encoding printed above, day_of_week 4 is '일' (Sunday)
# and 5 is '토' (Saturday); every other day keeps the flag at 0.
train['weekend'] = train['day_of_week'].isin([4, 5]).astype('int64')

In [8]:
# Add a month column.
# base_date is an integer in YYYYMMDD form, so the month is its middle two
# digits. Integer arithmetic is vectorised over the whole column, unlike the
# per-row str()/slice conversion through apply that it replaces.
train['month'] = (train['base_date'] // 100) % 100

In [9]:
# List the distinct month values extracted from base_date.
train['month'].unique()

array([ 6,  7, 10,  3,  9,  1, 12,  5,  2, 11], dtype=int64)

In [10]:
# Create the 'weather' column (it holds a season code) with 0 as the initial value.
train['weather']=0

In [11]:
# Season classification based on month: December, January and February -> 1.
train.loc[train['month'].isin([12, 1, 2]), 'weather'] = 1

In [12]:
# Check which season codes are present in 'weather' at this point.
train['weather'].unique()

array([0, 1], dtype=int64)

In [13]:
# Assign the remaining season codes by month:
# March-May -> 2, June-August -> 3, September-November -> 4.
train.loc[train['month'].isin([3, 4, 5]), 'weather'] = 2
train.loc[train['month'].isin([6, 7, 8]), 'weather'] = 3
train.loc[train['month'].isin([9, 10, 11]), 'weather'] = 4


In [14]:
# Season codes stored in 'weather': months 3-5 spring (2), 6-8 summer (3), 9-11 autumn (4), 12-2 winter (1)

In [15]:
# List the distinct season codes now stored in 'weather'.
train['weather'].unique()

array([3, 4, 2, 1], dtype=int64)

In [16]:
# Preview the frame, including the weekend, month and weather columns added above.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month,weather
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,126.662612,0,제3교래교,33.427749,126.662335,0,52.0,0,6,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,126.529107,1,KAL사거리,33.504811,126.52624,0,30.0,0,7,3
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,126.368598,0,상창육교,33.280072,126.362147,0,61.0,1,10,4
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,126.567204,0,서현주택,33.245565,126.566228,0,20.0,0,3,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,126.326551,0,애월입구,33.462677,126.330152,0,38.0,0,10,4


In [17]:
# Bucket the hour of day into 5-hour blocks (0-4, 5-9, 10-14, 15-19, 20 and later)

In [18]:
# Create the 'base_time' column (hour-of-day bucket) with 0 as the initial value.
train['base_time']=0

In [19]:
# List the distinct base_hour values.
train['base_hour'].unique()

array([17, 21,  7, 13,  8,  0, 16, 15,  2, 11, 10, 22, 12, 23, 14,  6,  9,
       19,  1,  4, 20, 18,  3,  5], dtype=int64)

In [20]:
# Map base_hour onto a bucket index stored in 'base_time'.
# Each of the first four buckets spans five hours (bounds inclusive);
# the last one takes every hour from 20 upwards.
train.loc[train['base_hour'].between(0, 4), 'base_time'] = 0
train.loc[train['base_hour'].between(5, 9), 'base_time'] = 1
train.loc[train['base_hour'].between(10, 14), 'base_time'] = 2
train.loc[train['base_hour'].between(15, 19), 'base_time'] = 3
train.loc[train['base_hour'].ge(20), 'base_time'] = 4

In [21]:
# Preview the frame, now including the base_time column.
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month,weather,base_time
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,0,제3교래교,33.427749,126.662335,0,52.0,0,6,3,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,1,KAL사거리,33.504811,126.52624,0,30.0,0,7,3,4
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,상창육교,33.280072,126.362147,0,61.0,1,10,4,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,서현주택,33.245565,126.566228,0,20.0,0,3,2,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월입구,33.462677,126.330152,0,38.0,0,10,4,1


In [22]:
# List all column names currently in train.
train.columns

Index(['id', 'base_date', 'day_of_week', 'base_hour', 'lane_count',
       'road_rating', 'road_name', 'multi_linked', 'connect_code',
       'maximum_speed_limit', 'vehicle_restricted', 'weight_restricted',
       'height_restricted', 'road_type', 'start_node_name', 'start_latitude',
       'start_longitude', 'start_turn_restricted', 'end_node_name',
       'end_latitude', 'end_longitude', 'end_turn_restricted', 'target',
       'weekend', 'month', 'weather', 'base_time'],
      dtype='object')

In [46]:
# List the distinct lane_count values.
train['lane_count'].unique()

array([1, 2, 3], dtype=int64)

In [None]:
# Constant-column check. The original note recorded array([0.]) as the only
# value found in vehicle_restricted and height_restricted. The bare column
# names used before are not Python variables and raised NameError when the
# cell was run, so look the columns up on train instead.
{col: train[col].unique() for col in ('vehicle_restricted', 'height_restricted')}

In [None]:
# train2: deep copy of train, with the latitude/longitude columns removed

In [22]:
import copy

In [23]:
# Work on an independent copy so that later edits to train2 leave train untouched.
train2 = train.copy(deep=True)

In [24]:
# Display the copied frame (pandas renders a truncated view of it).
train2

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month,weather,base_time
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,0,제3교래교,33.427749,126.662335,0,52.0,0,6,3,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,1,KAL사거리,33.504811,126.526240,0,30.0,0,7,3,4
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,상창육교,33.280072,126.362147,0,61.0,1,10,4,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,서현주택,33.245565,126.566228,0,20.0,0,3,2,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월입구,33.462677,126.330152,0,38.0,0,10,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,TRAIN_4701212,20211104,1,16,1,107,-,0,0,50.0,...,0,금덕해운,33.420955,126.273750,0,20.0,0,11,4,3
4701213,TRAIN_4701213,20220331,1,2,2,107,-,0,0,80.0,...,0,광삼교,33.472525,126.424890,0,65.0,0,3,2,0
4701214,TRAIN_4701214,20220613,3,22,2,103,일반국도12호선,0,0,60.0,...,0,성산교차로,33.444121,126.912948,0,30.0,0,6,3,4
4701215,TRAIN_4701215,20211020,2,2,2,103,일반국도95호선,0,0,80.0,...,0,관광대학입구,33.444996,126.433332,0,73.0,0,10,4,0


In [26]:
# Remove the start/end coordinate columns from the copy.
train2 = train2.drop(
    columns=['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
)

In [28]:
# Preview train2 after the coordinate columns were dropped.
train2.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_turn_restricted,end_node_name,end_turn_restricted,target,weekend,month,weather,base_time
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,0,제3교래교,0,52.0,0,6,3,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,1,KAL사거리,0,30.0,0,7,3,4
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,0,상창육교,0,61.0,1,10,4,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,남양리조트,0,서현주택,0,20.0,0,3,2,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,0,애월입구,0,38.0,0,10,4,1


In [29]:
# Save the cleaned-up dataframe (train2) as a Parquet file.

train2.to_parquet('./train2_pwm.parquet')
print('Done.')

Done.


In [30]:
# Reload train2 from the Parquet file written above and preview it.
train2 = pd.read_parquet('./train2_pwm.parquet')
train2.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_turn_restricted,end_node_name,end_turn_restricted,target,weekend,month,weather,base_time
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,0,제3교래교,0,52.0,0,6,3,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,1,KAL사거리,0,30.0,0,7,3,4
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,0,상창육교,0,61.0,1,10,4,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,남양리조트,0,서현주택,0,20.0,0,3,2,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,0,애월입구,0,38.0,0,10,4,1


In [25]:
# List the distinct connect_code values.
train['connect_code'].unique()

array([  0, 103], dtype=int64)

In [None]:
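# maximum_speed_limit = maximum speed limit

# weight_restricted = weight limit for passing vehicles

# height_restricted = height limit for passing vehicles; connect_code = connecting-road code (columns that could be dropped)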

In [23]:
# Print the index range, column names and dtypes of train.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4701217 entries, 0 to 4701216
Data columns (total 27 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   id                     object 
 1   base_date              int64  
 2   day_of_week            int32  
 3   base_hour              int64  
 4   lane_count             int64  
 5   road_rating            int64  
 6   road_name              object 
 7   multi_linked           int64  
 8   connect_code           int64  
 9   maximum_speed_limit    float64
 10  vehicle_restricted     float64
 11  weight_restricted      float64
 12  height_restricted      float64
 13  road_type              int64  
 14  start_node_name        object 
 15  start_latitude         float64
 16  start_longitude        float64
 17  start_turn_restricted  int32  
 18  end_node_name          object 
 19  end_latitude           float64
 20  end_longitude          float64
 21  end_turn_restricted    int32  
 22  target            

In [24]:
# Save the cleaned-up dataframe (train, coordinates included) as a Parquet file.

train.to_parquet('./train_pwm.parquet')
print('Done.')

Done.


In [25]:

# Reload train from the Parquet file written above and preview it.
train = pd.read_parquet('./train_pwm.parquet')
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,weekend,month,weather,base_time
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60.0,...,0,제3교래교,33.427749,126.662335,0,52.0,0,6,3,3
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60.0,...,1,KAL사거리,33.504811,126.52624,0,30.0,0,7,3,4
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80.0,...,0,상창육교,33.280072,126.362147,0,61.0,1,10,4,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50.0,...,0,서현주택,33.245565,126.566228,0,20.0,0,3,2,2
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80.0,...,0,애월입구,33.462677,126.330152,0,38.0,0,10,4,1
