In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

In [2]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet('./' + save_name + '.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# pyarrow 설치 해야함, 9.0.0
csv_to_parquet('C:/Users/soy66/Downloads/open/open/train.csv', 'train')
csv_to_parquet('C:/Users/soy66/Downloads/open/open/test.csv', 'test')

train Done.
test Done.


In [4]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

In [5]:
# [요일, 시작 지점의 회전제한 유무, 도착지점의 회전제한 유무]
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
            
    print(i, le.classes_)
    test[i]=le.transform(test[i])

day_of_week ['금' '목' '수' '월' '일' '토' '화']
start_turn_restricted ['126.5066933' '126.5689231' '126.5692232' '126.5812301' '126.5815291'
 '33.25667094' '33.51983236' '33.52126402' '없음' '있음']
end_turn_restricted ['126.5063927' '126.5066933' '126.5070142' '126.5815291' '126.5918723'
 '33.25667094' '33.49671062' '33.49686337' '33.50147749' '33.50162703'
 '33.51983236' '33.52109762' '33.52126402' '33.52139902' '서호동' '없음' '있음']


In [6]:
y_train = train['target'] 

# [아이디, 날짜, 평균속도, 도로명, 시작지점명, 도착지점명, 통과제한차량]
X_train = train.drop(['id','base_date', 'target','road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

test = test.drop(['id','base_date', 'road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

print(X_train.shape)
print(y_train.shape)
print(test.shape)


(1048575, 20)
(1048575,)
(291241, 16)


In [8]:
A=train[train['road_name']=='-'].index

In [9]:
#df_train.drop(A,axis='index',inplace=True)

In [10]:
train.drop(A,axis='index',inplace=True)

In [11]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,8,제3교래교,33.42774877,126.662335,15,52,,,,
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,9,KAL사거리,33.5048113,126.5262401,15,30,,,,
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,8,상창육교,33.2800721,126.3621475,15,61,,,,
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,8,서현주택,33.2455654,126.5662282,15,20,,,,
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,8,애월입구,33.46267677,126.3301518,15,38,,,,


In [12]:
train=train.drop(['Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26'],axis=1)

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 921337 entries, 0 to 1048574
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   id                     921337 non-null  object
 1   base_date              921337 non-null  int64 
 2   day_of_week            921337 non-null  int32 
 3   base_hour              921337 non-null  int64 
 4   lane_count             921337 non-null  int64 
 5   road_rating            921337 non-null  int64 
 6   road_name              921337 non-null  object
 7   multi_linked           921337 non-null  int64 
 8   connect_code           921337 non-null  int64 
 9   maximum_speed_limit    921337 non-null  int64 
 10  vehicle_restricted     921337 non-null  int64 
 11  weight_restricted      921337 non-null  int64 
 12  height_restricted      921337 non-null  int64 
 13  road_type              921337 non-null  int64 
 14  start_node_name        921337 non-null  object
 15 

In [14]:
# 도로명 중 중복값 
train[train['road_name'].duplicated()]

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
8,TRAIN_0000008,20211004,3,15,2,107,경찰로,0,0,60,...,0,신성교회,33.25307382,126.5063927,8,서호2차현대맨션203동,33.25218326,126.5060688,15,14
9,TRAIN_0000009,20211208,2,2,1,103,일반국도16호선,0,0,50,...,0,양수장,33.36171667,126.7669579,8,제2가시교,33.36433621,126.7694089,15,52
13,TRAIN_0000013,20220507,5,7,2,103,일반국도16호선,0,0,60,...,0,서홍교,33.26411158,126.5540433,8,서흥동사무소,33.26368492,126.5509785,15,28
15,TRAIN_0000015,20220501,4,16,1,103,일반국도16호선,0,0,30,...,0,아라초등학교앞,33.47800071,126.5438429,8,제2아라교,33.47744524,126.5427027,15,32
18,TRAIN_0000018,20220319,5,16,2,106,중산간서로,0,0,70,...,0,광령3교차로,33.46222797,126.4236391,8,고성교차로,33.45796664,126.4100768,15,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,3,거로마을,33.51439002,126.5559133,8,별도교,33.51370852,126.5544828,15,37
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,0,수모루교차로,33.24699594,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,0,선방석재개발(주),33.25100258,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,0,오름식당,33.25364754,126.5114968,8,부동산,33.25453079,126.5118101,15,29


In [23]:
train[train['day_of_week']==5]

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
13,TRAIN_0000013,20220507,5,7,2,103,일반국도16호선,0,0,60,...,0,서홍교,33.26411158,126.5540433,8,서흥동사무소,33.26368492,126.5509785,15,28
18,TRAIN_0000018,20220319,5,16,2,106,중산간서로,0,0,70,...,0,광령3교차로,33.46222797,126.4236391,8,고성교차로,33.45796664,126.4100768,15,56
23,TRAIN_0000023,20220205,5,7,2,107,새서귀로,0,0,60,...,0,한솔고기국수,33.25194718,126.5108937,8,삼주연립101동,33.25104534,126.5105738,15,33
37,TRAIN_0000037,20220702,5,19,3,103,일반국도16호선,0,0,50,...,0,연동사거리,33.48570433,126.4964515,8,신제주초교입구오거리,33.48358859,126.4963679,16,24
54,TRAIN_0000054,20220115,5,2,3,103,일반국도16호선,0,0,70,...,0,제주일보사거리,33.49007787,126.486558,9,남녕고앞사거리,33.48839527,126.4835066,16,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048522,TRAIN_1048522,20220312,5,13,2,106,지방도1118호선,0,0,50,...,3,진은교,33.28608305,126.7209549,8,진은교,33.28563828,126.7209754,15,48
1048525,TRAIN_1048525,20210918,5,13,1,106,지방도1115호선,0,0,60,...,0,제5산록교,33.29174712,126.5050097,8,제6산록교,33.29140408,126.513613,15,52
1048537,TRAIN_1048537,20220528,5,8,2,103,일반국도12호선,0,0,80,...,0,새마을창고,33.53912789,126.8265276,8,한동교차로,33.54008902,126.8251426,15,46
1048549,TRAIN_1048549,20210904,5,12,3,103,일반국도99호선,0,0,70,...,0,제주일고,33.47895896,126.4823071,8,부영아파트,33.47630837,126.4833214,16,33


In [15]:
#train['base_date_datetime'] = pd.to_datetime(train['base_date'],format='%Y%m%d')


In [16]:
# train 의 날짜 슬라이싱 하기
# train 의 

In [17]:
#train['base_date_datetime'] =pd.to_datetime(train['base_date'])

In [18]:
#train[train['base_date_datetime'][-1:-8]]

In [30]:
# 주말 컬럼 집어넣기
train['sat']=train['day_of_week']==5

ValueError: Columns must be same length as key

In [31]:
train['sun']=train['day_of_week']==4

In [32]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,33.42774727,126.662612,8,제3교래교,33.42774877,126.662335,15,52,False,False
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,33.50073043,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,False,False
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,33.2791451,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,False,True
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,33.24608087,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,False,False
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,33.46221435,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,False,False


In [33]:
train

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,33.42774727,126.662612,8,제3교래교,33.42774877,126.662335,15,52,False,False
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,33.50073043,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,False,False
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,33.2791451,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,False,True
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,33.24608087,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,False,False
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,33.46221435,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,33.51439002,126.5559133,8,별도교,33.51370852,126.5544828,15,37,False,False
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,33.24699594,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65,False,False
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,33.25100258,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33,False,False
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,33.25364754,126.5114968,8,부동산,33.25453079,126.5118101,15,29,False,True


In [38]:
train.loc[train['sat'] == False,'sat'] = 0

In [39]:
train.loc[train['sat'] == True,'sat'] = 1

In [40]:
train

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,33.42774727,126.662612,8,제3교래교,33.42774877,126.662335,15,52,0,False
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,33.50073043,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,0,False
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,33.2791451,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,0,True
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,33.24608087,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,0,False
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,33.46221435,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,33.51439002,126.5559133,8,별도교,33.51370852,126.5544828,15,37,0,False
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,33.24699594,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65,0,False
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,33.25100258,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33,0,False
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,33.25364754,126.5114968,8,부동산,33.25453079,126.5118101,15,29,0,True


In [41]:
train.loc[train['sun'] == False,'sun'] = 0

In [42]:
train.loc[train['sun'] == True,'sun'] = 1

In [43]:
train

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,33.42774727,126.662612,8,제3교래교,33.42774877,126.662335,15,52,0,0
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,33.50073043,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,0,0
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,33.2791451,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,0,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,33.24608087,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,0,0
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,33.46221435,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,33.51439002,126.5559133,8,별도교,33.51370852,126.5544828,15,37,0,0
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,33.24699594,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65,0,0
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,33.25100258,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33,0,0
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,33.25364754,126.5114968,8,부동산,33.25453079,126.5118101,15,29,0,1


In [47]:
train.loc[train['day_of_week'] == 5,'weekend'] = 1

In [48]:
train.loc[train['day_of_week'] == 4,'weekend'] = 1

In [46]:
train

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,33.42774727,126.662612,8,제3교래교,33.42774877,126.662335,15,52,0,0
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,33.50073043,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,0,0
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,33.2791451,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,0,1
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,33.24608087,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,0,0
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,33.46221435,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,33.51439002,126.5559133,8,별도교,33.51370852,126.5544828,15,37,0,0
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,33.24699594,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65,0,0
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,33.25100258,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33,0,0
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,33.25364754,126.5114968,8,부동산,33.25453079,126.5118101,15,29,0,1


In [52]:
train['weekend']=train['weekend'].fillna(0)

In [53]:
train

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,sat,sun,weekend
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,126.662612,8,제3교래교,33.42774877,126.662335,15,52,0,0,0.0
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,126.5291068,9,KAL사거리,33.5048113,126.5262401,15,30,0,0,0.0
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,126.3685977,8,상창육교,33.2800721,126.3621475,15,61,0,1,1.0
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,126.5672043,8,서현주택,33.2455654,126.5662282,15,20,0,0,0.0
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,126.3265511,8,애월입구,33.46267677,126.3301518,15,38,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,TRAIN_1048570,20220510,6,22,3,103,일반국도12호선,0,0,50,...,126.5559133,8,별도교,33.51370852,126.5544828,15,37,0,0,0.0
1048571,TRAIN_1048571,20210907,6,1,3,106,지방도1132호선,0,0,70,...,126.5243579,8,용당교차로,33.25510608,126.5408316,15,65,0,0,0.0
1048572,TRAIN_1048572,20220712,6,14,1,107,관광단지로,0,0,30,...,126.4121633,8,선방석재개발(주),33.25181211,126.4126065,15,33,0,0,0.0
1048573,TRAIN_1048573,20211219,4,1,2,107,새서귀로,0,0,60,...,126.5114968,8,부동산,33.25453079,126.5118101,15,29,0,1,1.0
