In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

In [2]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet('./' + save_name + '.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# pyarrow 설치 해야함, 9.0.0
csv_to_parquet('C:/Users/soy66/Downloads/open/open/train.csv', 'train')
csv_to_parquet('C:/Users/soy66/Downloads/open/open/test.csv', 'test')

train Done.
test Done.


In [4]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

In [5]:
# [요일, 시작 지점의 회전제한 유무, 도착지점의 회전제한 유무]
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
            
    print(i, le.classes_)
    test[i]=le.transform(test[i])

day_of_week ['금' '목' '수' '월' '일' '토' '화']
start_turn_restricted ['126.5066933' '126.5689231' '126.5692232' '126.5812301' '126.5815291'
 '33.25667094' '33.51983236' '33.52126402' '없음' '있음']
end_turn_restricted ['126.5063927' '126.5066933' '126.5070142' '126.5815291' '126.5918723'
 '33.25667094' '33.49671062' '33.49686337' '33.50147749' '33.50162703'
 '33.51983236' '33.52109762' '33.52126402' '33.52139902' '서호동' '없음' '있음']


In [6]:
y_train = train['target'] 

# [아이디, 날짜, 평균속도, 도로명, 시작지점명, 도착지점명, 통과제한차량]
X_train = train.drop(['id','base_date', 'target','road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

test = test.drop(['id','base_date', 'road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

print(X_train.shape)
print(y_train.shape)
print(test.shape)


(1048575, 20)
(1048575,)
(291241, 16)


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 27 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   id                     1048575 non-null  object 
 1   base_date              1048575 non-null  int64  
 2   day_of_week            1048575 non-null  int32  
 3   base_hour              1048575 non-null  int64  
 4   lane_count             1048575 non-null  int64  
 5   road_rating            1048575 non-null  int64  
 6   road_name              1048575 non-null  object 
 7   multi_linked           1048575 non-null  int64  
 8   connect_code           1048575 non-null  int64  
 9   maximum_speed_limit    1048575 non-null  int64  
 10  vehicle_restricted     1048575 non-null  int64  
 11  weight_restricted      1048575 non-null  int64  
 12  height_restricted      1048575 non-null  int64  
 13  road_type              1048575 non-null  int64  
 14  start_node_name   

In [10]:
A=train[train['road_name']=='-'].index

In [None]:
#df_train.drop(A,axis='index',inplace=True)

In [11]:
train.drop(A,axis='index',inplace=True)

In [13]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,8,제3교래교,33.42774877,126.662335,15,52,,,,
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,9,KAL사거리,33.5048113,126.5262401,15,30,,,,
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,8,상창육교,33.2800721,126.3621475,15,61,,,,
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,8,서현주택,33.2455654,126.5662282,15,20,,,,
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,8,애월입구,33.46267677,126.3301518,15,38,,,,


In [14]:
train=train.drop(['Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26'],axis=1)

In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 921337 entries, 0 to 1048574
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   id                     921337 non-null  object
 1   base_date              921337 non-null  int64 
 2   day_of_week            921337 non-null  int32 
 3   base_hour              921337 non-null  int64 
 4   lane_count             921337 non-null  int64 
 5   road_rating            921337 non-null  int64 
 6   road_name              921337 non-null  object
 7   multi_linked           921337 non-null  int64 
 8   connect_code           921337 non-null  int64 
 9   maximum_speed_limit    921337 non-null  int64 
 10  vehicle_restricted     921337 non-null  int64 
 11  weight_restricted      921337 non-null  int64 
 12  height_restricted      921337 non-null  int64 
 13  road_type              921337 non-null  int64 
 14  start_node_name        921337 non-null  object
 15 

In [16]:
# 도로명 중 중복값 
train['road_name']

0          지방도1112호선
1           일반국도11호선
2           일반국도16호선
3                태평로
4           일반국도12호선
             ...    
1048570     일반국도12호선
1048571    지방도1132호선
1048572        관광단지로
1048573         새서귀로
1048574          번영로
Name: road_name, Length: 921337, dtype: object