In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from lightgbm import LGBMRegressor

In [2]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet('./' + save_name + '.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# pyarrow 설치 해야함, 9.0.0
csv_to_parquet('C:/Users/soy66/Downloads/open/open/train.csv', 'train')
csv_to_parquet('C:/Users/soy66/Downloads/open/open/test.csv', 'test')

train Done.
test Done.


In [4]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

In [5]:
# [요일, 시작 지점의 회전제한 유무, 도착지점의 회전제한 유무]
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
            
    print(i, le.classes_)
    test[i]=le.transform(test[i])

day_of_week ['금' '목' '수' '월' '일' '토' '화']
start_turn_restricted ['126.5066933' '126.5689231' '126.5692232' '126.5812301' '126.5815291'
 '33.25667094' '33.51983236' '33.52126402' '없음' '있음']
end_turn_restricted ['126.5063927' '126.5066933' '126.5070142' '126.5815291' '126.5918723'
 '33.25667094' '33.49671062' '33.49686337' '33.50147749' '33.50162703'
 '33.51983236' '33.52109762' '33.52126402' '33.52139902' '서호동' '없음' '있음']


In [6]:
y_train = train['target'] 

# [아이디, 날짜, 평균속도, 도로명, 시작지점명, 도착지점명, 통과제한차량]
X_train = train.drop(['id','base_date', 'target','road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

test = test.drop(['id','base_date', 'road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

print(X_train.shape)
print(y_train.shape)
print(test.shape)


(1048575, 20)
(1048575,)
(291241, 16)


In [7]:
A=train[train['road_name']=='-'].index

In [8]:
train.drop(A,axis='index',inplace=True)

In [9]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,TRAIN_0000000,20220623,1,17,1,106,지방도1112호선,0,0,60,...,8,제3교래교,33.42774877,126.662335,15,52,,,,
1,TRAIN_0000001,20220728,1,21,2,103,일반국도11호선,0,0,60,...,9,KAL사거리,33.5048113,126.5262401,15,30,,,,
2,TRAIN_0000002,20211010,4,7,2,103,일반국도16호선,0,0,80,...,8,상창육교,33.2800721,126.3621475,15,61,,,,
3,TRAIN_0000003,20220311,0,13,2,107,태평로,0,0,50,...,8,서현주택,33.2455654,126.5662282,15,20,,,,
4,TRAIN_0000004,20211005,6,8,2,103,일반국도12호선,0,0,80,...,8,애월입구,33.46267677,126.3301518,15,38,,,,


In [10]:
train=train.drop(['Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26'],axis=1)

In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   day_of_week            1048575 non-null  int32  
 1   base_hour              1048575 non-null  int64  
 2   lane_count             1048575 non-null  int64  
 3   road_rating            1048575 non-null  int64  
 4   multi_linked           1048575 non-null  int64  
 5   connect_code           1048575 non-null  int64  
 6   maximum_speed_limit    1048575 non-null  int64  
 7   weight_restricted      1048575 non-null  int64  
 8   height_restricted      1048575 non-null  int64  
 9   road_type              1048575 non-null  int64  
 10  start_latitude         1048575 non-null  object 
 11  start_longitude        1048575 non-null  object 
 12  start_turn_restricted  1048575 non-null  int32  
 13  end_latitude           1048575 non-null  object 
 14  end_longitude     

In [11]:
# 도로명 중 중복값 
#train[train['road_name'].duplicated()]

In [12]:
# 4번 일요일 5번 토요일
train['weekend']=0
train.loc[train['day_of_week'] == 4,'weekend'] = 1
train.loc[train['day_of_week'] == 5,'weekend'] = 1

In [17]:
train['start_latitude'].unique()

array(['33.42774727', '33.50073043', '33.2791451', '33.24608087',
       '33.46221435', '33.2499487', '33.48239171', '33.25307382',
       '33.36171667', '33.4194234', '33.24850523', '33.48570693',
       '33.26411158', '33.47800071', '33.4858849', '33.45242292',
       '33.46222797', '33.46531818', '33.50010333', '33.27816804',
       '33.51846796', '33.25194718', '33.29173073', '33.25060425',
       '33.48194055', '33.48005319', '33.43917961', '33.51439002',
       '33.2455654', '33.39397204', '33.48570433', '33.49007787',
       '33.4710608', '33.45358486', '33.24487138', '33.46492456',
       '33.45173081', '33.52029216', '33.40618113', '33.2996578',
       '33.41294463', '33.48366058', '33.51972627', '33.26506545',
       '33.24699594', '33.38173029', '33.29174712', '33.47505427',
       '33.47349412', '33.43379415', '33.25218326', '33.24760701',
       '33.51157865', '33.25678487', '33.25462828', '33.40891283',
       '33.41947947', '33.3115379', '33.28563828', '33.47630837',
   

In [14]:

d_train = lgb.Dataset(X_train, label=y_train)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mean absolute error'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
clf = lgb.train(params, d_train, 100)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: start_latitude, start_longitude, end_latitude, end_longitude, Unnamed: 23, Unnamed: 25

In [None]:
y_pred=clf.predict(x_test)

In [None]:
params = {
    'n_estimators': [500, 1000], 
    'learning_rate': [0.1, 0.05, 0.01], 
    'max_depth': [7, 8], 
    'colsample_bytree': [0.8, 0.9], 
    'subsample': [0.8, 0.9,],
}


In [None]:
grid_search = GridSearchCV(LGBMRegressor(), params, cv=3, n_jobs=-1, scoring='mean absolute error')
grid_search.fit(X_train, y_train)
