# Import package

In [1]:
import requests
import xmltodict,json
from urllib.parse import urlencode, unquote, quote_plus
import re
import matplotlib.pyplot as plt
from collections import Counter
from PIL import Image
import seaborn as sns
import networkx as nx
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
import folium
import plotly.graph_objects as go
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
import plotly.express as px
import cufflinks as cf
cf.go_offline(connected=True)
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import f_regression,chi2, SelectKBest
from sklearn.model_selection import cross_val_score # 이값이 높은걸 선택하면 됨
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
import category_encoders as ce 
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
#from tensorflow.keras.utils import to_categorical
from category_encoders import OneHotEncoder
#from sklearn.feature_selection import chi2, SelectKBest

# Notebook display setup that makes results easier to inspect.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%matplotlib inline
# NOTE(review): no explicit `import numpy as np` is visible in this file; the
# `np` name used by later cells is presumably supplied by %pylab — confirm.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# load data

In [2]:
# Read the train and test splits from their parquet files.
train, test = map(pd.read_parquet, ('train.parquet', 'test.parquet'))

# One-hot Encoding

#### 원핫인코딩은 범주형 데이터 (road_name)를 수치형 데이터로 변환하기 위해 사용

#### train과 test의 road_name 범주가 서로 다르므로 두 데이터를 통합한 뒤 원핫 인코딩을 시도했으나, 공모전 주최 측인 DACON에서 이는 Data Leakage에 해당한다고 하여 category_encoders의 OneHotEncoder를 train에만 fit한 뒤 test에는 transform만 적용

In [3]:
#### OneHotEncoder(use_cat_names = True)는 생성되는 원핫 컬럼 이름에 원래 컬럼 이름과 범주 값을 함께 사용하기 위한 옵션

In [4]:
# Fit the one-hot encoder on the training road_name column only, then reuse
# the already-fitted encoder to transform the test road_name column.
encoder = OneHotEncoder(use_cat_names = True)
train_onehot = encoder.fit_transform(train[['road_name']])
test_onehot = encoder.transform(test[['road_name']])

In [5]:
# Place the one-hot columns to the right of each split's original columns.
train_ohe = pd.concat(objs=[train, train_onehot], axis='columns')
test_ohe = pd.concat(objs=[test, test_onehot], axis='columns')

In [6]:
# Persist both one-hot encoded frames as comma-separated files.
for frame, csv_name in ((train_ohe, 'train_ohe.csv'), (test_ohe, 'test_ohe.csv')):
    frame.to_csv(csv_name, sep=',')

#### 메모리에 효율적인 데이터 유형을 사용하여 용량을 크게 줄이고 빠른 작업이 가능한 parquet

In [7]:
def csv_to_parquet(csv_path, save_name):
    """Convert the CSV file at ``csv_path`` into ``./<save_name>.parquet``.

    The intermediate DataFrame is dropped and a garbage collection is forced
    before ``save_name`` and 'Done.' are printed.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)
    # Release the (potentially large) frame right away.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [8]:
# Convert both encoded CSV files to parquet.
for csv_path, save_name in (('./train_ohe.csv', 'train_ohe'), ('./test_ohe.csv', 'test_ohe')):
    csv_to_parquet(csv_path, save_name)

train_ohe Done.
test_ohe Done.


In [9]:
# Reload the encoded frames from the parquet files written above.
train_ohe, test_ohe = map(pd.read_parquet, ('./train_ohe.parquet', './test_ohe.parquet'))

In [10]:
# The CSV round trip stored the index as an extra 'Unnamed: 0' column
# (to_csv was called without index=False); remove it again.
train_ohe = train_ohe.drop(columns='Unnamed: 0')
test_ohe = test_ohe.drop(columns='Unnamed: 0')

In [11]:
# Show (rows, columns) of both encoded frames.
train_ohe.shape
test_ohe.shape

(4701217, 84)

(291241, 83)

# Label Encoding

#### 범주형 변수를 수치형으로 변환하는 작업 수행
* str_col 모두 없음, 있음의 형태를 0,1로 변환

In [12]:
# Label-encode the turn-restriction flags. Each encoder is fitted on train;
# labels that occur only in test are appended to its classes so that the
# test transform does not fail on them.
str_col = ['start_turn_restricted','end_turn_restricted']
for col in str_col:
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    unseen = [label for label in np.unique(test[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])

# Count Encoding

#### 요일, 시작 지점, 도착 지점 모두 범주형 데이터이므로 카운트 인코딩 (train에서의 출현 빈도로 변환)

In [13]:
# Columns to count-encode.
count_features = ['day_of_week','start_node_name', 'end_node_name']

# Fit the count encoder on the training columns only.
count_encoder = ce.CountEncoder(cols=count_features)
count_encoder.fit(train[count_features])

# Add the encoded values as new '<column>_count' columns; the original
# columns are kept.
train = train.join(count_encoder.transform(train[count_features]).add_suffix('_count'))
test = test.join(count_encoder.transform(test[count_features]).add_suffix('_count'))

CountEncoder(cols=['day_of_week', 'start_node_name', 'end_node_name'],
             combine_min_nan_groups=True)

In [14]:
# Apply log(1 + x) to each count-encoded training column.
for count_col in ('day_of_week_count', 'start_node_name_count', 'end_node_name_count'):
    train[count_col] = np.log1p(train[count_col])

In [15]:
# Apply log(1 + x) to each count-encoded test column.
for count_col in ('day_of_week_count', 'start_node_name_count', 'end_node_name_count'):
    test[count_col] = np.log1p(test[count_col])

#### start_node_name과 end_node_name을 합쳐 node_name_Combined 라는 새로운 변수를 생성하여 라벨 인코딩을 통해 시작 지점 -> 도착 지점의 경로 라벨 생성

In [16]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the day-of-week, turn-restriction and node-name columns.
# Each encoder is fitted on train; labels found only in test are appended to
# its classes before the test column is transformed.
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted','start_node_name', 'end_node_name']
for col in str_col:
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    unseen = [label for label in np.unique(test[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])

In [17]:
# Build a "start->end" route key from the two node-name columns and
# label-encode it.
cols = ['start_node_name', 'end_node_name']
# Column-wise string concatenation yields the same "<start>-><end>" text as a
# row-wise apply(axis=1) but avoids calling a Python lambda for every row.
train['node_name_combined'] = train[cols[0]].astype(str) + '->' + train[cols[1]].astype(str)
test['node_name_combined'] = test[cols[0]].astype(str) + '->' + test[cols[1]].astype(str)

str_col = ['node_name_combined']
for i in str_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Routes that occur only in test are appended to the known classes so the
    # transform below does not fail on them.
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i] = le.transform(test[i])

## hour rank (시간대 별 평균 속도 순위 변수 생성)

#### 시간대(base_hour)를 기준으로 그룹화하여 target 평균을 계산하고 내림차순으로 정렬한 df를 생성한 뒤, hour_rank 열을 추가하여 순위를 부여하고 hour_dic 딕셔너리에 저장

In [18]:
# Rank the hours by their mean target on train (rank 1 = highest mean) and
# keep an hour -> rank lookup.
df = train.groupby(['base_hour'])['target'].mean().sort_values(ascending=False).to_frame()
df['hour_rank'] = range(1, df.shape[0] + 1)
hour_dic = {hour: rank for rank, hour in enumerate(df.index, start=1)}

In [19]:
# Map every training row's base_hour to its rank.
# Iterating the column values directly replaces the per-row
# train['base_hour'][i] label lookup, which is slow and only correct while
# the frame has a default 0..n-1 index. An hour missing from hour_dic still
# raises KeyError, and tqdm still reports progress.
train['hour_rank'] = [hour_dic[hour] for hour in tqdm(train['base_hour'])]

100%|█████████████████████████████████████████████████████████████████████| 4701217/4701217 [00:56<00:00, 82789.33it/s]


In [20]:
# Map every test row's base_hour to the rank learned on train.
# Iterating the column values directly replaces the per-row
# test['base_hour'][i] label lookup, which is slow and only correct while
# the frame has a default 0..n-1 index. An hour missing from hour_dic still
# raises KeyError, and tqdm still reports progress.
test['hour_rank'] = [hour_dic[hour] for hour in tqdm(test['base_hour'])]

100%|███████████████████████████████████████████████████████████████████████| 291241/291241 [00:03<00:00, 82027.94it/s]


## day_of_week rank (요일 별 평균 속도 변수 생성)

In [21]:
# Mean target per day_of_week on train, ordered from highest to lowest mean,
# kept as a day -> mean-target lookup (the stored value is the mean itself,
# not an ordinal rank).
df1 = train.groupby(['day_of_week'])['target'].mean().to_frame()
df2 = df1.sort_values(by='target', ascending=False)
day_of_week_dic = {day: df2.loc[day, 'target'] for day in df2.index}

In [22]:
from tqdm import tqdm

# Map every training row's day_of_week to its value in day_of_week_dic.
# Iterating the column values directly replaces the per-row
# train['day_of_week'][i] label lookup, which is slow and only correct while
# the frame has a default 0..n-1 index. A day missing from the dictionary
# still raises KeyError.
train['day_of_week_rank'] = [day_of_week_dic[day] for day in tqdm(train['day_of_week'])]

100%|█████████████████████████████████████████████████████████████████████| 4701217/4701217 [00:58<00:00, 80818.62it/s]


In [23]:
# Map every test row's day_of_week to the value learned on train.
# Iterating the column values directly replaces the per-row
# test['day_of_week'][i] label lookup, which is slow and only correct while
# the frame has a default 0..n-1 index. A day missing from the dictionary
# still raises KeyError.
test['day_of_week_rank'] = [day_of_week_dic[day] for day in tqdm(test['day_of_week'])]

100%|███████████████████████████████████████████████████████████████████████| 291241/291241 [00:03<00:00, 81471.55it/s]


## train, test 데이터 생성

In [24]:
# Columns removed from both encoded frames before modelling.
# NOTE(review): train_ohe / test_ohe were built from train / test before the
# label, count and rank features above were added to train / test, so those
# engineered columns are not part of `data` / `test2` — confirm this is
# intended.
drop_cols = ['id','road_name', 'vehicle_restricted', 'height_restricted', 'start_node_name', 'end_node_name']
data = train_ohe.drop(columns=drop_cols)
test2 = test_ohe.drop(columns=drop_cols)

In [25]:
# Split the modelling frame into features and the target column.
# NOTE(review): despite its name, d_test holds the *target* column of the
# training data here, not test features — verify any later use of d_test.
d_train, d_test = data.drop(columns='target'), data["target"]

# data split

In [28]:
# Features come from the one-hot encoded frame; the target is read from
# `train`.
d_target = train["target"]
d_train = data.drop(columns='target')

In [29]:
# Hold out part of the training data for validation (train_test_split with
# its default split size and a fixed random_state).
x_train, x_test, y_train, y_test = train_test_split(d_train, d_target, random_state=0)

# Show the shapes of the four resulting pieces.
x_train.shape
x_test.shape
y_train.shape
y_test.shape

(3525912, 77)

(1175305, 77)

(3525912,)

(1175305,)

# Machine learning

## k-fold

In [None]:
# KFold is not among the imports at the top of this file, so import it here.
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter shared by the cross-validation loops below.
kf = KFold(n_splits=5, shuffle=True, random_state=2022)

## lgbm

In [None]:
# LightGBM with K-fold cross-validation on x_train / y_train.
# For every fold the model is refitted, scored by MAE on the fold's
# validation part, and its prediction for the competition test features
# (test2) is added to a running average. Negative predictions are clipped
# to 0.
# NOTE: this rebinds the name `lgb`, which the imports bound to the lightgbm
# module.
lgb = LGBMRegressor(n_estimators=1500, random_state=2022, min_child_samples=50)

n_folds = kf.get_n_splits()
mae_list = []
# The prediction buffer is sized to the test features (test2); d_test holds
# the training target column and must not be predicted on.
lgb_pred = np.zeros(test2.shape[0])
for tr_idx, val_idx in tqdm(kf.split(x_train, y_train), total=n_folds):
    tr_x, tr_y = x_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

    lgb.fit(tr_x, tr_y)

    pred = np.clip(lgb.predict(val_x), 0, None)
    sub_pred = np.clip(lgb.predict(test2), 0, None)
    mae = mean_absolute_error(val_y, pred)

    mae_list.append(mae)

    # Average over the actual number of folds rather than a hard-coded 5.
    lgb_pred += sub_pred / n_folds

In [None]:
# Mean of the MAE values collected in mae_list.
np.mean(mae_list)

## xgb

In [None]:
# XGBoost with K-fold cross-validation on x_train / y_train.
# For every fold the model is refitted, scored by MAE on the fold's
# validation part, and its prediction for the hold-out features (x_test) is
# added to a running average. Negative predictions are clipped to 0.
XGB = XGBRegressor(max_depth=4, alpha=10, n_estimators=100, random_state=42)

n_folds = kf.get_n_splits()
mae_list = []
XGB_pred = np.zeros(x_test.shape[0])
for tr_idx, val_idx in tqdm(kf.split(x_train, y_train), total=n_folds):
    tr_x, tr_y = x_train.iloc[tr_idx], y_train.iloc[tr_idx]
    val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

    XGB.fit(tr_x, tr_y)

    pred = np.clip(XGB.predict(val_x), 0, None)
    sub_pred = np.clip(XGB.predict(x_test), 0, None)
    mae = mean_absolute_error(val_y, pred)

    mae_list.append(mae)

    # Accumulate into XGB_pred: adding to lgb_pred would corrupt the LightGBM
    # predictions and leave XGB_pred all zeros.
    XGB_pred += sub_pred / n_folds

In [None]:
# Mean of the MAE values collected in mae_list.
np.mean(mae_list)

## Optuna

In [None]:
import optuna
from optuna.samplers import TPESampler

In [None]:
# MAE of every trial evaluated so far, in evaluation order.
lgb_list = []

def objective_lgb(trial):
    """Optuna objective: fit LightGBM with sampled parameters, return MAE.

    The model is fitted on ``x_train`` / ``y_train`` and scored on the
    hold-out ``x_test`` / ``y_test``. The MAE is also appended to the
    module-level ``lgb_list``.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The hold-out mean absolute error, which the study minimises.
    """
    # NOTE(review): 'depth', 'fold_permutation_block', 'od_pval' and
    # 'l2_leaf_reg' look like CatBoost parameter names rather than LightGBM
    # ones — confirm that LGBMRegressor actually uses them.
    param = {
             'n_estimators' : trial.suggest_int('n_estimators', 300, 3500),
             'depth' : trial.suggest_int('depth', 6, 14),
             'fold_permutation_block' : trial.suggest_int('fold_permutation_block', 1, 256),
             'learning_rate' : trial.suggest_float('learning_rate', 0, 1),
             'od_pval' : trial.suggest_float('od_pval', 0, 1),
             'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0, 4)
    }
    model = LGBMRegressor(**param)
    # `verbose` is not passed to fit(): LightGBM 4.x no longer accepts it
    # there, and it only affected logging.
    model.fit(x_train, y_train)

    lgbpred = model.predict(x_test)
    mae = mean_absolute_error(y_test, lgbpred)
    lgb_list.append(mae)

    # Optuna needs the scalar score of this trial (the original returned the
    # undefined name `cat_list`).
    return mae

In [None]:
# Seeded TPE sampler so the search is repeatable.
sampler = TPESampler(seed=2022)
# Study that minimises the value returned by objective_lgb.
study2 = optuna.create_study(
    study_name="lgb_optimizer",
    direction="minimize",
    sampler=sampler,
)
# Run 5 trials, then report the best objective value and its parameters.
study2.optimize(objective_lgb, n_trials= 5)
print("Best Score:", study2.best_value)
print("Best trial:", study2.best_trial.params)

#### Optuna를 통해 알게된 최적의 하이퍼파라미터로 lgb 모델링

In [None]:
# Fit LightGBM with the hyper-parameters found by the Optuna search and
# predict the hold-out split.
# NOTE(review): 'depth', 'fold_permutation_block', 'od_pval' and
# 'l2_leaf_reg' look like CatBoost parameter names — confirm that
# LGBMRegressor actually uses them.
lgb = LGBMRegressor(n_estimators= 329,
                    depth= 10, 
                    fold_permutation_block= 30,
                    learning_rate= 0.049974018177630475,
                    od_pval= 0.6854075942430917, 
                    l2_leaf_reg= 1.9479522728629215
                    ).fit(x_train, y_train)
pred = lgb.predict(x_test)

# Score the model fitted above (the original referenced an undefined `cat`).
# For a regressor, .score() is presumably R^2 rather than an accuracy —
# confirm before relying on the printed label.
print("훈련 세트 정확도 : {:.3f}".format(lgb.score(x_train,y_train)))
print("테스트 세트 정확도 : {:.3f}".format(lgb.score(x_test,y_test)))

In [None]:
# MAE of `pred` against the hold-out target y_test.
mean_absolute_error(y_test,pred)

## Pycaret

In [None]:
# Recombine the training features and target into one frame.
# NOTE: this rebinds `train`, replacing the frame loaded earlier.
train = pd.concat([x_train,y_train],axis = 1)

In [None]:
from pycaret.regression import *
# Initialise the PyCaret regression experiment with 'target' as the label.
# NOTE: `train` is rebound to whatever setup() returns.
train = setup(data=train, target='target')

#### 여러 모델을 적합하여 성능 비교 (MAE 기준)

In [None]:
# Keep the four best models by MAE. Without n_select, compare_models returns
# only the single best model, which the following cells cannot iterate over
# as a list of models.
top4 = compare_models(sort='MAE', n_select=4)

#### tune_model : 모델의 하이퍼파라미터 튜닝

In [None]:
# Tune each selected model against MAE, the same metric used to rank the
# candidates (tune_model's default optimisation metric is R2).
tuned_top4 = [tune_model(model, optimize='MAE') for model in top4]

#### blend_models : 여러 모델들을 혼합하여 새로운 모델 생성 가능

In [None]:
# Combine the tuned models into a single blended estimator.
blender_top4 = blend_models(estimator_list=tuned_top4)

#### finalize_model : cross_validation을 사용하여 적합한 모델을 전체 데이터로 마지막 학습, 마지막 모델 설정 후 predict_model을 통해 예측

In [None]:
# Finalise the blended model and predict the competition test set.
final_model = finalize_model(blender_top4)
# Predict on test2: it comes from the same one-hot frame, with the same
# columns dropped, as the features used for training. The raw `test` frame
# still has the id / road_name columns and lacks the one-hot columns.
prediction = predict_model(final_model, data=test2)

## 제출

In [None]:
# Write the averaged LightGBM predictions into the submission frame and save
# it without the index.
# NOTE(review): sample_submission is not defined anywhere in the visible
# source — it has to be loaded before this cell runs.
sample_submission['target'] = lgb_pred
sample_submission.to_csv("./submit.csv", index = False)
# Display the frame that was written.
sample_submission