<a href="https://colab.research.google.com/github/Dkepffl/Dacon/blob/main/HD_AI_Challenge/HD_FE_1017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **HD 현대 AI Challenge**
- 구글 드라이브 마운트를 통해 데이터 로드
- 리더보드 최저 점수(26.8103796658) 코드


## **| 구글 드라이브 마운트**

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files under MyDrive can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## **| 필요한 라이브러리 로드**

In [None]:
# import libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os

## **| 작업 폴더 변경 및 데이터 로드**

In [None]:
# Project folder on the mounted Drive; adjust to match your own Drive layout.
PROJECT_DIR = "/content/drive/MyDrive/HD현대 AI Challenge"
os.chdir(PROJECT_DIR)
# Echo the working directory so the change is visible in the cell output.
os.getcwd()

'/content/drive/MyDrive/HD현대 AI Challenge'

In [None]:
# Read both splits relative to the working directory and discard SAMPLE_ID
# (presumably a pure row identifier -- confirm against the data description).
train = pd.read_csv('Data/train.csv').drop(columns='SAMPLE_ID')
test = pd.read_csv('Data/test.csv').drop(columns='SAMPLE_ID')

In [None]:
# Keep separate copies of the freshly loaded frames so the later
# preprocessing steps do not touch them.
train_proto, test_proto = train.copy(), test.copy()

## **| 데이터 전처리**

### **1. ATA 변수 처리**

In [None]:
from datetime import datetime

# Output column name -> `.dt` accessor attribute extracted from ATA.
ATA_PARTS = {
  'YEAR': 'year',
  'MONTH': 'month',
  'DAY': 'day',
  'HOUR': 'hour',
  'MINUTE': 'minute',
  'WEEKDAY': 'weekday',
}

for df in (train, test):
  # Parse the ATA column into datetimes, then expand it into calendar features.
  df['ATA'] = pd.to_datetime(df['ATA'])
  for column, part in ATA_PARTS.items():
    df[column] = getattr(df['ATA'].dt, part)

In [None]:
# ATA has been expanded into YEAR/MONTH/DAY/HOUR/MINUTE/WEEKDAY, so remove
# the raw timestamp column from both frames (in place).
for frame in (train, test):
  frame.drop(columns='ATA', inplace=True)

### **2. 파생 변수 추가**

In [None]:
# Distinct ARI_PO values of each split, in order of first appearance.
port_list1 = train['ARI_PO'].drop_duplicates().tolist()
port_list2 = test['ARI_PO'].drop_duplicates().tolist()

#### **항구별 월 BN 평균값 및 표준편차**

In [None]:
# Mean and standard deviation of BN for every (ARI_PO, MONTH) pair that occurs.
# Each split is summarised from its own rows.
bn_group_keys = ['ARI_PO', 'MONTH']
bn_statistics = ['mean', 'std']
BN_dist_train = train.groupby(bn_group_keys)['BN'].agg(bn_statistics).reset_index()
BN_dist_test = test.groupby(bn_group_keys)['BN'].agg(bn_statistics).reset_index()

In [None]:
# Replace missing statistics with the sentinel -1 in both lookup tables
# (in place). A group with a single BN observation has an undefined sample
# standard deviation, for example.
for bn_table in (BN_dist_train, BN_dist_test):
  bn_table.fillna(-1, inplace=True)

In [None]:
# Attach the per-(ARI_PO, MONTH) BN mean/std from BN_dist_train to every train row.
#
# The previous nested loop over all 12 months x all ports raised IndexError on
# `.values[0]` whenever a port had no rows in some month, because that pair is
# absent from BN_dist_train. A left merge on the two keys only looks up pairs
# that actually occur in `train`, and does it in a single pass.
bn_stats_train = BN_dist_train.rename(columns={'mean': 'BN_MEAN', 'std': 'BN_STD'})
bn_joined_train = train[['ARI_PO', 'MONTH']].merge(
  bn_stats_train[['ARI_PO', 'MONTH', 'BN_MEAN', 'BN_STD']],
  on=['ARI_PO', 'MONTH'],
  how='left',
  validate='many_to_one',  # the lookup table has one row per key pair
)
# A left merge keeps the row order of `train`; assign positionally so the
# result does not depend on the index of `train`.
train['BN_MEAN'] = bn_joined_train['BN_MEAN'].to_numpy()
train['BN_STD'] = bn_joined_train['BN_STD'].to_numpy()

In [None]:
# Attach the per-(ARI_PO, MONTH) BN mean/std from BN_dist_test to every test row.
#
# The previous nested loop over all 12 months x all ports raised IndexError on
# `.values[0]` for any (month, port) pair that does not occur in `test`.
# A left merge only looks up pairs that are present and runs in a single pass.
bn_stats_test = BN_dist_test.rename(columns={'mean': 'BN_MEAN', 'std': 'BN_STD'})
bn_joined_test = test[['ARI_PO', 'MONTH']].merge(
  bn_stats_test[['ARI_PO', 'MONTH', 'BN_MEAN', 'BN_STD']],
  on=['ARI_PO', 'MONTH'],
  how='left',
  validate='many_to_one',  # the lookup table has one row per key pair
)
# A left merge keeps the row order of `test`; assign positionally so the
# result does not depend on the index of `test`.
test['BN_MEAN'] = bn_joined_test['BN_MEAN'].to_numpy()
test['BN_STD'] = bn_joined_test['BN_STD'].to_numpy()

In [None]:
# Show column dtypes and non-null counts of `train` at this point.
train.info()

#### **ARI_PO 별 DIST 통계량**

In [None]:
# Mean and standard deviation of DIST per ARI_PO.
# NOTE(review): DIST_dist_test is also computed from `train`, not `test`.
# Confirm whether this is intentional (train-only statistics) or a copy-paste
# slip; the behaviour is kept as-is here.
dist_by_port = train.groupby('ARI_PO')['DIST']
DIST_dist_train = dist_by_port.agg(['mean', 'std']).reset_index()
DIST_dist_test = dist_by_port.agg(['mean', 'std']).reset_index()

In [None]:
# Spot-check: display the mean DIST stored for port 'EKP8' in the train lookup table.
DIST_dist_train.loc[DIST_dist_train['ARI_PO']=='EKP8','mean'].values[0]

14.408828688066436

In [None]:
# Attach the per-port DIST mean/std to every row of both frames.
#
# The previous per-port loops re-scanned the whole frame once per port and
# raised IndexError on `.values[0]` for any port missing from the lookup table
# (possible for `test`, whose table is derived from `train`). Mapping ARI_PO
# through the table does the lookup in one pass; ports without an entry get
# NaN instead of aborting the cell.
dist_lookup_train = DIST_dist_train.set_index('ARI_PO')
train['DIST_MEAN'] = train['ARI_PO'].map(dist_lookup_train['mean'])
train['DIST_STD'] = train['ARI_PO'].map(dist_lookup_train['std'])

dist_lookup_test = DIST_dist_test.set_index('ARI_PO')
test['DIST_MEAN'] = test['ARI_PO'].map(dist_lookup_test['mean'])
test['DIST_STD'] = test['ARI_PO'].map(dist_lookup_test['std'])

### **3. 카테고리 변수 라벨 인코딩**

In [None]:
from sklearn.preprocessing import LabelEncoder
import bisect
from tqdm import tqdm

categorical_features = ['ARI_CO','ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER','FLAG']
encoders = {}

# Placeholder class assigned to test values that never appear in train.
UNSEEN_LABEL = '-1'

# Label-encode each categorical column: fit on train, then encode test with
# the same encoder, sending categories unseen in train to UNSEEN_LABEL.
for feature in tqdm(categorical_features, desc="Encoding features"):
  le = LabelEncoder()
  train[feature] = le.fit_transform(train[feature].astype(str))
  le_classes_set = set(le.classes_)

  # Cast test values to str BEFORE the membership check, exactly as was done
  # for train. Checking the raw values made every non-string value (e.g. NaN,
  # which train encodes as the class 'nan') fall through to UNSEEN_LABEL.
  test_as_str = test[feature].astype(str)
  test[feature] = test_as_str.map(lambda s: s if s in le_classes_set else UNSEEN_LABEL)

  # Register the placeholder as a class, keeping classes_ sorted. Skip this if
  # train already contains that literal value, so classes_ stays duplicate-free.
  if UNSEEN_LABEL not in le_classes_set:
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, UNSEEN_LABEL)
    le.classes_ = np.array(le_classes)

  test[feature] = le.transform(test[feature].astype(str))
  encoders[feature] = le

Encoding features: 100%|██████████| 6/6 [00:02<00:00,  2.92it/s]


### **4. 미사용 컬럼 제거**

In [None]:
# Columns excluded from modelling; removed from both frames in place.
unused_columns = ['ID', 'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BUILT']
for frame in (train, test):
  frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Replace every remaining missing value with the sentinel -1 (new frames are
# bound to the same names).
train, test = train.fillna(-1), test.fillna(-1)

In [None]:
# Show the final column list, dtypes and non-null counts of `train` after preprocessing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 391939 entries, 0 to 391938
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ARI_CO              391939 non-null  int64  
 1   ARI_PO              391939 non-null  int64  
 2   SHIP_TYPE_CATEGORY  391939 non-null  int64  
 3   DIST                391939 non-null  float64
 4   BREADTH             391939 non-null  float64
 5   DEADWEIGHT          391939 non-null  int64  
 6   DEPTH               391939 non-null  float64
 7   DRAUGHT             391939 non-null  float64
 8   GT                  391939 non-null  int64  
 9   LENGTH              391939 non-null  float64
 10  BN                  391939 non-null  float64
 11  ATA_LT              391939 non-null  int64  
 12  PORT_SIZE           391939 non-null  float64
 13  CI_HOUR             391939 non-null  float64
 14  YEAR                391939 non-null  int64  
 15  MONTH               391939 non-nul

### **5. 스케일링**

In [None]:
from sklearn.preprocessing import StandardScaler

scl = StandardScaler()

# Fit the scaler on the train features only (target CI_HOUR excluded) and
# rebuild a DataFrame so the column names survive the transform.
train_features = train.drop(columns='CI_HOUR')
train_scld = pd.DataFrame(scl.fit_transform(train_features), columns=train_features.columns)

# Apply the train-fitted scaler to the test frame.
test_scld = pd.DataFrame(scl.transform(test), columns=test.columns)

## **| 모델 학습**

### **학습/검증 데이터셋 생성**

In [None]:
from sklearn.model_selection import train_test_split

# Scaled features and the unscaled target.
X, Y = train_scld, train["CI_HOUR"]

# Hold out 20% of the rows for validation; the seed is fixed for repeatability.
x_train, x_valid, y_train, y_valid = train_test_split(
  X, Y, test_size=0.2, random_state=42
)

### **Catboost 모델링**

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
from catboost import CatBoostRegressor

# MAE objective; log every 1000 iterations; keep the iteration that scores
# best on the evaluation set.
catboost_params = dict(
  iterations=20000,
  objective='MAE',
  use_best_model=True,
  verbose=1000,
)
cbc = CatBoostRegressor(**catboost_params)
cbc.fit(x_train, y_train, eval_set=(x_valid, y_valid))

0:	learn: 60.8150604	test: 59.7491016	best: 59.7491016 (0)	total: 228ms	remaining: 1h 16m
1000:	learn: 47.0460331	test: 46.1682759	best: 46.1682759 (1000)	total: 1m 46s	remaining: 33m 39s
2000:	learn: 46.3540212	test: 45.5526967	best: 45.5526967 (2000)	total: 3m 16s	remaining: 29m 26s
3000:	learn: 45.4783012	test: 44.7827878	best: 44.7827878 (3000)	total: 4m 50s	remaining: 27m 25s
4000:	learn: 45.4193002	test: 44.7453294	best: 44.7453154 (3952)	total: 6m 21s	remaining: 25m 24s
5000:	learn: 45.4149869	test: 44.7433669	best: 44.7433425 (4981)	total: 7m 51s	remaining: 23m 34s
6000:	learn: 45.4112235	test: 44.7412501	best: 44.7412348 (5985)	total: 9m 22s	remaining: 21m 53s
7000:	learn: 45.4075146	test: 44.7389137	best: 44.7389137 (7000)	total: 10m 53s	remaining: 20m 12s
8000:	learn: 45.4038410	test: 44.7364812	best: 44.7364812 (8000)	total: 12m 23s	remaining: 18m 35s
9000:	learn: 45.4033102	test: 44.7361238	best: 44.7361238 (8979)	total: 13m 56s	remaining: 17m 2s
10000:	learn: 45.4033102	t

<catboost.core.CatBoostRegressor at 0x7ba9a0b872b0>

In [None]:
# Per-feature importance from the fitted model, labelled with the feature
# names and ordered from least to most important.
importances = cbc.get_feature_importance(type='PredictionValuesChange')
feature_importances = pd.Series(data=importances, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=True)

In [None]:
# Display the importance table (sorted ascending when it was built).
feature_importances

MINUTE                 0.047171
DAY                    0.098137
DRAUGHT                0.143943
ATA_LT                 0.189719
HOUR                   0.317135
MONTH                  0.405596
DEPTH                  0.943819
ARI_PO                 2.038877
DIST_MEAN              2.150668
BN                     2.323883
YEAR                   3.811057
DIST_STD               4.189727
DEADWEIGHT             4.577290
WEEKDAY                4.906688
LENGTH                 5.138632
BREADTH                5.215433
SHIP_TYPE_CATEGORY     6.352288
PORT_SIZE              6.492508
GT                    10.321805
ARI_CO                13.189242
DIST                  27.146382
dtype: float64

In [None]:
from sklearn.metrics import mean_absolute_error

# Validation MAE of the fitted model. MAE is symmetric in its two arguments,
# so passing them by keyword in the documented order gives the same value.
pred = cbc.predict(x_valid)
mean_absolute_error(y_true=y_valid, y_pred=pred)

44.73612472241793

In [None]:
# test 데이터셋에 대한 예측
final = cbc.predict(test_scld)

## **| 제출 파일 생성 및 저장**

In [None]:
# Load the submission template relative to the working directory, like the
# train/test reads. The previous absolute Drive path ignored the working
# directory that users are told to adapt, so it broke on any other Drive layout.
submit = pd.read_csv('Data/sample_submission.csv')
# Fill the target column with the test-set predictions.
submit['CI_HOUR'] = final

In [None]:
# Output path relative to the working directory; change the file name per run.
submission_path = 'YOUN/sub_FE1017(6).csv'
submit.to_csv(submission_path, index=False)