In [None]:
# Mount Google Drive at /content/drive; the data files read later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

# Load the training CSV; the file is decoded with the cp949 codec.
total = pd.read_csv('/content/drive/MyDrive/data/DACON_가스공급량_수요예측_모델개발/train.csv', encoding = 'cp949')
# Preview the first rows.
total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,A,2497.129
1,2013-01-01,2,A,2363.265
2,2013-01-01,3,A,2258.505
3,2013-01-01,4,A,2243.969
4,2013-01-01,5,A,2344.105


In [None]:
# Show the distinct values of the '구분' column before encoding them.
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [None]:
# Label encoding: give each distinct '구분' value an integer code, numbered in
# order of first appearance. d_map is reused later to encode the test set.
d_map = {label: code for code, label in enumerate(total['구분'].unique())}

total['구분'] = total['구분'].map(d_map)

total.head()

Unnamed: 0,연월일,시간,구분,공급량
0,2013-01-01,1,0,2497.129
1,2013-01-01,2,0,2363.265
2,2013-01-01,3,0,2258.505
3,2013-01-01,4,0,2243.969
4,2013-01-01,5,0,2344.105


In [None]:
# Parse the date column, then derive one integer column per calendar part.
total['연월일'] = pd.to_datetime(total['연월일'])

for part in ('year', 'month', 'day', 'weekday'):
  total[part] = getattr(total['연월일'].dt, part)

In [None]:
# Split by calendar year: 2013-2017 for training, 2018 for validation.
train_years = list(range(2013, 2018))
val_years = [2018]

is_train = total['year'].isin(train_years)
is_val = total['year'].isin(val_years)

train = total.loc[is_train]
val = total.loc[is_val]

In [None]:
# Model inputs (note that 'year' is not among them); the target is '공급량'.
features = ['구분', 'month', 'day', 'weekday', '시간']

train_x, train_y = train[features], train['공급량']
val_x, val_y = val[features], val['공급량']

In [None]:
# Wrap the train/validation splits in LightGBM datasets.
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

# L2 regression objective, evaluated with MAE on the validation set.
params = {
    'objective': 'regression',
    'metric': 'mae',
    'seed': 42
}

# Train for up to 500 rounds, stopping once validation MAE has not improved
# for 10 rounds and logging the metric every 20 rounds.
# LightGBM 4.0 removed the `early_stopping_rounds=` and `verbose_eval=`
# keyword arguments of lgb.train (passing them raises TypeError); the
# callbacks below are the supported equivalent (requires LightGBM >= 3.3).
model = lgb.train(
    params,
    d_train,
    num_boost_round=500,
    valid_sets=[d_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=20),
    ],
)

Training until validation scores don't improve for 10 rounds.
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


In [None]:
# Model predictions for the validation rows.
model.predict(val_x)

array([1692.95022318, 1552.98266028, 1517.79609818, ...,  545.14642348,
        502.77442403,  426.92784381])

In [None]:
# Actual validation targets, for comparison with the predictions.
val_y

306768    1765.008
306769    1679.186
306770    1610.885
306771    1604.123
306772    1711.506
            ...   
368083     681.033
368084     669.961
368085     657.941
368086     610.953
368087     560.896
Name: 공급량, Length: 61320, dtype: float64

In [None]:
from sklearn.metrics import mean_squared_error

# Validation RMSE (square root of the MSE). Arguments are given in
# scikit-learn's documented (y_true, y_pred) order; MSE is symmetric, so the
# value is the same as before, but the order matters for asymmetric metrics.
mean_squared_error(val_y, model.predict(val_x)) ** 0.5

261.63653886371725

In [None]:
# Load the test rows and the sample submission file (default encoding).
test = pd.read_csv('/content/drive/MyDrive/data/DACON_가스공급량_수요예측_모델개발/test.csv')
sub = pd.read_csv('/content/drive/MyDrive/data/DACON_가스공급량_수요예측_모델개발/sample_submission.csv')

In [None]:
# Preview the test frame: a single composite '일자|시간|구분' column.
test.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [None]:
# The composite key is space-separated as "<date> <hour> <category>".
# Split it once and pick the three tokens out of the resulting lists.
parts = test['일자|시간|구분'].str.split(' ')

test['일자'] = parts.str[0]
test['시간'] = parts.str[1].astype(int)
test['구분'] = parts.str[2]

In [None]:
# Parse the date and derive the same calendar columns built for training.
test['일자'] = pd.to_datetime(test['일자'])
for part in ('year', 'month', 'day', 'weekday'):
  test[part] = getattr(test['일자'].dt, part)

# Encode '구분' with the mapping built from the training data.
test['구분'] = test['구분'].map(d_map)

# Select the model's feature columns and preview them.
test_x = test[features]
test_x.head()

Unnamed: 0,구분,month,day,weekday,시간
0,0,1,1,1,1
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5


In [None]:
# Predict the target for every test row.
preds = model.predict(test_x)

In [None]:
# Write the predictions into the submission frame's '공급량' column.
# NOTE(review): assumes sub rows are in the same order as test rows — confirm.
sub['공급량'] = preds

In [None]:
# Save the submission CSV to Drive without the index column.
sub.to_csv('/content/drive/MyDrive/data/DACON_가스공급량_수요예측_모델개발/sub_baseline.csv', index = False)