# Baseline 기반 제주 특산물 가격 예측

## Import

In [None]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore') 


In [19]:
!pip install lightgbm
!pip install xgboost

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 6.0 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.1.0
Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 5.8 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2


In [33]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module, stores the seed as text in the
    ``PYTHONHASHSEED`` environment variable, and seeds NumPy's global
    random state.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    seed_text = str(seed)
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': seed_text})
    np.random.seed(seed)

seed_everything(42) # Fix the seed

## Load Data

In [3]:
# Read the training and test tables from CSV files in the working directory.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

## Data Pre-Processing

In [4]:
# Split the timestamp string (e.g. '2019-01-01') into integer year, month
# and day columns so the models can learn from the calendar position.
date_parts = {'year': slice(0, 4), 'month': slice(5, 7), 'day': slice(8, 10)}

for frame in (train_df, test_df):
    for part_name, chars in date_parts.items():
        # Bind the slice as a default argument so each lambda keeps its own span.
        frame[part_name] = frame['timestamp'].apply(lambda stamp, chars=chars: int(stamp[chars]))

In [5]:
# Preview the first rows, including the newly added year/month/day columns.
train_df.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),year,month,day
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,5


In [9]:
# The price column is the regression target.
train_y = train_df['price(원/kg)']

# Drop the columns that are not used as model inputs.
train_x = train_df.drop(['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'], axis=1)
test_x = test_df.drop(['ID', 'timestamp'], axis=1)

In [10]:
# Convert the categorical columns to integer codes.
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    # Learn the label -> code mapping from the training split only;
    # fitting on the test split would be data leakage.
    le = LabelEncoder().fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    test_x[i] = le.transform(test_x[i])

print('Done.')

Done.


In [16]:
# Display the encoded feature table that is passed to the models.
train_x

Unnamed: 0,item,corporation,location,year,month,day
0,4,0,0,2019,1,1
1,4,0,0,2019,1,2
2,4,0,0,2019,1,3
3,4,0,0,2019,1,4
4,4,0,0,2019,1,5
...,...,...,...,...,...,...
59392,3,5,0,2023,2,27
59393,3,5,0,2023,2,28
59394,3,5,0,2023,3,1
59395,3,5,0,2023,3,2


## Regression Model Fit

In [11]:
# Fit a random forest regressor, constructed with no arguments, on the training features.
model = RandomForestRegressor()
model.fit(train_x, train_y)

In [21]:
# Fit an XGBoost regressor, constructed with no arguments, on the same data.
model2 = XGBRegressor()
model2.fit(train_x, train_y)

In [29]:
# Fit a LightGBM regressor, constructed with no arguments, on the same data.
model3 = LGBMRegressor()
model3.fit(train_x, train_y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001312 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 59397, number of used features: 6
[LightGBM] [Info] Start training from score 1131.680674


In [34]:
# Fit an AdaBoost regressor, constructed with no arguments, on the same data.
model4 = AdaBoostRegressor()
model4.fit(train_x, train_y)

In [37]:
# Fit a support vector regressor, constructed with no arguments, on the same data.
# NOTE(review): the recorded predictions of this model are all around 0.13-0.14
# while the other models predict values in the hundreds to thousands --
# presumably SVR needs scaled features/target here; verify before using it.
model5 = SVR()
model5.fit(train_x, train_y)

## Inference

In [30]:
# Predict test-set prices with the LightGBM model and display them.
pred3 = model3.predict(test_x)
pred3

array([3367.8479935 , 3284.67876764, 3284.67876764, ...,  461.65788988,
        461.65788988,  461.65788988])

In [38]:
# Predict test-set prices with the SVR model and display them.
pred5 = model5.predict(test_x)
pred5

array([0.14044249, 0.14047509, 0.14050766, ..., 0.1307027 , 0.13073488,
       0.13076705])

In [36]:
# Predict test-set prices with the AdaBoost model and display them.
pred4 = model4.predict(test_x)
pred4

array([3119.84626883, 3119.84626883, 3119.84626883, ...,  688.16417244,
        688.16417244,  688.16417244])

In [23]:
# Predict test-set prices with the XGBoost model and display them.
pred2 = model2.predict(test_x)
pred2

array([3327.9978 , 2981.7285 , 2168.1333 , ..., -135.34961,  662.40485,
        940.6251 ], dtype=float32)

In [24]:
# Predict test-set prices with the random forest model and display them.
preds = model.predict(test_x)
preds

array([3636.64, 4109.28,  401.82, ...,  439.31,  436.  ,  425.91])

## Submission

In [14]:
# Load the submission template (ID and answer columns) and display it.
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [26]:
# Fill the answer column with the random forest predictions.
submission['answer'] = preds
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3636.64
1,TG_A_J_20230305,4109.28
2,TG_A_J_20230306,401.82
3,TG_A_J_20230307,3277.93
4,TG_A_J_20230308,3067.03
...,...,...
1087,RD_F_J_20230327,271.27
1088,RD_F_J_20230328,441.28
1089,RD_F_J_20230329,439.31
1090,RD_F_J_20230330,436.00


In [27]:
# Overwrite the answer column with the XGBoost predictions.
# NOTE(review): the recorded output contains negative values; the AutoGluon
# section clamps negatives to 0.0 before saving -- confirm whether the same
# clamp is wanted here.
submission['answer'] = pred2
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3327.997803
1,TG_A_J_20230305,2981.728516
2,TG_A_J_20230306,2168.133301
3,TG_A_J_20230307,3015.339600
4,TG_A_J_20230308,3012.901611
...,...,...
1087,RD_F_J_20230327,-282.096405
1088,RD_F_J_20230328,499.539368
1089,RD_F_J_20230329,-135.349609
1090,RD_F_J_20230330,662.404846


In [31]:
# Overwrite the answer column with the LightGBM predictions.
submission['answer'] = pred3
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3367.847993
1,TG_A_J_20230305,3284.678768
2,TG_A_J_20230306,3284.678768
3,TG_A_J_20230307,3304.342663
4,TG_A_J_20230308,3318.958448
...,...,...
1087,RD_F_J_20230327,461.657890
1088,RD_F_J_20230328,461.657890
1089,RD_F_J_20230329,461.657890
1090,RD_F_J_20230330,461.657890


In [39]:
# Overwrite the answer column with the SVR predictions.
# NOTE(review): when the cells run top to bottom this is the last assignment
# before the CSV is written -- confirm the SVR output is the intended submission.
submission['answer'] = pred5
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0.140442
1,TG_A_J_20230305,0.140475
2,TG_A_J_20230306,0.140508
3,TG_A_J_20230307,0.140540
4,TG_A_J_20230308,0.140573
...,...,...
1087,RD_F_J_20230327,0.130638
1088,RD_F_J_20230328,0.130671
1089,RD_F_J_20230329,0.130703
1090,RD_F_J_20230330,0.130735


In [40]:
# Write the current submission frame to CSV without the index column.
submission.to_csv('./baseline_submission4.csv', index=False)

# 

# AutoGluon을 이용한 예측 (basslibrary님의 코드 참고)

In [None]:
import pandas as pd
import numpy as np
# TimeSeriesDataFrame wraps the long-format series; TimeSeriesPredictor is the
# class instantiated for training and forecasting in the cells below.
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [None]:
# Reload the training table, keeping only the ID, timestamp, supply and price columns.
train_df = pd.read_csv('./train.csv')[['ID','timestamp','supply(kg)', 'price(원/kg)']]
# test_df = pd.read_csv('test.csv')[['ID','timestamp']]
train_df.head()

In [None]:
# Load the full training table (all columns) under a separate name.
train2 = pd.read_csv('./train.csv')

In [None]:
# Use the first six characters of ID as the series identifier
# (e.g. 'TG_A_J' from 'TG_A_J_20190101').
train_df['item_id'] = train_df.ID.str[0:6]
# test_df['item_id'] = test_df.ID.str[0:6]
train_df.head()

In [None]:
# Build the time-series frame from every column except ID.
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
# Forecast the price column 28 steps ahead, evaluated with RMSE.
predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)
# Fix the random seed
predictor.fit( data, random_seed=42, )

In [None]:
# Preview the first rows of the time-series frame.
data.head()

In [None]:
# Call AutoGluon's refit_full() on the fitted predictor.
# NOTE(review): presumably this retrains the models on all data without a
# validation split -- confirm against the AutoGluon documentation.
predictor.refit_full()

In [None]:
# Fix the random seed
# Forecast from the training series and display the result.
pred = predictor.predict(data, random_seed=42, )
pred

In [None]:
# Load the submission template; its ID column defines which rows are required.
submission = pd.read_csv('./sample_submission.csv')

# Align forecasts with the template by ID instead of by row position, so a
# different item ordering in `pred` cannot silently shift values onto the
# wrong rows. IDs have the form '<item_id>_<YYYYMMDD>' (e.g. 'TG_A_J_20230304').
forecast = pred.reset_index()
forecast_id = forecast['item_id'] + '_' + pd.to_datetime(forecast['timestamp']).dt.strftime('%Y%m%d')
mean_by_id = pd.Series(forecast['mean'].to_numpy(), index=forecast_id)
submission['answer'] = submission['ID'].map(mean_by_id)

# Fail loudly rather than writing a file with missing answers.
if submission['answer'].isna().any():
    missing_ids = submission.loc[submission['answer'].isna(), 'ID'].tolist()
    raise ValueError(f'No forecast found for {len(missing_ids)} submission IDs, e.g. {missing_ids[:5]}')

# Clamp negative forecasts to zero before saving.
submission['answer'] = submission['answer'].clip(lower=0.0)
submission.to_csv('./submission5.csv', index=False)
submission