In [None]:
# Mount Google Drive at /content/drive so the files under DATA_PATH are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs; turn it off so the
    # deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False

# Google Drive folder the notebook reads its CSV inputs from.
DATA_PATH = "/content/drive/MyDrive/코랩파일들/제주 특산물 가격예측 AI/data/"
# Seed handed to reset_seeds().
SEED = 42

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==0.8.2 (from autogluon)
  Downloading autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.7/285.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.8.2 (from autogluon)
  Downloading autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [None]:
# Seed all RNGs before any data loading or model training.
reset_seeds(SEED)

In [None]:
# Load the training and test tables from DATA_PATH.
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test.csv')

In [None]:
# To let the model use calendar information, split each timestamp into
# year, month and day columns.
# Parsing with pd.to_datetime (instead of slicing the raw string) keeps this
# cell re-runnable after 'timestamp' has been converted to datetime64, and one
# loop keeps train and test in sync.
for frame in (train_df, test_df):
    parsed = pd.to_datetime(frame['timestamp'])
    frame['year'] = parsed.dt.year.astype('int64')
    frame['month'] = parsed.dt.month.astype('int64')
    frame['day'] = parsed.dt.day.astype('int64')

In [None]:
# Parse the training timestamps once, then store both the datetime column and
# the derived day-of-week (Monday=0 ... Sunday=6).
date = pd.to_datetime(train_df['timestamp'])
train_df = train_df.assign(weekday=date.dt.weekday, timestamp=date)

In [None]:
# Same conversion for the test set: datetime timestamps plus day-of-week
# (Monday=0 ... Sunday=6).
date = pd.to_datetime(test_df['timestamp'])
test_df = test_df.assign(weekday=date.dt.weekday, timestamp=date)

In [None]:
# NOTE: despite its name, 'holiday' is 0 on Sundays (weekday == 6) and 1 on
# every other day. Computed as a vectorized comparison rather than a row-wise
# apply, which gives the same values without a Python call per row.
train_df['holiday'] = (train_df['weekday'] != 6).astype('int64')
test_df['holiday'] = (test_df['weekday'] != 6).astype('int64')

In [None]:
# Chuseok dates per year, keyed by (year, month) -> days of that month.
# These rows get holiday = 0, the same value Sundays carry.
chuseok_days = {
    (2022, 9): [10, 11, 12],
    (2021, 9): [20, 21, 22, 23],
    (2020, 10): [1, 2, 3],
    (2019, 9): [13, 14],
}
for (yr, mon), days in chuseok_days.items():
    on_chuseok = (
        (train_df['year'] == yr)
        & (train_df['month'] == mon)
        & train_df['day'].isin(days)
    )
    train_df.loc[on_chuseok, 'holiday'] = 0


In [None]:
# Additional January/February dates flagged with holiday = 0.
# NOTE(review): these look like Lunar New Year (Seollal) periods - confirm.
winter_days = [
    (2023, 1, [23, 24]),
    (2022, 1, [31]),
    (2022, 2, [1, 2, 3]),
    (2021, 2, [12, 13]),
    (2020, 1, [25, 27]),
    (2019, 2, [5, 6, 7]),
]
for yr, mon, days in winter_days:
    flagged = (
        (train_df['year'] == yr)
        & (train_df['month'] == mon)
        & train_df['day'].isin(days)
    )
    train_df.loc[flagged, 'holiday'] = 0

In [None]:
# January 1st and 2nd of every year also get holiday = 0.
early_january = (train_df['month'] == 1) & train_df['day'].isin([1, 2])
train_df.loc[early_january, 'holiday'] = 0

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Count the (item, corporation, location) combinations, built from the values
# present in train_df, that have at least one row in test_df.
test_combos = set(zip(test_df['item'], test_df['corporation'], test_df['location']))
cnt = sum(
    (f'{i}', f'{c}', f'{l}') in test_combos
    for i in train_df['item'].unique()
    for c in train_df['corporation'].unique()
    for l in train_df['location'].unique()
)
print(cnt)

39


In [None]:
# AutoGluon time-series API used for training and forecasting below.
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): in the cells visible here, `scaler` is only referenced from
# commented-out code - confirm whether it is still needed.
scaler = MinMaxScaler()

  warn(f"Failed to load image Python extension: {e}")


In [None]:
# Series key = ID without its trailing "_<date>" part (e.g. "TG_A_J").
# Splitting at the last underscore, rather than taking the first six
# characters, keeps this correct even if an item code is not two characters.
train_df['item_id'] = train_df['ID'].str.rsplit('_', n=1).str[0]

In [None]:
# test_tmp = test_df[(test_df['item'] == f'TG')&(test_df['corporation'] == f'A')&(test_df['location'] == f'J')]

In [None]:
# len(test_tmp[test_tmp['holiday'] != 0])

In [None]:
# Keep only the series key, the timestamp and the price, expose the price as
# 'target', and wrap the result in AutoGluon's time-series container.
tmp = train_df
tmp2 = tmp[['item_id', 'timestamp', 'price(원/kg)']].rename(
    columns={'price(원/kg)': 'target'}
)
train = TimeSeriesDataFrame(tmp2)

In [None]:
# First model: forecast 28 steps ahead, scored by RMSE, trained with three
# validation windows. fit() is chained, so `predictor` is whatever fit() returns.
forecaster_config = dict(prediction_length=28, target="target", eval_metric="RMSE")
predictor = TimeSeriesPredictor(**forecaster_config).fit(
    train,
    num_val_windows=3,
    random_seed=42,
)

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 3,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'target',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
INFO:lightning_fabric.utilities.seed:Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231117_072844/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'target'

Starting training. Start time is 2023-11-17 07:28:44
Models that will be trained: ['Naive', 'Seas

In [None]:
# Refit the trained models on all of the data (train + validation combined).
# The refit models get the '_FULL' suffix and become the default for prediction.
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	4.58    s     = Training runtime
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'WeightedEnsemble_FULL']
Total runtime: 4.61 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'WeightedEnsemble'). AutoGluon will default to using 'WeightedEnsemble_FULL' for p

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [None]:
# predictor.load('/content/AutogluonModels/ag-20231117_071503')

Loading predictor from path /content/AutogluonModels/ag-20231117_071503/


<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7ccb2b9abfd0>

In [None]:
# Forecast from the full training history with the first predictor. The result
# is indexed by (item_id, timestamp) and carries a 'mean' column plus
# quantile columns 0.1 ... 0.9.
pred = predictor.predict(train,random_seed=42)
pred

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3167.042284,220.308693,1192.342746,1897.169100,2589.705110,3204.105273,3833.021330,4467.786591,5199.592912,6293.366505
TG_A_J,2023-03-05,826.020498,-3003.145623,-1698.620439,-762.637161,6.554368,702.886502,1478.016250,2271.374061,3190.748624,4436.211358
TG_A_J,2023-03-06,3134.168862,-1416.567340,190.418022,1297.594232,2249.986242,3101.471358,3922.010131,4865.518627,5946.103074,7455.330069
TG_A_J,2023-03-07,3437.529443,-1564.179974,147.585075,1338.315209,2370.314428,3315.926578,4314.251413,5333.870446,6499.714916,8208.660192
TG_A_J,2023-03-08,3323.272899,-1925.108597,-147.784228,1177.340610,2275.588508,3292.780636,4307.138732,5450.314874,6838.789546,8703.260525
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,526.979213,-302.871661,-23.675675,176.278863,356.705732,524.102231,678.044346,850.770926,1055.516306,1353.855917
RD_F_J,2023-03-28,533.608739,-313.011538,-22.577012,185.798565,361.649113,536.879700,710.548945,881.481389,1082.131787,1369.039205
RD_F_J,2023-03-29,535.264083,-346.099823,-54.462343,173.808724,352.311502,526.610484,688.830796,877.886436,1101.044231,1386.909474
RD_F_J,2023-03-30,512.756577,-359.204850,-59.257339,153.112531,337.316024,503.490191,671.452134,855.384408,1075.641642,1370.869551


In [None]:
# Second model: identical configuration except for four validation windows.
# NOTE: this rebinds `predictor`; the first model's forecasts live on only in `pred`.
second_config = dict(prediction_length=28, target="target", eval_metric="RMSE")
predictor = TimeSeriesPredictor(**second_config).fit(
    train,
    num_val_windows=4,
    random_seed=42,
)

# Refit on all of the data before forecasting, as was done for the first model.
predictor.refit_full()

pred2 = predictor.predict(train, random_seed=42)

In [None]:
# Load the submission template: one row per ID with a placeholder 'answer' column.
submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [None]:
# Average the two models' mean forecasts. Adding the Series aligns them on
# their shared (item_id, timestamp) index rather than on row position.
ensemble_mean = ((pred['mean'] + pred2['mean']) / 2).rename('answer').reset_index()

# Rebuild the submission key ("<item_id>_<YYYYMMDD>") so each forecast is matched
# to its submission row by ID instead of assuming both frames share row order.
ensemble_mean['ID'] = (
    ensemble_mean['item_id'].astype(str)
    + '_'
    + pd.to_datetime(ensemble_mean['timestamp']).dt.strftime('%Y%m%d')
)
submission['answer'] = submission['ID'].map(ensemble_mean.set_index('ID')['answer'])

# Fail loudly if any submission row did not receive a forecast.
assert submission['answer'].notna().all(), "some submission IDs have no forecast"
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3206.337570
1,TG_A_J_20230305,812.234220
2,TG_A_J_20230306,3173.686550
3,TG_A_J_20230307,3534.862396
4,TG_A_J_20230308,3391.974334
...,...,...
1087,RD_F_J_20230327,554.415691
1088,RD_F_J_20230328,567.039689
1089,RD_F_J_20230329,576.506261
1090,RD_F_J_20230330,555.541773


In [None]:
# Rows whose test-set 'holiday' flag is 0 get a predicted price of 0.
# Rows are matched through test_df's index labels, so this assumes test_df and
# submission share the same index.
holiday_rows = test_df.index[test_df['holiday'] == 0]
submission.loc[holiday_rows, 'answer'] = 0
submission.loc[holiday_rows]

Unnamed: 0,ID,answer
1,TG_A_J_20230305,0.0
8,TG_A_J_20230312,0.0
15,TG_A_J_20230319,0.0
22,TG_A_J_20230326,0.0
29,TG_A_S_20230305,0.0
...,...,...
1058,CB_F_J_20230326,0.0
1065,RD_F_J_20230305,0.0
1072,RD_F_J_20230312,0.0
1079,RD_F_J_20230319,0.0


In [None]:
# Sanity check: list any rows whose predicted price is negative.
submission[submission['answer'] < 0]

Unnamed: 0,ID,answer


In [None]:
# Raise any negative prediction to zero, then confirm that none remain.
submission['answer'] = submission['answer'].clip(lower=0)
submission[submission['answer'] < 0]

Unnamed: 0,ID,answer


In [None]:
# Write the final submission to the working directory, without the index column.
submission.to_csv('autogluon.csv', index=False)