In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the joined order / product / customer / review table and preview it.
df = pd.read_csv(
    filepath_or_buffer="/content/ORDER_PROD_CUS_REV.csv",
    encoding="utf-8",
)
df.head()

In [None]:
# Identifier columns are removed; the later cells never read them.
df = df.drop(
    ['order_item_id', 'p_product_id', 'customer_id', 'review_id'],
    axis='columns',
)
df.head()

In [None]:
# Data preprocessing: parse the purchase timestamp, then split it into
# calendar parts. Later keyword arguments of `assign` see the columns
# produced by earlier ones, so the parsed timestamp is reused below.
df = df.assign(
    order_purchase_timestamp=lambda d: pd.to_datetime(d['order_purchase_timestamp']),
    order_year=lambda d: d['order_purchase_timestamp'].dt.year,
    order_month=lambda d: d['order_purchase_timestamp'].dt.month,
    order_day=lambda d: d['order_purchase_timestamp'].dt.day,
    order_hour=lambda d: d['order_purchase_timestamp'].dt.hour,
)


In [None]:
# Add season information as a column.
# Spring: 9, 10, 11; Summer: 12, 1, 2; Autumn: 3, 4, 5; Winter: 6, 7, 8
# (i.e. the Southern-Hemisphere assignment of months to seasons).
def get_season(month):
    """Return the season name for a month number.

    Months 12/1/2 map to 'Summer', 3/4/5 to 'Autumn', 6/7/8 to 'Winter';
    every other value (including 9, 10, 11) maps to 'Spring'.
    """
    season_table = (
        ('Summer', (12, 1, 2)),
        ('Autumn', (3, 4, 5)),
        ('Winter', (6, 7, 8)),
    )
    for season_name, season_months in season_table:
        if month in season_months:
            return season_name
    return 'Spring'

# Element-wise lookup of the season name for each order month.
df['season'] = df['order_month'].map(get_season)
df.head()

In [None]:
# Calendar date (time of day dropped) and day of week (0 = Monday ... 6 = Sunday).
df = df.assign(
    order_date=lambda d: d['order_purchase_timestamp'].dt.date,
    order_day_of_week=lambda d: d['order_purchase_timestamp'].dt.weekday,
)
df.head()

In [None]:
# 0: weekday, 1: weekend (day-of-week 5 = Saturday, 6 = Sunday)
df['is_weekend'] = (df['order_purchase_timestamp'].dt.dayofweek >= 5).astype('int64')
df.head()

In [None]:
# Columns needed to build the daily demand table.
practice_selected = ['order_date', 'product_category_name', 'season', 'is_weekend']

# Take an explicit copy: the next cell assigns into `practice_df`, and writing
# to a frame that pandas still tracks as a slice of `df` is a chained
# assignment (SettingWithCopyWarning, hidden here by the global warnings filter).
practice_df = df[practice_selected].copy()
practice_df.head()

In [None]:
# `order_date` currently holds date objects; convert it to datetime64.
practice_df['order_date'] = pd.to_datetime(practice_df['order_date'])

# Demand = number of rows per (date, category, season, weekend flag).
demand_df = (
    practice_df
    .groupby(['order_date', 'product_category_name', 'season', 'is_weekend'])
    .size()
    .rename('demand')
    .reset_index()
)

demand_df.head(10)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Hold-out window used for evaluation (both bounds inclusive).
test_start_date = '2019-07-01'
test_end_date = '2019-07-31'

# `.copy()` makes the hold-out frame independent of `demand_df`.
test_data = demand_df[
    (demand_df['order_date'] >= test_start_date) & (demand_df['order_date'] <= test_end_date)
].copy()

# Training data: drop July 2019 by keeping only rows dated up to train_end_date.
# The explicit copy matters because the `season` column of this frame is
# overwritten further down in this cell; without it that write is a chained
# assignment on a slice of `demand_df`.
train_end_date = '2019-06-30'
demand_df_filtered = demand_df[demand_df['order_date'] <= train_end_date].copy()

def map_season(season):
    """Encode a season name as an integer.

    'Spring' -> 1, 'Summer' -> 2, 'Autumn' -> 3, 'Winter' -> 4.
    Any other value falls back to 0.
    """
    ordered_seasons = ('Spring', 'Summer', 'Autumn', 'Winter')
    for code, name in enumerate(ordered_seasons, start=1):
        if season == name:
            return code
    return 0  # fallback for unrecognised values

# Replace the season names in the training frame with their integer codes.
demand_df_filtered['season'] = demand_df_filtered['season'].map(map_season)

def is_weekend(date):
    """Return 1 when `date` falls on Saturday or Sunday, otherwise 0.

    `date` must provide `weekday()` with 0 = Monday ... 6 = Sunday.
    """
    # Saturday (5) and Sunday (6) count as the weekend.
    return int(date.weekday() >= 5)

# One group of training rows per product category.
category_groups = demand_df_filtered.groupby(by='product_category_name')

# Starts empty; the forecasting cell adds the per-category predictions to it.
results = pd.DataFrame()

In [None]:
# One independent [0, 1] min-max scaler per model input column.
scalers = {
    column: MinMaxScaler(feature_range=(0, 1))
    for column in ('y', 'season', 'is_weekend')
}

In [None]:
# Time-step setting: how many of the most recent rows form one input window
# (the period is counted back from the latest date).
time_step = 365

class DemandDataset(Dataset):
    """Dataset that pairs each input window in `X` with its target in `y`.

    Both containers are stored as given and indexed in lockstep; the
    dataset length is the length of `X`.
    """

    def __init__(self, X, y):
        """Keep references to the inputs `X` and the targets `y`."""
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from `X`."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return the `(input, target)` pair stored at position `idx`."""
        window = self.X[idx]
        target = self.y[idx]
        return window, target

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, steps, features) tensor to a (batch, 1) prediction."""
        # nn.LSTM initialises the hidden and cell states to zeros when none are
        # passed, so the model no longer has to read sizes or the device from
        # notebook globals.
        out, _ = self.lstm(x)
        # Only the output of the last time step feeds the linear head.
        return self.fc(out[:, -1, :])


# Hyper-parameters shared by every per-category model.
hidden_size = 50
num_layers = 2
num_epochs = 10
learning_rate = 0.001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Column order of the model input. Training windows AND forecast rows must both
# follow it, otherwise values end up in the wrong feature slot when stacked.
feature_cols = ['y', 'season', 'is_weekend']
y_col = feature_cols.index('y')

# Only categories that occur in the hold-out data are forecast.
test_categories = set(test_data['product_category_name'].unique())

# Forecast horizon: one row per day of the hold-out window.
future_dates = pd.date_range(start=test_start_date, end=test_end_date, freq='D')

# Per-category forecasts are collected here and concatenated once at the end.
forecast_frames = []

for category, group in tqdm(category_groups, desc="Processing categories"):
    if category not in test_categories:
        continue

    group = group.rename(columns={'order_date': 'ds', 'demand': 'y'})
    group['is_weekend'] = group['ds'].apply(is_weekend)

    features = group[feature_cols]

    # Each scaler is (re)fitted on the current category only; the fitted state
    # is reused below for the forecast rows and for inverse-scaling.
    features_scaled = features.copy()
    for col in feature_cols:
        features_scaled[col] = scalers[col].fit_transform(features[[col]]).ravel()

    values = features_scaled.to_numpy(dtype=np.float32)
    n_windows = len(values) - time_step

    if n_windows <= 0 or time_step <= 0:
        print(f"Skipping category {category} due to insufficient data.")
        continue

    # Sliding windows: rows [i, i + time_step) are the input, the scaled demand
    # of row i + time_step is the target.
    X = np.stack([values[i:i + time_step] for i in range(n_windows)])
    y = values[time_step:, y_col].reshape(-1, 1)

    dataloader = DataLoader(DemandDataset(X, y), batch_size=32, shuffle=True)

    model = LSTMModel(X.shape[2], hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            inputs = inputs.to(device).float()
            labels = labels.to(device).float()

            loss = criterion(model(inputs), labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    future = pd.DataFrame({'ds': future_dates})
    # Season code derived from each date with the notebook's own helpers
    # (July -> 'Winter' -> 4, the value that used to be hard-coded here).
    future['season'] = future['ds'].dt.month.map(get_season).map(map_season)
    future['is_weekend'] = future['ds'].apply(is_weekend)

    # Future demand is unknown, so y is filled with a 0 placeholder.
    # The columns are then put in the SAME order as the training features.
    future_features = future.assign(y=0)[feature_cols].copy()
    for col in feature_cols:
        future_features[col] = scalers[col].transform(future_features[[col]]).ravel()

    # Build one input window per forecast day from the tail of the training
    # data plus the forecast rows up to and including that day.
    # NOTE(review): these windows end ON the predicted day and carry y = 0
    # placeholders, whereas training windows end on the row BEFORE the target.
    # Confirm this is intended, or switch to recursive (autoregressive) forecasting.
    last_train_data = values[-time_step:]
    future_values = future_features.to_numpy(dtype=np.float32)
    future_X = np.stack([
        np.vstack([last_train_data, future_values[:i + 1]])[-time_step:]
        for i in range(len(future_values))
    ])

    model.eval()
    with torch.no_grad():
        future_X_tensor = torch.tensor(future_X).to(device).float()
        scaled_predictions = model(future_X_tensor).cpu().numpy()

    # Back to demand units; flatten (n, 1) -> (n,) for a clean column.
    future_predictions = scalers['y'].inverse_transform(scaled_predictions).ravel()

    forecast_frames.append(pd.DataFrame({
        'order_date': future['ds'],
        'product_category_name': category,
        'demand': future_predictions,
    }))

# A single concat avoids quadratic re-copying and keeps the cell idempotent:
# re-running it rebuilds `results` instead of appending duplicates.
if forecast_frames:
    results = pd.concat(forecast_frames)

In [None]:
# Show the last five forecast rows.
results.tail(n=5)

In [None]:
# Build every (date, category) combination for the hold-out window.
all_dates = pd.date_range(test_start_date, test_end_date, freq='D')
all_categories = test_data['product_category_name'].unique()
all_combinations = pd.MultiIndex.from_product(
    [all_dates, all_categories],
    names=['order_date', 'product_category_name'],
).to_frame(index=False)

# Left-join both frames onto the full grid so that a missing (date, category)
# pair shows up as NaN instead of being absent.
results = all_combinations.merge(
    results, how='left', on=['order_date', 'product_category_name']
)
test_data = all_combinations.merge(
    test_data, how='left', on=['order_date', 'product_category_name']
)


# Both frames have a `demand` column; after the join the one from `results`
# becomes `demand_x` and the one from `test_data` becomes `demand_y`.
merged_results = results.merge(
    test_data, how='inner', on=['order_date', 'product_category_name']
)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score, mean_squared_error

# `merged_results` was built as merge(results, test_data), so `demand_x` is the
# model forecast and `demand_y` is the observed demand. Missing values on
# either side are treated as 0.
y_pred = merged_results['demand_x'].fillna(0)
y_true = merged_results['demand_y'].fillna(0)

# scikit-learn metrics take (y_true, y_pred). MAE and RMSE are symmetric, but
# R^2 is not, so the argument order matters.
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
# The `squared=False` keyword of mean_squared_error is deprecated and removed in
# recent scikit-learn releases; taking the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")