# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from itertools import product
from scipy.stats.mstats import winsorize
from joblib import Parallel, delayed, dump, load  # Import joblib, dump, load
import time
from tqdm import tqdm
import traceback #import traceback

In [2]:
import pmdarima as pm  # Import pmdarima

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## --- Functions ---

### 1. Load Data Harga Pangan (Train, Test, Sample Submission)

In [None]:
def load_and_process_price_data(folder_path):
    """Load every price CSV in ``folder_path`` into one DataFrame.

    Each ``*.csv`` file is read with ``,`` as thousands separator, its
    ``Date`` column is parsed to datetime, and an ``item`` column is added
    holding the file name without ``.csv`` and with spaces replaced by
    underscores.

    Files are processed in sorted name order so that the row order of the
    result does not depend on the order in which the file system happens to
    list the directory.

    Args:
        folder_path: Directory containing the per-item CSV files.

    Returns:
        The concatenation of all files, with a fresh integer index.

    Raises:
        ValueError: If the folder contains no ``.csv`` file (raised by
            ``pd.concat`` on an empty list).
    """
    frames = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        frame = pd.read_csv(os.path.join(folder_path, filename), thousands=',', decimal='.')
        frame['Date'] = pd.to_datetime(frame['Date'])
        frame['item'] = filename.replace(".csv", "").replace(" ", "_")
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

### 2. Load Data Google Trends

In [None]:
def load_google_trends(main_folder):
    """Read Google Trends CSVs laid out as ``<main_folder>/<item>/<province>.csv``.

    For every sub-directory an entry is created in the result, keyed by the
    directory name with spaces replaced by underscores. Each CSV inside it
    that has a ``Date`` column contributes one DataFrame in which:

    * ``Date`` is replaced by a parsed ``date`` column (appended at the end),
    * the first remaining column is renamed to the item key,
    * a ``provinsi`` column holds the file name without ``.csv`` and with
      spaces replaced by underscores.

    CSVs without a ``Date`` column are reported with a warning and skipped.

    Args:
        main_folder: Root directory containing one folder per item.

    Returns:
        dict mapping item key -> list of per-province DataFrames (possibly
        empty when no usable CSV was found for the item).
    """
    trends_by_item = {}
    for folder_name in os.listdir(main_folder):
        item_dir = os.path.join(main_folder, folder_name)
        if not os.path.isdir(item_dir):
            continue

        item_key = folder_name.replace(" ", "_")
        frames = trends_by_item[item_key] = []

        for csv_name in os.listdir(item_dir):
            if not csv_name.endswith(".csv"):
                continue

            csv_path = os.path.join(item_dir, csv_name)
            frame = pd.read_csv(csv_path, thousands=',', decimal='.')
            if 'Date' not in frame.columns:
                print(f"Warning: File {csv_path} missing 'Date' column.")
                continue

            frame['date'] = pd.to_datetime(frame['Date'])
            frame = frame.drop(columns=['Date'])
            # Whatever column is now first becomes the item's series.
            frame = frame.rename(columns={frame.columns[0]: item_key})
            frame['provinsi'] = csv_name.replace(".csv", "").replace(" ", "_")
            frames.append(frame)
    return trends_by_item

### 3. Load Data Komoditas Global

In [None]:
def load_commodity_data(folder_path):
    """Read each commodity CSV in ``folder_path`` into its own DataFrame.

    The dictionary key is derived from the file name: the
    ``" Historical Data.csv"`` suffix is removed, spaces become underscores
    and ``#`` becomes ``num``. The ``Date`` column is parsed to datetime and
    renamed to ``date``; all other columns are left untouched.

    Args:
        folder_path: Directory containing the commodity CSV files.

    Returns:
        dict mapping commodity key -> DataFrame.
    """
    commodities = {}
    for csv_name in os.listdir(folder_path):
        if not csv_name.endswith(".csv"):
            continue
        frame = pd.read_csv(os.path.join(folder_path, csv_name), thousands=',', decimal='.')
        frame['Date'] = pd.to_datetime(frame['Date'])
        key = csv_name.replace(" Historical Data.csv", "").replace(" ", "_").replace("#", "num")
        commodities[key] = frame.rename(columns={'Date': 'date'})
    return commodities

### 4. Load Data Mata Uang

In [None]:
def load_currency_data(folder_path):
    """Read each exchange-rate CSV in ``folder_path`` as a two-column frame.

    The currency code is the file name with the ``"=X.csv"`` suffix removed.
    From every file only the parsed ``Date`` (renamed ``date``) and the
    ``Close`` column (renamed to the currency code) are kept.

    Args:
        folder_path: Directory containing the currency CSV files.

    Returns:
        dict mapping currency code -> DataFrame with columns
        ``['date', <code>]``.
    """
    rates = {}
    csv_names = [name for name in os.listdir(folder_path) if name.endswith(".csv")]
    for csv_name in csv_names:
        code = csv_name.replace("=X.csv", "")
        frame = pd.read_csv(os.path.join(folder_path, csv_name), thousands=',', decimal='.')
        frame['Date'] = pd.to_datetime(frame['Date'])
        renamed = frame.rename(columns={'Date': 'date', 'Close': code})
        rates[code] = renamed[['date', code]]
    return rates

### Preprocess1

In [None]:
def preprocess_main_data(df):
    """Split the ``id`` column into ``item``, ``provinsi`` and ``date``.

    ``id`` is split on ``/``; the three parts are written to the columns
    ``item``, ``provinsi`` and ``date`` (the last one parsed to datetime).
    The frame is modified in place and also returned.
    """
    id_parts = df['id'].str.split('/', expand=True)
    df[['item', 'provinsi', 'date']] = id_parts
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    return df

In [None]:
def reshape_data(df):
    """Turn the wide price table into long format with a composite ``id``.

    Every column other than ``Date`` and ``item`` is treated as a province:
    its name goes to ``provinsi`` and its values to ``price``. ``Date`` is
    renamed to ``date`` and ``id`` is built as
    ``<item>/<provinsi>/<YYYY-MM-DD>``.
    """
    long_df = pd.melt(df, id_vars=['Date', 'item'], var_name='provinsi', value_name='price')
    long_df = long_df.rename(columns={'Date': 'date'})
    date_text = long_df['date'].dt.strftime('%Y-%m-%d')
    long_df['id'] = long_df['item'] + '/' + long_df['provinsi'] + '/' + date_text
    return long_df

### 1. FE Data Utama

In [None]:
def create_date_features(df):
    """Add integer calendar features derived from the ``date`` column.

    Adds, in this order: ``year``, ``month``, ``day``, ``dayofweek``,
    ``dayofyear``, ``weekofyear`` (ISO week), ``quarter`` and the 0/1 flags
    ``is_month_start``, ``is_month_end``, ``is_quarter_start``,
    ``is_quarter_end``, ``is_year_start``, ``is_year_end``.
    The frame is modified in place and also returned.
    """
    stamps = df['date'].dt

    for name in ('year', 'month', 'day', 'dayofweek', 'dayofyear'):
        df[name] = getattr(stamps, name).astype(int)

    # The ISO week number comes from isocalendar(), not a plain attribute.
    df['weekofyear'] = stamps.isocalendar().week.astype(int)

    trailing = (
        'quarter',
        'is_month_start',
        'is_month_end',
        'is_quarter_start',
        'is_quarter_end',
        'is_year_start',
        'is_year_end',
    )
    for name in trailing:
        df[name] = getattr(stamps, name).astype(int)
    return df

In [None]:
def create_lag_agg_features(df, lags, windows, group_cols, target_col):
    """Add per-group lag, difference and rolling statistics of ``target_col``.

    For every ``lag`` two columns are added: ``<target>_lag_<lag>`` (value
    ``lag`` rows earlier within the group) and ``<target>_lag_<lag>_diff``
    (difference to that value). For every ``window`` four columns are
    added: rolling mean/std of the target (``<target>_mean_<w>``,
    ``<target>_std_<w>``) and rolling mean/std of the one-step difference
    (``<target>_mean_<w>_diff``, ``<target>_std_<w>_diff``).

    The rolling "diff" features are based on the one-step difference. When
    ``1`` is not in ``lags`` (so no ``<target>_lag_1_diff`` column exists)
    that difference is computed on the fly instead of raising ``KeyError``;
    no extra column is added in that case.

    All operations are positional within each group, so rows are expected
    to be in chronological order per group.

    Args:
        df: Frame to extend; modified in place.
        lags: Iterable of positive integer lags.
        windows: Iterable of rolling window sizes.
        group_cols: Column name or list of column names identifying a series.
        target_col: Column the features are derived from.

    Returns:
        The same ``df`` object with the new columns appended.
    """
    # The grouping does not change while columns are appended, so build it once.
    grouped_target = df.groupby(group_cols)[target_col]

    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = grouped_target.shift(lag)
        df[f'{target_col}_lag_{lag}_diff'] = grouped_target.diff(lag)

    diff_col = f'{target_col}_lag_1_diff'
    one_step_diff = df[diff_col] if diff_col in df.columns else grouped_target.diff(1)
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    grouped_diff = one_step_diff.groupby([df[key] for key in keys])

    for window in windows:
        df[f'{target_col}_mean_{window}'] = grouped_target.transform(lambda x: x.rolling(window=window).mean())
        df[f'{target_col}_std_{window}'] = grouped_target.transform(lambda x: x.rolling(window=window).std())
        df[f'{target_col}_mean_{window}_diff'] = grouped_diff.transform(lambda x: x.rolling(window=window).mean())
        df[f'{target_col}_std_{window}_diff'] = grouped_diff.transform(lambda x: x.rolling(window=window).std())
    return df

### 2. FE Google Trends

In [None]:
def process_google_trends(df):
    """Engineer lag, calendar and holiday features for one trends frame.

    Works on a copy of ``df``. Lag/rolling features are built per
    ``provinsi`` from the column at position 1, calendar features are taken
    from ``date``, ``is_holiday`` flags dates contained in a fixed holiday
    list, and missing values in lag columns are replaced by the
    per-province median.

    Args:
        df: Frame with at least ``date`` (datetime) and ``provinsi`` columns.

    Returns:
        A new DataFrame with the engineered columns appended.
    """
    df = df.copy()
    # Feature Engineering: Lag Features (longer lags added)
    # NOTE(review): the source series is chosen by position (df.columns[1]).
    # If the incoming frame is ordered [<item>, 'date', 'provinsi'], position 1
    # is the 'date' column rather than the trend values - confirm the column
    # layout produced upstream.
    df = create_lag_agg_features(df, lags=[1, 7, 30, 90, 180, 365], windows=[3, 7, 14], group_cols=['provinsi'], target_col=df.columns[1])

    df['dayofweek'] = df['date'].dt.dayofweek
    df['dayofyear'] = df['date'].dt.dayofyear
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['quarter'] = df['date'].dt.quarter
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['date'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['date'].dt.is_quarter_end.astype(int)
    df['is_year_start'] = df['date'].dt.is_year_start.astype(int)
    df['is_year_end'] = df['date'].dt.is_year_end.astype(int)

    holidays = ['2022-01-01', '2022-02-01', '2022-02-28', '2022-03-03', '2022-04-15', '2022-04-29',  '2022-05-01', '2022-05-02','2022-05-03',
            '2022-05-04','2022-05-05','2022-05-06','2022-05-16', '2022-05-26', '2022-06-01', '2022-07-09', '2022-07-30', '2022-08-17','2022-10-08', '2022-12-25',
            '2023-01-01', '2023-01-22','2023-01-23', '2023-02-18', '2023-03-22', '2023-03-23', '2023-04-07', '2023-04-19','2023-04-20','2023-04-21',
            '2023-04-22', '2023-04-23','2023-04-24', '2023-04-25','2023-05-01','2023-05-18', '2023-06-01','2023-06-02','2023-06-04',
            '2023-06-28','2023-06-29','2023-06-30', '2023-07-19', '2023-08-17','2023-09-28', '2023-12-25','2023-12-26',
            '2024-01-01', '2024-02-08', '2024-02-09', '2024-02-10', '2024-03-11','2024-03-12', '2024-03-29','2024-03-31', '2024-04-08', '2024-04-09', '2024-04-10',
            '2024-04-11','2024-04-12','2024-04-15', '2024-05-01', '2024-05-09', '2024-05-10', '2024-05-23', '2024-05-24', '2024-06-01', '2024-06-17', '2024-06-18',
            '2024-07-07','2024-08-17','2024-09-16']
    # Compare datetimes with datetimes: leaving the list as strings relies on
    # pandas parsing them implicitly inside isin(), which newer pandas
    # releases deprecate (strings will stop matching datetime values).
    holiday_dates = pd.to_datetime(holidays)
    df['is_holiday'] = df['date'].isin(holiday_dates).astype(int)

    # NOTE(review): the rolling features created above are named
    # '<target>_mean_<w>' / '<target>_std_<w>', so the 'rolling' test below
    # never matches and only columns containing 'lag' are imputed. Confirm
    # whether the rolling columns were meant to be imputed as well.
    columns_to_impute = [col for col in df.columns if 'lag' in col or 'rolling' in col]
    for col in columns_to_impute:
        if df[col].isnull().any():
            df[col] = df.groupby('provinsi')[col].transform(lambda x: x.fillna(x.median()))
    return df

### 3. FE Data Mata Uang

In [None]:
def fe_currency_data(currency_data):
    """Add a 1-row lag and 7-row rolling mean to each currency frame.

    For every ``{code: frame}`` entry the frame is first sorted by ``date``
    (when that column exists), because ``shift`` and ``rolling`` are purely
    positional: on a file stored newest-first, ``shift(1)`` would otherwise
    return the *next* day's rate instead of the previous one. Then
    ``<code>_lag1`` and ``<code>_mean7`` are computed from the raw series,
    and finally the level column itself is winsorized at 5 % on each side.

    The input frames are not modified; new frames are returned.

    Args:
        currency_data: dict mapping currency code -> DataFrame containing a
            column named after the code.

    Returns:
        dict mapping currency code -> processed DataFrame.
    """
    processed_currency = {}
    for currency, curr_df in currency_data.items():
        if 'date' in curr_df.columns:
            # Stable sort keeps the original order of rows sharing a date.
            curr_df = curr_df.sort_values('date', kind='mergesort')
        else:
            curr_df = curr_df.copy()
        curr_df[f'{currency}_lag1'] = curr_df[currency].shift(1)
        curr_df[f'{currency}_mean7'] = curr_df[currency].rolling(window=7).mean()
        # Winsorizing happens after the lag/mean, so those use raw values.
        curr_df[currency] = winsorize(curr_df[currency].to_numpy(), limits=[0.05, 0.05])
        processed_currency[currency] = curr_df
    return processed_currency

### 4. FE Data Komoditas

In [None]:
def fe_commodity_data(commodity_data):
    """Reduce each commodity frame to its price and add lag/rolling features.

    For every ``{name: frame}`` entry the columns ``Open``, ``High``,
    ``Low``, ``Vol.`` and ``Change %`` are dropped when present, ``Price``
    is renamed to the commodity name, and rows are sorted by ``date`` (when
    that column exists). Sorting matters because ``shift`` and ``rolling``
    are positional: on a file stored newest-first, ``shift(1)`` would
    otherwise expose the following day's price. The price is then
    winsorized at 5 % on each side and ``<name>_lag1``, ``<name>_mean7`` and
    ``<name>_std7`` are derived from the winsorized series.

    The input frames are not modified; new frames are returned.

    Args:
        commodity_data: dict mapping commodity name -> DataFrame with a
            ``Price`` column.

    Returns:
        dict mapping commodity name -> processed DataFrame.
    """
    processed_commodity = {}
    unused_columns = ['Open', 'High', 'Low', 'Vol.', 'Change %']
    for commodity_name, commodity_df in commodity_data.items():
        commodity_df = commodity_df.drop(columns=unused_columns, errors='ignore')
        commodity_df = commodity_df.rename(columns={'Price': commodity_name})
        if 'date' in commodity_df.columns:
            # Stable sort keeps the original order of rows sharing a date.
            commodity_df = commodity_df.sort_values('date', kind='mergesort')
        commodity_df[commodity_name] = winsorize(commodity_df[commodity_name].to_numpy(), limits=[0.05, 0.05])
        commodity_df[f'{commodity_name}_lag1'] = commodity_df[commodity_name].shift(1)
        commodity_df[f'{commodity_name}_mean7'] = commodity_df[commodity_name].rolling(window=7).mean()
        commodity_df[f'{commodity_name}_std7'] = commodity_df[commodity_name].rolling(window=7).std()
        processed_commodity[commodity_name] = commodity_df
    return processed_commodity

### 5. Function Lainnya

In [None]:
def create_interactions(df, all_trends):
    """Add ``<item>_x_USDIDR`` interaction columns (trend value x USD/IDR rate).

    For every key of ``all_trends`` that is also a column of ``df`` the
    product of that column with the USD/IDR rate column is added. The rate
    is read from ``currency_USDIDR`` when present; otherwise from
    ``USDIDR``, which is the name a frame gets when the currency column is
    named after the bare currency code.

    Args:
        df: Merged feature frame (not modified; a copy is returned).
        all_trends: Mapping whose keys are the trend column names.

    Returns:
        A copy of ``df`` with the interaction columns appended.

    Raises:
        KeyError: If an interaction has to be built but neither rate column
            exists.
    """
    df = df.copy()
    trend_columns = [item_name for item_name in all_trends if item_name in df.columns]
    if not trend_columns:
        return df

    rate_column = next((name for name in ('currency_USDIDR', 'USDIDR') if name in df.columns), None)
    if rate_column is None:
        raise KeyError("Neither 'currency_USDIDR' nor 'USDIDR' column is present in df")

    for item_name in trend_columns:
        df[f'{item_name}_x_USDIDR'] = df[item_name] * df[rate_column]
    return df

In [None]:
def _merge_new_columns(left, right, keys):
    """Left-join ``right`` onto ``left``, bringing only columns ``left`` lacks."""
    new_columns = [col for col in right.columns if col not in left.columns and col not in keys]
    return pd.merge(left, right[list(keys) + new_columns], how='left', on=list(keys))


def merge_all_data(main_df, all_trends, currency_data, commodity_data):
    """Join trends, currency and commodity features onto the main frame.

    Trend frames are left-joined on ``['date', 'provinsi']``; currency and
    commodity frames on ``'date'``. From each joined frame only columns not
    already present are taken: every trend frame carries the same calendar
    columns (``month``, ``dayofweek``, ...), and merging them repeatedly
    would rename existing columns with ``_x``/``_y`` suffixes and
    eventually collide on those suffixed names.

    After merging, missing values are filled as follows:

    * trend level columns (names that are keys of ``all_trends``) -> 0;
    * currency level columns and any column whose name contains
      ``currency``, ``lag``, ``mean`` or ``std`` -> forward fill within
      each ``(item, provinsi)`` group;
    * commodity level columns -> plain forward fill.

    Args:
        main_df: Long price frame with ``date``, ``item`` and ``provinsi``.
        all_trends: dict item name -> trends DataFrame.
        currency_data: dict currency code -> DataFrame with a ``date`` column.
        commodity_data: dict commodity name -> DataFrame with a ``date`` column.

    Returns:
        A new merged DataFrame; ``main_df`` is not modified.
    """
    merged_df = main_df.copy()

    for item_name, trend_df in all_trends.items():
        merged_df = _merge_new_columns(merged_df, trend_df, ['date', 'provinsi'])

    for currency, curr_df in currency_data.items():
        if 'date' in curr_df.columns:
            merged_df = _merge_new_columns(merged_df, curr_df, ['date'])
        else:
            print(f"Warning: Kolom 'date' tidak ditemukan di dataframe untuk {currency}.")

    for commodity_name, commodity_df in commodity_data.items():
        if 'date' in commodity_df.columns:
            merged_df = _merge_new_columns(merged_df, commodity_df, ['date'])
        else:
            print(f"Warning: Kolom 'date' tidak ditemukan di dataframe untuk {commodity_name}.")

    # Imputation before SimpleImputer
    for col in merged_df.columns:
        if not merged_df[col].isnull().any():
            continue
        if col in all_trends:
            merged_df[col] = merged_df[col].fillna(0)
        elif (col in currency_data or 'currency' in col
              or 'lag' in col or 'mean' in col or 'std' in col):
            # Currency level columns are named after the bare code, so they
            # are matched through the dict keys rather than by substring.
            merged_df[col] = merged_df.groupby(['item', 'provinsi'])[col].ffill()
        elif col in commodity_data:
            merged_df[col] = merged_df[col].ffill()
    return merged_df

In [None]:
def difference(series, d=1):
    """Return ``series`` minus its own value ``d`` positions earlier.

    The first ``d`` entries of the result are NaN.
    """
    differenced = series.diff(d)
    return differenced

In [None]:
def inverse_difference(first_value, forecast):
    """Undo one differencing step: add the forecast change to the previous level."""
    restored = first_value + forecast
    return restored

## --- Load Data ---

In [None]:
# Load the training price files and keep the raw load separate from the
# working copy that later cells reshape and extend.
# NOTE: the path is absolute and machine-specific; adjust it before running
# elsewhere.
train_folder = r'c:\Users\mikae\OneDrive\Documents\Lomba\DataVidia\Penyisihan\Harga Bahan Pangan\train'
train_data = load_and_process_price_data(train_folder)
train_df = train_data.copy()

In [None]:
# Load the test price files the same way as the training files; test_df is
# the working copy modified by later cells.
# NOTE: the path is absolute and machine-specific.
test_folder = r'c:\Users\mikae\OneDrive\Documents\Lomba\DataVidia\Penyisihan\Harga Bahan Pangan\test'
test_data = load_and_process_price_data(test_folder)
test_df = test_data.copy()

In [None]:
# Load Google Trends data: one sub-folder per item, one CSV per province.
# all_trends maps item name -> list of per-province DataFrames at this point.
# NOTE: the path is absolute and machine-specific.
google_trends_folder = r'C:\Users\mikae\OneDrive\Documents\Lomba\DataVidia\Penyisihan\Google Trends Merged' #corrected folder
all_trends = load_google_trends(google_trends_folder)

In [None]:
# Load global commodity prices and exchange rates; each call returns a dict
# of DataFrames keyed by a name derived from the file name.
# NOTE: the paths are absolute and machine-specific.
commodity_data = load_commodity_data(r'C:\Users\mikae\OneDrive\Documents\Lomba\DataVidia\Penyisihan\Global Commodity Price')
currency_data = load_currency_data(r'C:\Users\mikae\OneDrive\Documents\Lomba\DataVidia\Penyisihan\Mata Uang')

## --- Preprocessing & Feature Engineering ---

In [None]:
# Convert both wide tables (one column per province) to long format with
# 'date', 'item', 'provinsi', 'price' and a composite 'id'.
train_df = reshape_data(train_df)
test_df = reshape_data(test_df)
# Re-derives item/provinsi/date from the 'id' that reshape_data just built,
# so the values are the same; 'date' ends up as datetime.
test_df = preprocess_main_data(test_df)

In [None]:
# Add integer calendar features (year, month, ..., is_year_end) to both sets.
train_df = create_date_features(train_df)
test_df = create_date_features(test_df)

In [None]:
# Label Encoding
# Each encoder is fitted on train and test values together so both frames
# share one integer mapping; the fitted encoders are kept in label_encoders.
# NOTE(review): from here on 'item' and 'provinsi' are integer codes, while
# the Google Trends frames carry province names as strings and are merged on
# 'provinsi' later - confirm the key dtypes/values actually match.
label_encoders = {}
for col in ['item', 'provinsi']:
    label_encoders[col] = LabelEncoder()
    combined_data = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    label_encoders[col].fit(combined_data)
    train_df[col] = label_encoders[col].transform(train_df[col].astype(str))
    test_df[col] = label_encoders[col].transform(test_df[col].astype(str))

In [None]:
# Log Transform (before lag features)
# Training prices are modelled on the log1p scale; predictions are mapped
# back with expm1 at the end of the notebook.
train_df['price'] = np.log1p(train_df['price'])
# test_df is NOT transformed

In [None]:
# Lag and Agg Features (before merging)
# Per (item, provinsi) series: lags/differences at 1, 2, 3, 7, 14, 30 rows and
# rolling mean/std over 3, 7, 14 rows of 'price'.
# NOTE(review): train 'price' is on the log1p scale here while test 'price'
# is not - confirm the test-side features are meaningful.
train_df = create_lag_agg_features(train_df, lags=[1, 2, 3, 7, 14, 30], windows=[3, 7, 14], group_cols=['item', 'provinsi'], target_col='price')
test_df = create_lag_agg_features(test_df, lags=[1, 2, 3, 7, 14, 30], windows=[3, 7, 14], group_cols=['item', 'provinsi'], target_col='price')

In [None]:
# Process Google Trends
# Engineer features for every per-province frame, then stack the provinces of
# each item into one DataFrame. The result is built in a new dict: an item
# whose folder yielded no usable CSV has an empty list, and pd.concat raises
# ValueError on an empty list, so such items are reported and left out.
processed_trends = {}
for item_name, trends_list in all_trends.items():
    processed_list = [process_google_trends(trend_df) for trend_df in trends_list]
    if not processed_list:
        print(f"Warning: no Google Trends data for {item_name}; skipping.")
        continue
    processed_trends[item_name] = pd.concat(processed_list, ignore_index=True)
all_trends = processed_trends

In [None]:
# Feature Engineering for Currency and Commodity Data
# Both dicts are replaced by their processed versions (lag/rolling features
# added, level columns winsorized).
currency_data = fe_currency_data(currency_data)
commodity_data = fe_commodity_data(commodity_data)

In [None]:
# Merge Data
# Join trends (on date + provinsi) and currency/commodity features (on date)
# onto the train and test frames; the same external frames feed both.
train_df = merge_all_data(train_df, all_trends, currency_data, commodity_data)
test_df = merge_all_data(test_df, all_trends, currency_data, commodity_data)

In [None]:
# Create Interactions (after merging, before imputation)
# Adds one '<item>_x_USDIDR' column per trend column present in the frame.
train_df = create_interactions(train_df, all_trends)
test_df = create_interactions(test_df, all_trends)

## --- SARIMA Modelling ---

In [None]:
# --- SARIMA Modeling: build one gap-free daily series per (item, provinsi) ---

# 1. Work on a copy indexed by date so each group can be reindexed.
train_df_sarima = train_df.copy().set_index('date')

# Complete daily calendar spanning the whole training set. The index is named
# 'date' because reindexing onto another Index adopts that Index's name; with
# an unnamed range the 'date' column would come back as 'index' after
# reset_index() and the final set_index([... 'date']) would fail.
full_date_range = pd.date_range(
    start=train_df_sarima.index.min(),
    end=train_df_sarima.index.max(),
    freq='D',
    name='date',
)

# 2. Reindex and forward-fill *inside* each group. Filling after the groups
# are concatenated would copy the previous group's last values (and its
# item/provinsi keys) into the leading gap of the next group.
reindexed_groups = []
for (item_id, provinsi_id), group_df in train_df_sarima.groupby(['item', 'provinsi']):
    group_df = group_df.reindex(full_date_range).ffill()
    # Rows created by reindexing have no keys; restore them explicitly so
    # leading gaps (which ffill cannot reach) stay attached to this group.
    group_df['item'] = item_id
    group_df['provinsi'] = provinsi_id
    reindexed_groups.append(group_df)

# 3. Bring 'date' back as a regular column before differencing.
train_df_sarima = pd.concat(reindexed_groups).reset_index()

# 4. First difference of the (log) price within each series.
train_df_sarima['price_diff'] = train_df_sarima.groupby(['item', 'provinsi'])['price'].diff()

# 5. Drop rows without a difference: the first row of each group and any
# leading dates before the group's first observation.
train_df_sarima.dropna(subset=['price_diff'], inplace=True)
train_df_sarima = train_df_sarima.set_index(['item', 'provinsi', 'date'])

In [None]:
# --- Parameter Tuning Function ---
def train_sarima(train_data, order, seasonal_order):
    """Fit one SARIMAX model and return the fitted result, or None on failure.

    The model is built with stationarity and invertibility enforcement
    switched off and fitted silently (``disp=False``). Any exception raised
    while building or fitting is reported on stdout and turned into a
    ``None`` return so a parameter search can continue with the next
    candidate.

    Args:
        train_data: Series the model is fitted on.
        order: ``(p, d, q)`` tuple.
        seasonal_order: ``(P, D, Q, s)`` tuple.

    Returns:
        The fitted results object, or ``None`` if fitting failed.
    """
    # NOTE(review): no import of ``SARIMAX`` appears in the visible source.
    # Without it the resulting NameError is caught below and every call
    # returns None - confirm the import exists elsewhere.
    try:
        fitted = SARIMAX(
            train_data,
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False,
        ).fit(disp=False)
    except Exception as e:
        print(f"SARIMA training failed with order={order}, seasonal_order={seasonal_order}. Error: {e}") # Debugging
        return None
    return fitted

In [None]:
# --- Parallel Loop with tqdm and Memory Mapping ---
from tqdm import tqdm
from joblib import dump, load  # Import for memory mapping

# Persist the prepared training frame with joblib and load it back with
# mmap_mode='r'; the reloaded object is what the parallel workers receive.
# NOTE(review): whether the buffers of a pandas DataFrame are actually
# memory-mapped by joblib in this setup is not shown here - confirm.
dump(train_df_sarima, 'train_df_sarima.joblib')
train_df_sarima_memmap = load('train_df_sarima.joblib', mmap_mode='r')

# --- PARALLEL PROCESSING ---
def tune_sarima_for_group(group_data):  # Simplified arguments
    """Grid-search SARIMA orders for one (item, provinsi) series.

    ``group_data`` is one group of the prepared training frame: indexed by
    ``(item, provinsi, date)`` with a ``price`` column (levels) and a
    ``price_diff`` column where ``price_diff[k] = price[k] - price[k-1]``.

    Every combination of p, q in {0, 1, 2}, P, Q in {0, 1}, d = D = 1 and
    season length 7 is fitted on ``price_diff``. In-sample predictions are
    integrated back to price levels and scored with RMSE and MAPE; the
    fit with the lowest RMSE is kept.

    Args:
        group_data: DataFrame for a single (item, provinsi) group.

    Returns:
        ``(results, (item_id, provinsi_id), best_model)`` where ``results``
        is a list of score dicts and ``best_model`` the best fitted model.
        ``results`` is ``None`` when the group is skipped (fewer than 10
        rows or a non-daily date index); ``best_model`` is ``None`` whenever
        no candidate could be fitted and scored.
    """
    item_id = group_data.index.get_level_values('item')[0]
    provinsi_id = group_data.index.get_level_values('provinsi')[0]
    group_key = (item_id, provinsi_id)

    train_subset = group_data['price_diff']  # Already reindexed, filled, and differenced

    if len(train_subset) < 10:  # Not enough data to fit a seasonal model.
        return None, group_key, None

    if group_data.index.names[2] is not None:
        if group_data.index.get_level_values('date').inferred_freq != 'D':
            print(f"Warning: Could not set daily frequency for item={item_id}, provinsi={provinsi_id}") # Debugging
            return None, group_key, None

    results = []
    # Must start as None: if every candidate fails or is skipped, the return
    # statement would otherwise hit an unbound local variable.
    best_model = None
    best_rmse = float('inf')

    # The observed levels do not depend on the candidate, so prepare them once.
    original_prices = group_data['price'].dropna()
    if len(original_prices) < 2:
        return results, group_key, best_model
    first_price = original_prices.iloc[0]
    actual_levels = original_prices.iloc[1:]

    # Parameter grid - deliberately small.
    # NOTE(review): d=1 and D=1 are applied to a series that is already
    # first-differenced - confirm the extra differencing is intended.
    p = [0, 1, 2]
    d = [1]
    q = [0, 1, 2]
    P = [0, 1]
    D = [1]
    Q = [0, 1]
    s = 7  # Weekly seasonality
    pdq = list(product(p, d, q))
    seasonal_pdq = [(x[0], x[1], x[2], s) for x in product(P, D, Q)]

    for order, seasonal_order in product(pdq, seasonal_pdq):
        model_fit = train_sarima(train_subset, order, seasonal_order)
        if model_fit is None:
            continue

        predictions_diff = np.asarray(model_fit.predict())

        # Integrate back to levels. price[k] = price[0] + sum(diff[1..k]), so
        # the prediction for row 0's own difference must be left out;
        # including it shifts every reconstructed level by one step.
        predicted_levels = first_price + np.cumsum(predictions_diff[1:])

        if len(predicted_levels) != len(actual_levels):
            continue  # Cannot be aligned; try the next parameter set.

        if np.isnan(predicted_levels).any():
            continue

        rmse = np.sqrt(mean_squared_error(actual_levels, predicted_levels))
        mape = mean_absolute_percentage_error(actual_levels, predicted_levels)

        results.append({
            'item': item_id,
            'provinsi': provinsi_id,
            'order': order,
            'seasonal_order': seasonal_order,
            'rmse': rmse,
            'mape': mape
        })

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model_fit

    return results, group_key, best_model

# Tune every (item, provinsi) series in parallel.
# tqdm wraps the generator that feeds joblib, so the bar advances as groups
# are handed to the workers (a close proxy for completion, since joblib only
# dispatches a limited number of tasks ahead). The group count comes from
# ``ngroups`` so the groups are not materialised just to be counted.
sarima_groups = train_df_sarima_memmap.groupby(['item', 'provinsi'])
results = Parallel(n_jobs=-1)(
    delayed(tune_sarima_for_group)(group_data)
    for _, group_data in tqdm(sarima_groups, total=sarima_groups.ngroups)
)

# --- Collect Results ---
# Each worker returns (score dicts or None, (item, provinsi), best fit or None).
all_results = []
best_models = {}
for result_list, (item_id, provinsi_id), best_model in results:
    if result_list is not None:  # None means the group was skipped.
        all_results.extend(result_list)
    if best_model is not None:
        best_models[(item_id, provinsi_id)] = best_model
results_df = pd.DataFrame(all_results)
if results_df.empty:
    # An empty frame has no 'rmse' column, so sorting by it would raise KeyError.
    print("No SARIMA configuration could be fitted and scored.")
else:
    print(results_df.sort_values(by='rmse').head())

## --- Prediction on Test Data ---

In [None]:
# Work on a copy of the test frame indexed by date; 'item' and 'provinsi'
# stay as regular columns.
test_df_sarima = test_df.copy()
test_df_sarima = test_df_sarima.set_index('date')

In [None]:
# Forecast every (item, provinsi) series present in the test set and write the
# predictions back to exactly the rows of that series. Assigning by group
# instead of collecting one flat list and assigning it positionally keeps the
# result independent of the row order of the test frame.
# NOTE(review): ``SARIMAX`` is used below but no import for it appears in the
# visible source - confirm it is imported before this cell runs.
test_df_sarima['price'] = np.nan  # Groups without a usable model stay NaN.

train_items = train_df_sarima.index.get_level_values('item')
train_provinsi = train_df_sarima.index.get_level_values('provinsi')

# Iterating the groups that exist avoids empty item x provinsi combinations.
for (item_id, provinsi_id), test_subset in test_df_sarima.groupby(['item', 'provinsi']):
    group_key = (item_id, provinsi_id)

    # No tuned model for this series: leave its predictions as NaN.
    if group_key not in best_models:
        continue

    train_subset = train_df_sarima[(train_items == item_id) & (train_provinsi == provinsi_id)]
    final_train_diff = train_subset['price_diff'].dropna()

    # Frequency inference needs at least three dates, so check length first.
    if len(final_train_diff) < 3:
        continue

    # The training index is (item, provinsi, date); the frequency has to be
    # read from the date level, a MultiIndex itself has no inferred_freq.
    train_dates = final_train_diff.index.get_level_values('date')
    if train_dates.inferred_freq != 'D':
        print(f"Skipping item={item_id}, provinsi={provinsi_id}: Could not set freq for final model") # Debugging
        continue

    # Forecast from the day after the last training date through the last
    # test date, so each forecast step maps to a calendar day.
    last_train_date = train_dates.max()
    horizon = (test_subset.index.max() - last_train_date).days
    if horizon < 1:
        continue

    # Refit the best configuration on the differenced training series.
    best_fit = best_models[group_key]
    final_model = SARIMAX(final_train_diff.to_numpy(), order=best_fit.model.order,
                          seasonal_order=best_fit.model.seasonal_order,
                          enforce_stationarity=False, enforce_invertibility=False)
    final_model_fit = final_model.fit(disp=False, maxiter=1000)

    # Out-of-sample forecast of the *differenced* series.
    n_train = len(final_train_diff)
    test_predictions_diff = np.asarray(
        final_model_fit.get_prediction(start=n_train, end=n_train + horizon - 1).predicted_mean
    )

    # Inverse difference: accumulate predicted changes on top of the last
    # observed training level.
    last_train_value = train_subset['price'].dropna().iloc[-1]
    forecast_dates = pd.date_range(start=last_train_date + pd.Timedelta(days=1), periods=horizon, freq='D')
    forecast_levels = pd.Series(last_train_value + np.cumsum(test_predictions_diff), index=forecast_dates)

    # groupby keeps the original row order inside a group, so the values
    # line up with the masked rows; dates without a forecast become NaN.
    group_mask = ((test_df_sarima['item'] == item_id) & (test_df_sarima['provinsi'] == provinsi_id)).to_numpy()
    test_df_sarima.loc[group_mask, 'price'] = forecast_levels.reindex(test_subset.index).to_numpy()

In [None]:
# Inverse Log Transform
# The training prices were transformed with log1p, so predictions are mapped
# back to the original scale with expm1 before being written out.
test_df_sarima['price'] = np.expm1(test_df_sarima['price'])  # INVERSE TRANSFORM
# Keep only the two submission columns and write them without the date index.
submission_sarima = test_df_sarima[['id', 'price']]
submission_sarima.to_csv('submission_sarima.csv', index=False)
print("File submission_sarima.csv telah dibuat.")