In [7]:
# Install the third-party packages this notebook imports further down.
# NOTE(review): versions are not pinned, so a later re-run may resolve different releases.
!pip install dagshub mlflow neuralforecast --quiet

# Warning configuration for the whole notebook session.
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

# Silence statsmodels' ValueWarning category specifically.
warnings.filterwarnings("ignore", category=ValueWarning)
# Blanket filter: ignores every warning category, so it also covers the filter above.
# NOTE(review): this hides deprecation and convergence warnings too.
warnings.filterwarnings("ignore")

print("Done!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.8/285.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [9]:
import mlflow.sklearn
from datetime import datetime
import joblib
import dagshub
import mlflow
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from statsmodels.tsa.arima.model import ARIMA
import os
from neuralforecast.models import DLinear
from neuralforecast import NeuralForecast

# Connect this session to the DagsHub repository named below.
# NOTE(review): mlflow=True presumably points MLflow tracking at that repository and may
# prompt for credentials — confirm against the dagshub client documentation.
dagshub.init(repo_owner='gnada22', repo_name='ml_final_project', mlflow=True)

In [10]:
# class definitions

class DateFeatureCreator(BaseEstimator, TransformerMixin):
    """Swap the ``Date`` column for a week counter plus cyclic encodings.

    Columns added by ``transform``: ``week``, ``sin_13``, ``cos_13``,
    ``sin_23`` and ``cos_23``. The ``Date`` column is removed afterwards.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with date-derived features instead of ``Date``.

        ``week`` is the dense rank (zero-based) of each row's weekly period
        among the periods present in ``X``, so the earliest week in the frame
        being transformed is always 0.
        """
        out = X.copy()

        weekly_period = out["Date"].dt.to_period("W")
        out["week"] = weekly_period.rank(method="dense").astype(int) - 1

        # Sine/cosine pairs of the week counter for each cycle length (in weeks).
        cycles = ((13, "sin_13", "cos_13"), (23, "sin_23", "cos_23"))
        for length, sin_col, cos_col in cycles:
            out[sin_col] = np.sin(2 * np.pi * out["week"] / length)
            out[cos_col] = np.cos(2 * np.pi * out["week"] / length)

        return out.drop(columns=["Date"])

# Names of the columns that DateFeatureCreator.transform adds to a frame.
date_features = ["week", "sin_13", "cos_13", "sin_23", "cos_23"]

class LagFeatureAdder:
    """Attach lagged ``Weekly_Sales`` columns per (Store, Dept) series."""

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` sorted by Store, Dept, Date with ``lag_1`` and ``lag_52`` added.

        The lags are positional shifts of 1 and 52 rows inside each
        (Store, Dept) group of the sorted frame; rows without enough history
        get NaN. The input frame itself is not modified.
        """
        ordered = df.sort_values(["Store", "Dept", "Date"])
        sales_per_series = ordered.groupby(["Store", "Dept"])["Weekly_Sales"]
        return ordered.assign(
            lag_1=sales_per_series.shift(1),
            lag_52=sales_per_series.shift(52),
        )

# Names of the columns that LagFeatureAdder.transform adds to a frame.
lag_features = ["lag_1", "lag_52"]

# All engineered feature names: the date-derived ones followed by the lag ones.
added_features = date_features + lag_features

class ColumnTransformerWithNames(ColumnTransformer):
    """ColumnTransformer whose outputs are DataFrames with readable column names.

    ``transform`` and ``fit_transform`` wrap the parent's result in a
    ``pandas.DataFrame`` that keeps the index of the input ``X`` and uses the
    output feature names with everything up to the first ``"__"`` removed.
    """

    def get_feature_names_out(self, input_features=None):
        """Delegate to ``ColumnTransformer.get_feature_names_out`` unchanged."""
        return super().get_feature_names_out(input_features)

    def _to_named_frame(self, values, index):
        """Wrap transformed ``values`` in a DataFrame with prefix-stripped column names."""
        # Feature names are looked up once; "<prefix>__<col>" becomes "<col>".
        cols = [c.split("__", 1)[-1] for c in self.get_feature_names_out()]
        return pd.DataFrame(values, columns=cols, index=index)

    def transform(self, X):
        """Transform ``X`` and return the result as a DataFrame indexed like ``X``."""
        return self._to_named_frame(super().transform(X), X.index)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y``), transform it, and return a DataFrame indexed like ``X``."""
        return self._to_named_frame(super().fit_transform(X, y), X.index)

class MultiIndexKeeper(BaseEstimator, TransformerMixin):
    """Index a frame by ``index_cols`` while keeping those columns in place."""

    def __init__(self, index_cols=["Date", "Store", "Dept"]):
        self.index_cols = index_cols

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return a new frame indexed by ``index_cols``; ``X`` is left untouched.

        ``drop=False`` keeps the index columns available as regular columns too.
        """
        indexed = X.set_index(self.index_cols, drop=False)
        return indexed

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Remove a configured set of columns from a frame."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return ``X`` without ``self.columns``; names not present are skipped silently."""
        remaining = X.drop(columns=self.columns, errors="ignore")
        return remaining

def extract_onehot_value(row, prefix, default=1):
    """Decode a one-hot group of columns in ``row`` back to an integer.

    Scans the labels of ``row`` in order and takes the first one that starts
    with ``prefix`` and whose value equals 1. The integer is parsed from the
    text between the first and second underscore of that label (for example
    ``"Store_7"`` gives ``7``). ``default`` is returned when no such label exists.
    """
    active_labels = (
        label
        for label in row.index
        if label.startswith(prefix) and row[label] == 1
    )
    first_active = next(active_labels, None)
    if first_active is None:
        return default
    return int(first_active.split("_")[1])

class ResidualRegressor(BaseEstimator, RegressorMixin):
    """Two-stage regressor: a base model plus a second model fitted on its residuals.

    Parameters
    ----------
    base_model : estimator
        Cloned and fitted on ``(X, y)``.
    residual_model : estimator
        Cloned and fitted on ``(X, y - base_prediction)``.

    ``predict`` works recursively week by week: the ``lag_1`` / ``lag_52``
    values of a row are replaced by predictions made earlier in the same call
    whenever one exists for that (store, dept, week).
    """

    def __init__(self, base_model, residual_model):
        self.base_model = base_model
        self.residual_model = residual_model

    def fit(self, X, y):
        """Fit the base model on ``y`` and the residual model on what it misses."""
        self.base_model_ = clone(self.base_model)
        self.base_model_.fit(X, y)
        residuals = y - self.base_model_.predict(X)

        self.residual_model_ = clone(self.residual_model)
        self.residual_model_.fit(X, residuals)
        return self

    def extract_onehot_value(self, row, prefix, default=1):
        """Decode one-hot columns starting with ``prefix`` back to an integer.

        Returns the integer parsed from the first label that starts with
        ``prefix`` and has value 1 (``"Store_7"`` gives ``7``), or ``default``
        when no such label is active.
        NOTE(review): the default of 1 presumably stands for a category dropped
        by the one-hot encoder — confirm against the fitted pipeline.
        """
        for col in row.index:
            if col.startswith(prefix) and row[col] == 1:
                return int(col.split("_")[1])
        return default

    def predict(self, X):
        """Predict recursively, one week at a time, feeding predictions back as lags.

        ``X`` must contain ``week``, ``lag_1`` and ``lag_52`` columns plus the
        one-hot ``Store_*`` / ``Dept_*`` columns. Returns a NumPy array ordered
        like ``X``; rows that never receive a prediction are reported as 0.
        """
        # (store, dept, week) -> prediction already produced during this call.
        pred_lookup = {}
        weekly_preds = pd.Series(index=X.index, dtype=float)

        for week in tqdm(X["week"].sort_values().unique(), desc="Recursive prediction"):
            week_rows = X[X["week"] == week].copy()

            # Decoded once per row and reused below when the lookup is updated,
            # so the one-hot columns are not scanned a second time.
            series_keys = []

            for idx, row in week_rows.iterrows():
                store = self.extract_onehot_value(row, "Store_")
                dept = self.extract_onehot_value(row, "Dept_")
                series_keys.append((store, dept))

                # Prefer a prediction made earlier in this call; otherwise keep
                # the lag value that came with the input row.
                lag_1 = pred_lookup.get((store, dept, week - 1), row["lag_1"])
                lag_52 = pred_lookup.get((store, dept, week - 52), row["lag_52"])

                week_rows.at[idx, "lag_1"] = lag_1
                week_rows.at[idx, "lag_52"] = lag_52

            # Predict all rows of this week in one batch.
            y_preds = self.pred_f(week_rows)

            # Assign predictions back to their original rows.
            weekly_preds[week_rows.index] = y_preds

            # Make this week's predictions available as lags for later weeks.
            for (store, dept), pred in zip(series_keys, y_preds):
                pred_lookup[(store, dept, week)] = pred

        return weekly_preds.fillna(0).to_numpy()

    def pred_f(self, X):
        """Single-pass prediction: base prediction plus predicted residual."""
        return self.base_model_.predict(X) + self.residual_model_.predict(X)

class ARIMARegressor(BaseEstimator, RegressorMixin):
    """Fit one ARIMA model per (store, dept) series taken from a MultiIndex frame.

    Parameters
    ----------
    order : tuple
        ARIMA ``order`` passed to every per-series model.
    store_level, dept_level : str
        Names of the index levels that identify a series.

    Series whose ARIMA fit raises fall back to the mean of their training
    target at prediction time.
    """

    def __init__(self, order=(1, 0, 0), store_level='Store', dept_level='Dept'):
        self.order = order
        self.store_level = store_level
        self.dept_level = dept_level

    def fit(self, X, y):
        """Fit an ARIMA on the target of each (store, dept) group.

        Only the index of ``X`` and the values of ``y`` are used; the feature
        columns are not passed to the models.

        Raises
        ------
        ValueError
            If ``X`` is not indexed by a ``pandas.MultiIndex``.
        """
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must have a MultiIndex")

        self.models_ = {}
        self.avgs_ = {}

        df = X.copy()
        df["target"] = y.values

        grouped = df.groupby(level=[self.store_level, self.dept_level])

        for (store, dept), group_df in grouped:
            # Lightweight progress output: one line each time dept 1 of a store is reached.
            if dept == 1:
                print("store: ", store)

            ts = group_df["target"].copy()

            try:
                model = ARIMA(endog=ts, order=self.order).fit()
                self.models_[(store, dept)] = model
            except Exception as e:
                # Deliberate best-effort: report the failure and fall back to the series mean.
                print(f"Skipping ({store}, {dept}) due to error: {e}")
                self.skip_(ts, store, dept)

        return self

    def skip_(self, ts, store, dept):
        """Record the fallback value for a series that could not be fitted.

        Stores the mean of ``ts``, or 0.0 when ``ts`` is ``None`` or empty.
        """
        if ts is None or len(ts) == 0:
            self.avgs_[(store, dept)] = 0.0
        else:
            self.avgs_[(store, dept)] = ts.mean()

    def predict(self, X):
        """Forecast each (store, dept) group of ``X`` and return a NumPy array ordered like ``X``.

        For a group with a fitted model, ``len(group)`` steps are forecast and
        assigned to the group's rows in their existing order. Groups without a
        model get the stored training mean, or 0 if none was recorded.
        NOTE(review): forecasts are positional, so this presumably expects each
        group's rows to be the consecutive periods right after its training
        data, in chronological order — confirm against the caller.

        Raises
        ------
        ValueError
            If ``X`` is not indexed by a ``pandas.MultiIndex``.
        """
        if not isinstance(X.index, pd.MultiIndex):
            raise ValueError("X must have a MultiIndex")

        preds = pd.Series(index=X.index, dtype=float)

        # Group X by store-dept pair (based on index levels)
        grouped = X.groupby(level=[self.store_level, self.dept_level])

        for (store, dept), group_df in grouped:
            if dept == 1:
                print("store: ", store)
            model = self.models_.get((store, dept))
            if model is None:
                preds.loc[group_df.index] = self.avgs_.get((store, dept), 0)
                continue

            # Forecast N steps = number of rows in this group
            forecast = model.forecast(steps=len(group_df))
            preds.loc[group_df.index] = forecast.to_numpy()

        return preds.to_numpy()

class DLinearRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style wrapper around a NeuralForecast ``DLinear`` model.

    Parameters
    ----------
    input_chunk_length : int
        Passed to ``DLinear`` as ``input_size``.
    output_chunk_length : int
        Passed to ``DLinear`` as ``h``.
    epochs : int
        Scales the training budget: ``max_steps`` is ``epochs * 104``.
    batch_size : int
        Passed to ``DLinear`` as ``batch_size``.
    """

    def __init__(self, input_chunk_length=52, output_chunk_length=39, epochs=10, batch_size=32):
        self.input_chunk_length = input_chunk_length
        self.output_chunk_length = output_chunk_length
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        """Reshape ``(X, y)`` into ``unique_id`` / ``ds`` / ``y`` rows and train the model.

        ``unique_id`` is ``"<Store>_<Dept>"`` and ``ds`` is the ``Date`` value,
        all taken from the frame after its index has been reset.

        Raises
        ------
        ValueError
            If ``X`` is not indexed by a ``pandas.MultiIndex``.
        """
        frame = X.copy()
        frame["y"] = y.values

        if not isinstance(frame.index, pd.MultiIndex):
            raise ValueError("X must have a MultiIndex")

        long_frame = frame.reset_index().rename(columns={"Date": "ds"})
        long_frame["unique_id"] = (
            long_frame["Store"].astype(str) + "_" + long_frame["Dept"].astype(str)
        )

        # Keep only the three columns handed to NeuralForecast.
        self.train_df_ = long_frame[["unique_id", "ds", "y"]].copy()

        dlinear = DLinear(
            input_size=self.input_chunk_length,
            h=self.output_chunk_length,
            max_steps=self.epochs * 104,
            batch_size=self.batch_size,
            random_seed=42
        )

        self.nf_ = NeuralForecast(models=[dlinear], freq="W-FRI")
        self.nf_.fit(df=self.train_df_)
        return self

    def predict(self, X):
        """Return the model's forecast for each row of ``X`` as a NumPy array.

        The forecast produced by the fitted NeuralForecast object is left-joined
        onto the rows of ``X`` by ``unique_id`` and ``ds``; rows without a
        matching forecast are reported as 0.
        """
        query = X.reset_index().rename(columns={"Date": "ds"})
        query["unique_id"] = query["Store"].astype(str) + "_" + query["Dept"].astype(str)

        forecast = self.nf_.predict().rename(columns={"DLinear": "y_hat"})

        joined = query.merge(forecast, on=["unique_id", "ds"], how="left")
        y_hat = joined["y_hat"].fillna(0).values

        # Building the Series against X's index keeps the original length check.
        return pd.Series(data=y_hat, index=X.index).to_numpy()


In [11]:
# Download the serialized model file from the MLflow artifact store; the call returns
# the local path of the downloaded file.
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri="mlflow-artifacts:/e83ef275cec24f3193fe6da7bfedd8b9/d7b6439d6da64738a3b6508250ca6594/artifacts/model.pkl"
)

print("Downloaded file size:", os.path.getsize(local_path) / (1024 ** 2), "MB")  # bytes -> MiB

# SECURITY NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts produced by a trusted run.
# NOTE(review): the pickle presumably references the custom classes defined earlier in
# this notebook, so that cell must have been executed first — confirm.
model = joblib.load(local_path)

print("Done!")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded file size: 16.14536762237549 MB
Done!


In [12]:
# load and add lag features

# Raw competition inputs; the Date columns are parsed as datetimes on read.
test = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip", parse_dates=["Date"])
features = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip", parse_dates=["Date"])
stores = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv")

# Left joins keep every test row: weekly features are matched on
# (Store, Date, IsHoliday) and store metadata on Store.
df = (
    test
    .merge(features, on=["Store", "Date", "IsHoliday"], how="left")
    .merge(stores, on="Store", how="left")
)

def add_lag_features(df, train_path="/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip"):
    """Attach ``lag_1`` and ``lag_52`` sales lags to the rows of ``df``.

    The training sales history is read from ``train_path`` and stacked with
    ``df``. Within each (Store, Dept) series sorted by Date, ``Weekly_Sales``
    is shifted by 1 and 52 rows to form the lag columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain ``Store``, ``Dept`` and ``Date``.
    train_path : str
        CSV (optionally compressed) with ``Store``, ``Dept``, ``Date`` and
        ``Weekly_Sales`` columns. Defaults to the Kaggle competition file.

    Returns
    -------
    pandas.DataFrame
        Only the rows that came from ``df`` (original index labels kept,
        ordered by Store, Dept, Date) with ``Weekly_Sales``, ``lag_1`` and
        ``lag_52`` columns added. ``df`` itself is not modified.

    Notes
    -----
    The shifts are positional, so a lag only lines up with the calendar when
    the series has no missing weeks. Rows of ``df`` have no sales value, so
    ``lag_1`` is only filled for the first row after the training history.
    """
    train = pd.read_csv(train_path, parse_dates=["Date"])
    train = train[["Store", "Dept", "Date", "Weekly_Sales"]]

    # Tag the origin of every row explicitly. Selecting on NaN Weekly_Sales
    # would also return training rows whose sales value is missing.
    full = pd.concat(
        [train.assign(_is_test=False), df.assign(_is_test=True)],
        axis=0,
    )
    # One sort of the stacked frame is enough; pre-sorting train adds nothing.
    full = full.sort_values(["Store", "Dept", "Date"])

    sales_per_series = full.groupby(["Store", "Dept"])["Weekly_Sales"]
    full["lag_1"] = sales_per_series.shift(1)
    full["lag_52"] = sales_per_series.shift(52)

    res = full[full["_is_test"]].drop(columns=["_is_test"]).copy()

    return res

# Replace df with the frame returned by add_lag_features (lag_1 / lag_52 columns added).
df = add_lag_features(df)

# Remove the target column if it is present; errors="ignore" makes this a no-op otherwise.
X_test = df.drop(columns=["Weekly_Sales"], errors="ignore")

print("Done!")

Done!


In [13]:
# Predict weekly sales for every prepared test row.
preds = model.predict(X_test)

# Predictions are paired with test rows by position, so the lengths must agree.
if len(preds) != len(X_test):
    raise ValueError(f"Expected {len(X_test)} predictions, got {len(preds)}")

submission = pd.DataFrame()
submission["Weekly_Sales"] = preds
# Id format: "<Store>_<Dept>_<YYYY-MM-DD>".
# Converted to a plain array so it is paired with `preds` by position. Assigning the
# Series directly would align on X_test's index labels and could silently attach an Id
# to the wrong prediction (or produce NaN) whenever that index is not 0..n-1 in order.
submission["Id"] = (
    X_test["Store"].astype(str)
    + "_"
    + X_test["Dept"].astype(str)
    + "_"
    + X_test["Date"].dt.strftime("%Y-%m-%d")
).to_numpy()
submission.to_csv("submission.csv", index=False)
print("✅ Submission saved as submission.csv")

2025-07-04 18:44:04.899042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751654645.123840      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751654645.191016      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Predicting: |          | 0/? [00:00<?, ?it/s]

✅ Submission saved as submission.csv
