In [None]:
!pip install dagshub mlflow --quiet

import warnings
warnings.filterwarnings("ignore")

print("Done!")

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [4]:
import mlflow.sklearn
from datetime import datetime
import joblib
import dagshub
import mlflow
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

dagshub.init(repo_owner='gnada22', repo_name='ml_final_project', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=835863de-8c6d-4630-a82f-c7dbad0221e1&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=e204c812341e48cca0d5e56a448298fa4b1078bb32d3be15c78ae2c9fd400ac4




Output()

In [5]:
class DateFeatureCreator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["weekofyear"] = X["Date"].dt.isocalendar().week.astype(int)
        X["sin_13"] = np.sin(2 * np.pi * X["weekofyear"] / 13)
        X["cos_13"] = np.cos(2 * np.pi * X["weekofyear"] / 13)
        X["sin_23"] = np.sin(2 * np.pi * X["weekofyear"] / 23)
        X["cos_23"] = np.cos(2 * np.pi * X["weekofyear"] / 23)
        X = X.drop(columns=["Date"])
        return X

local_path = mlflow.artifacts.download_artifacts(
    artifact_uri="mlflow-artifacts:/bdaee488c065473c96d054e5ae43539a/abb40eeefeb54f7a87763486cb01f016/artifacts/model.pkl"
)

model = joblib.load(local_path)

print("Done!")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Done!


In [6]:
# load and add lag features

test = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip", parse_dates=["Date"])
features = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip", parse_dates=["Date"])
stores = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv")

df = test.merge(features, on=["Store", "Date", "IsHoliday"], how="left")
df = df.merge(stores, on="Store", how="left")

def add_lag_features(df):
    train = pd.read_csv("/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip", parse_dates=["Date"])
    train = train[["Store", "Dept", "Date", "Weekly_Sales"]]
    train = train.sort_values(["Store", "Dept", "Date"])
    
    full = pd.concat([train, df], axis=0)
    full = full.sort_values(["Store", "Dept", "Date"])
    
    full["lag_1"] = full.groupby(["Store", "Dept"])["Weekly_Sales"].shift(1)
    full["lag_52"] = full.groupby(["Store", "Dept"])["Weekly_Sales"].shift(52)

    res = full[full["Weekly_Sales"].isna()].copy() # just test rows
    # res.dropna(subset=["lag_1", "lag_52"], inplace=True)  # drop rows with missing lags

    return res

df = add_lag_features(df)

X_test = df.drop(columns=["Weekly_Sales"], errors="ignore")

In [13]:
# preds = model.predict(X_test)

X_test["Weekly_Sales_Predicted"] = np.nan

X_test = X_test.sort_values(["Store", "Dept", "Date"]).reset_index(drop=True)
X_test["orig_index"] = X_test.index

# Group
grouped = X_test.groupby(["Store", "Dept"])

for (store, dept), group in tqdm(grouped, desc="Recursive prediction"):
    group = group.copy()  # no reset_index()

    for i in range(len(group)):
        row = group.iloc[i]

        lag_1 = group.iloc[i - 1]["Weekly_Sales_Predicted"] if i - 1 >= 0 else row["lag_1"]
        lag_52 = group.iloc[i - 52]["Weekly_Sales_Predicted"] if i - 52 >= 0 else row["lag_52"]

        input_row = row.copy()
        input_row["lag_1"] = lag_1
        input_row["lag_52"] = lag_52

        X_input = input_row.to_frame().T
        X_input["Date"] = pd.to_datetime(X_input["Date"])

        y_pred = model.predict(X_input)[0]
        group.iloc[i, group.columns.get_loc("Weekly_Sales_Predicted")] = y_pred

    # Write back using orig_index
    X_test.loc[group["orig_index"], "Weekly_Sales_Predicted"] = group["Weekly_Sales_Predicted"]
    # print(X_test.loc[group["orig_index"], "Weekly_Sales_Predicted"])

df["Weekly_Sales"] = X_test["Weekly_Sales_Predicted"]

df["Id"] = df["Store"].astype(str) + "_" + df["Dept"].astype(str) + "_" + df["Date"].dt.strftime("%Y-%m-%d")
submission = df[["Id", "Weekly_Sales"]]
submission.to_csv("submission.csv", index=False)
print("✅ Submission saved as submission.csv")

Recursive prediction: 100%|██████████| 3169/3169 [18:14<00:00,  2.89it/s]


✅ Submission saved as submission.csv
