In [57]:
# NOTE(review): every import shares one try block, so the first import that
# fails skips all imports listed after it, and the error is only printed, not
# raised. The recorded output of this cell ("No module named 'matplotlib'")
# means the names from seaborn onward were not bound by this cell.
try:
  import pandas as pd
  import numpy as np
  import os
  import joblib
  import lightgbm as lgb
  import matplotlib.pyplot as plt
  import seaborn as sns
  from lightgbm import early_stopping
  from sklearn.ensemble import RandomForestRegressor
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
  from tqdm import tqdm
except Exception as e:
  # Best-effort: report the failure and let the notebook continue.
  print(e)

No module named 'matplotlib'


In [58]:
# Load the raw training data; the path is relative to the kernel's working directory.
df = pd.read_csv("train (1).csv")

In [59]:
# Show the first row to confirm the column layout (date, store, item, sales).
df.head(1)

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13


In [60]:
# Pre-processing
def load_and_preprocess(df):
    """Return a cleaned, time-ordered copy of the raw sales frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'date', 'store', 'item' and 'sales' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'date' is parsed with ``pd.to_datetime``, missing
        'sales' values are replaced by 0, rows are sorted by store, item and
        date, and the index is reset to 0..n-1.

    Notes
    -----
    The input frame is not modified, so re-running the calling cell always
    starts from the same raw data.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    cleaned = df.assign(
        date=pd.to_datetime(df['date']),
        sales=df['sales'].fillna(0),
    )
    ordered = cleaned.sort_values(by=['store', 'item', 'date'])
    return ordered.reset_index(drop=True)

In [61]:
def add_date_features(df):
    """Return a copy of ``df`` extended with calendar features taken from 'date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column. If the column is not already a datetime
        dtype it is parsed with ``errors='coerce'``.

    Returns
    -------
    pandas.DataFrame
        New frame without the rows whose date is missing or unparseable, and
        with the added columns 'day', 'month', 'year', 'day_of_week',
        'weekend' (1 when day_of_week is 5 or 6, else 0) and 'quarter'.
        The surviving rows keep their original index labels.

    Notes
    -----
    The input frame is never written to, regardless of the dtype of 'date'.
    """
    dates = df['date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        # Invalid values become NaT here and are filtered out just below.
        dates = pd.to_datetime(dates, errors='coerce')

    # Work on an explicit copy so the column assignments below neither touch
    # the caller's frame nor trigger chained-assignment warnings.
    out = df.assign(date=dates)
    out = out.loc[out['date'].notna()].copy()

    stamp = out['date'].dt
    out['day'] = stamp.day
    out['month'] = stamp.month
    out['year'] = stamp.year
    out['day_of_week'] = stamp.dayofweek
    out['weekend'] = out['day_of_week'].isin([5, 6]).astype(int)
    out['quarter'] = stamp.quarter

    return out


In [62]:
# Feature Engineering
def add_sales_timeseries_metrics(df):
    """Return a copy of ``df`` with per-(store, item) lag and window features.

    Added columns
    -------------
    sales_1_day_ago, sales_7_days_ago
        'sales' shifted by 1 and 7 rows within each (store, item) series.
    rolling_14_day_sales, rolling_28_day_sales
        Trailing sums over the last 14 / 28 rows of the series, current row
        included; shorter windows at the start of a series are allowed
        (``min_periods=1``).
    mean_sales
        Expanding mean of 'sales' within the series up to and including the
        current row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'store', 'item' and 'sales' columns. Rows of each series
        are used in the order they appear, so the frame should already be in
        date order within every (store, item) pair.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    per_series = df.groupby(['store', 'item'])['sales']

    def _back_to_row_index(grouped_result):
        # groupby().rolling()/expanding() prepend the two group keys to the
        # index and return rows in group order. Dropping only those key levels
        # restores the original row labels, so assignment aligns by label
        # instead of by position.
        return grouped_result.reset_index(level=[0, 1], drop=True)

    return df.assign(
        sales_1_day_ago=per_series.shift(1),
        sales_7_days_ago=per_series.shift(7),
        rolling_14_day_sales=_back_to_row_index(
            per_series.rolling(window=14, min_periods=1).sum()
        ),
        rolling_28_day_sales=_back_to_row_index(
            per_series.rolling(window=28, min_periods=1).sum()
        ),
        mean_sales=_back_to_row_index(per_series.expanding().mean()),
    )



In [63]:
# Target variables
def _forward_window_sum(series, horizon):
    """For each position t, sum the ``horizon`` values at t+1 .. t+horizon."""
    # A trailing window on the reversed series is a leading window on the
    # original: after flipping back, position t holds sum(t .. t+horizon-1).
    leading = series.iloc[::-1].rolling(window=horizon, min_periods=horizon).sum().iloc[::-1]
    # Shift by one so the current row's own sales are excluded from its target.
    return leading.shift(-1)


def create_targets(df, horizons=(7, 30)):
    """Return a copy of ``df`` with forward-looking sales totals as targets.

    For every horizon ``h`` a column ``target_{h}_day_sales`` is added that
    holds, for each row, the sum of 'sales' over the next ``h`` rows of the
    same (store, item) series (the current row is excluded). With the default
    horizons this yields 'target_7_day_sales' and 'target_30_day_sales'.

    Rows that do not have ``h`` following rows in their series get NaN, so a
    target is never a partial total and never mixes values from another
    series. Callers are expected to drop those rows before training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'store', 'item' and 'sales' columns, in date order within
        each (store, item) series (one row per day is assumed for the column
        names to be literal).
    horizons : iterable of int, optional
        Window lengths, in rows, to build targets for. Defaults to (7, 30).

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    per_series = df.groupby(['store', 'item'])['sales']
    targets = {
        f'target_{h}_day_sales': per_series.transform(
            # Bind h as a default so each lambda keeps its own horizon.
            lambda s, h=h: _forward_window_sum(s, h)
        )
        for h in horizons
    }
    return df.assign(**targets)

In [64]:
def train_and_save_models_from_df(df, model_dir='models_30', target_col='target_30_day_sales', min_rows=30):
    """Fit one RandomForestRegressor per (store, item) series and save each to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sales frame with 'date', 'store', 'item' and 'sales' columns.
    model_dir : str, optional
        Directory the models are written to; created if it does not exist.
    target_col : str, optional
        Name of the target column produced by ``create_targets`` to train on.
    min_rows : int, optional
        Minimum number of complete rows (no missing feature or target) a
        series needs in order to be trained. Defaults to 30.

    Side effects
    ------------
    Writes ``{store}_{item}_{target_col}.pkl`` into ``model_dir`` with joblib
    for every trained series and prints one line per series.
    """
    os.makedirs(model_dir, exist_ok=True)

    # Parse dates first, then put every series in chronological order with a
    # clean 0..n-1 index. The lag, rolling and target columns are computed
    # from row order within each (store, item) group, so they are only
    # meaningful on a frame sorted this way.
    df = add_date_features(df)
    df = df.sort_values(by=['store', 'item', 'date']).reset_index(drop=True)
    df = add_sales_timeseries_metrics(df)
    df = create_targets(df)

    features = [
        'sales_1_day_ago', 'sales_7_days_ago', 'rolling_14_day_sales',
        'rolling_28_day_sales', 'mean_sales', 'day', 'month', 'year',
        'day_of_week', 'weekend', 'quarter'
    ]
    required_cols = features + [target_col]

    for (store, item), series_rows in df.groupby(['store', 'item']):
        # Early rows lack lag values and trailing rows may lack a target.
        usable = series_rows.dropna(subset=required_cols)
        if len(usable) < min_rows:
            print(f"Skipping Store {store}, Item {item} — not enough data")
            continue

        # Fixed seed keeps the saved models reproducible across runs.
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(usable[features], usable[target_col])

        model_path = os.path.join(model_dir, f"{store}_{item}_{target_col}.pkl")
        joblib.dump(model, model_path)
        print(f"✅ Model saved: {model_path}")

In [65]:
from sklearn.ensemble import RandomForestRegressor


In [66]:
# Train with the function defaults: target 'target_30_day_sales', models written to 'models_30'.
train_and_save_models_from_df(df)


✅ Model saved: models_30\1_1_target_30_day_sales.pkl
✅ Model saved: models_30\1_2_target_30_day_sales.pkl
✅ Model saved: models_30\1_3_target_30_day_sales.pkl
✅ Model saved: models_30\1_4_target_30_day_sales.pkl
✅ Model saved: models_30\1_5_target_30_day_sales.pkl
✅ Model saved: models_30\1_6_target_30_day_sales.pkl
✅ Model saved: models_30\1_7_target_30_day_sales.pkl
✅ Model saved: models_30\1_8_target_30_day_sales.pkl
✅ Model saved: models_30\1_9_target_30_day_sales.pkl
✅ Model saved: models_30\1_10_target_30_day_sales.pkl
✅ Model saved: models_30\1_11_target_30_day_sales.pkl
✅ Model saved: models_30\1_12_target_30_day_sales.pkl
✅ Model saved: models_30\1_13_target_30_day_sales.pkl
✅ Model saved: models_30\1_14_target_30_day_sales.pkl
✅ Model saved: models_30\1_15_target_30_day_sales.pkl
✅ Model saved: models_30\1_16_target_30_day_sales.pkl
✅ Model saved: models_30\1_17_target_30_day_sales.pkl
✅ Model saved: models_30\1_18_target_30_day_sales.pkl
✅ Model saved: models_30\1_19_target_