<a href="https://colab.research.google.com/github/Edenshmuel/PapaJohns_Data_Science_Project/blob/main/TFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Flush and unmount Google Drive before anything else runs; Drive is mounted
# again in a later cell with drive.mount('/content/drive').
from google.colab import drive
drive.flush_and_unmount()

In [None]:
# ⬇️ 1. Install & import libraries
!pip install -q pytorch-lightning torchmetrics pytorch-forecasting

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/823.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m819.2/823.0 kB[0m [31m39.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m961.5/961.5 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.7/197.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m818.9/818.9 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from pytorch_forecasting import (
    TimeSeriesDataSet, TemporalFusionTransformer, Baseline,
    QuantileLoss)

In [None]:
# Mount Google Drive at /content/drive; every CSV below is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load cleaned_data.csv from the mounted Drive project folder into df.
df = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/cleaned_data.csv')

In [None]:
# Load the two lookup tables from the same Drive folder. They are passed to
# build_tft_datasets_from_raw further down, which merges them into the data.
desc_map = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/desc_encoding_map.csv')
cat_map = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/category_mapping.csv')

In [None]:
# Parse the Date column into pandas datetimes (it is used for sorting below).
df["Date"] = pd.to_datetime(df["Date"])

In [None]:
# Rename the Hebrew-named column "כמות" to "quantity"; "quantity" is the
# forecasting target in the dataset definition below.
df = df.rename(columns={"כמות": "quantity"})

In [None]:
# ⬇️ 3. Build a numeric time index (required by PyTorch‑Forecasting)
# Order the rows by item and then by date, and number each item's rows
# 0, 1, 2, ... in that order.
# NOTE(review): the index counts rows per item, not calendar days, so gaps in
# Date are not reflected in time_idx — confirm this is intended.
df = df.sort_values(by=["clean_desc_encoded", "Date"])
df["time_idx"] = df.groupby("clean_desc_encoded").cumcount()

In [None]:
# ⬇️ 4. Define TFT parameters
# Both lengths are measured in time_idx steps. A step equals one day only if
# every item has exactly one row per day — TODO confirm.
max_encoder_length     = 60      # how many past steps the model sees
max_prediction_length  = 30      # forecast horizon in steps (tweak as you like)

In [None]:
# Last time_idx that belongs to the training data: the largest time_idx in df
# minus the prediction horizon.
# NOTE(review): time_idx is numbered separately per item, while this cutoff is
# taken from the global maximum; items with fewer rows than the longest item
# have fewer (or no) rows after the cutoff — verify this holdout is intended.
training_cutoff = df["time_idx"].max() - max_prediction_length

In [None]:
# ⬇️ 5. Tell the dataset which columns play which role
# Categorical columns passed to the dataset as static categoricals.
categorical_static = [
    "clean_desc_encoded",
    "category_encoded",
    "encoded_portion_type",
]

# Categorical columns passed to the dataset as time-varying known categoricals.
categorical_time = [
    "Is_Weekend",
    "Season",
    "is_christian_holiday",
    "is_jewish_holiday",
    "is_near_jewish_holiday",
    "is_day_before_new_year",
    "encoded_jewish_holiday",
    "encoded_christian_holiday",
]

In [None]:
# Cast every categorical column to string and then to the pandas "category"
# dtype so the model treats the values as categories.
for col in categorical_static + categorical_time:
    df[col] = df[col].astype(str).astype("category")

In [None]:
# All the real‑valued, time‑varying features, grouped by kind.
real_time = [
    # calendar parts
    "Year",
    "Month",
    "Day",
    "WeekOfYear",
    # sin/cos encodings
    "Day_Name_sin",
    "Day_Name_cos",
    "Month_sin",
    "Month_cos",
    # per-item statistics columns
    "avg_quantity_all_time",
    "std_quantity_all_time",
    "num_days_sold",
    "popularity_score",
]

In [None]:
# ⬇️ 6. Wrap everything in a TimeSeriesDataSet
# Training set: only rows up to and including training_cutoff, one series per
# clean_desc_encoded value, with "quantity" as the target.
training = TimeSeriesDataSet(
    df[df.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="quantity",
    group_ids=["clean_desc_encoded"],

    # window sizes defined in step 4
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,

    # column roles defined in step 5 and in the real_time list
    static_categoricals=categorical_static,
    time_varying_known_categoricals=categorical_time,
    time_varying_known_reals=real_time,
    time_varying_unknown_reals=["quantity"],  # the target itself
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,)

# Validation set: same definition as `training`, built from the full frame and
# restricted to predictions that start after the training cutoff.
validation = TimeSeriesDataSet.from_dataset(
    training, df, min_prediction_idx=training_cutoff+1)

In [None]:
# ⬇️ 7. Dataloaders
# One loader per dataset: train=True for training, train=False for validation.
# Both use the same batch size and num_workers=2.
batch_size = 128
train_loader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
val_loader   = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)

In [None]:
# ⬇️ 8. Define the model
# Create the Temporal Fusion Transformer from the training dataset definition
# with the hyperparameters below; QuantileLoss() is used with its defaults.
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate      = 1e-3,
    hidden_size        = 32,
    attention_head_size= 4,
    dropout            = 0.1,
    loss               = QuantileLoss(),
    log_interval       = 10,
    reduce_on_plateau_patience = 4,)

In [None]:
import lightning.pytorch as pl

from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import LearningRateMonitor

In [None]:
# ⬇️ 9. Train
# Early stopping watches "val_loss" (lower is better) with a patience of 5.
early_stopping = pl.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")

# At most 30 epochs; the accelerator is chosen automatically.
trainer = pl.Trainer(
    max_epochs=30,
    accelerator="auto",
    callbacks=[early_stopping],
)

trainer.fit(tft, train_dataloaders=train_loader, val_dataloaders=val_loader)

In [None]:
# ⬇️ 10. Forecast the validation horizon for all items
# Request the raw network output together with the model inputs (x) and the
# index frame. return_index=True is required for `index` to be returned.
prediction_result = tft.predict(
    val_loader, mode="raw", return_x=True, return_index=True)

if hasattr(prediction_result, "output"):
    # Recent pytorch-forecasting returns one object with named fields; it has
    # more than three fields, so positional unpacking into three names fails.
    raw_predictions = prediction_result.output
    x = prediction_result.x
    index = prediction_result.index
else:
    # Older releases return a plain (output, x, index) sequence.
    raw_predictions, x, index = prediction_result

In [None]:
# ⬇️ 11. Evaluate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Actual targets of the decoder window, flattened to one value per
# (sample, horizon step). Moved to CPU first so this also works after GPU runs.
y_true = x["decoder_target"].detach().cpu().numpy().reshape(-1)

# The raw output carries one value per quantile. Let the loss reduce it to a
# point forecast so predictions line up one-to-one with the targets.
point_forecast = tft.loss.to_prediction(raw_predictions["prediction"])
y_pred = point_forecast.detach().cpu().numpy().reshape(-1)

mae  = mean_absolute_error(y_true, y_pred)
# Take the square root explicitly: the `squared=False` argument is not
# available in current scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2   = r2_score(y_true, y_pred)

print(f"MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.3f}")

In [None]:
# ⬇️ 12. Plot an example prediction
import matplotlib.pyplot as plt

# plot_prediction's `idx` selects a sample by its position in the predicted
# batch, so draw a random position in range instead of an item code from df.
n_samples = len(x["decoder_target"])
sample_idx = int(np.random.randint(n_samples))
tft.plot_prediction(x, raw_predictions, idx=sample_idx)
plt.show()

In [None]:
# Fresh copy of the original data for the second experiment.
# No `cleaned_data` variable is defined before this cell, so the frame is
# re-read from the same CSV that was loaded at the top of the notebook. This
# also discards the renames and dtype casts applied to `df` above.
df = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/cleaned_data.csv')

In [None]:
from pytorch_forecasting import TimeSeriesDataSet
import pandas as pd

def build_tft_datasets_from_raw(
    df,
    desc_map,
    cat_map,
    target_col="כמות",
    time_idx_col="order",
    group_col="clean_desc_str",
    date_col="Date",
    cutoff_date="2024-01-01",
    encoder_length=30,
    prediction_length=7,
    drop_desc_codes=(54,),
):
    """Build training and validation TimeSeriesDataSet objects from the raw frame.

    The item and category lookup tables are merged into ``df``, label columns
    are cleaned, rows are split by ``cutoff_date`` and a training dataset is
    built from the earlier part. Validation rows whose categorical values the
    training encoders have never seen are removed before the validation
    dataset is derived from the training one.

    Args:
        df: Raw data. Must contain ``clean_desc_encoded``, ``category_encoded``,
            ``portion_type``, ``date_col``, ``target_col``, ``time_idx_col``
            and the feature columns listed in the body.
        desc_map: Item lookup with columns ``code`` and ``Unnamed: 0``.
        cat_map: Category lookup with columns ``קוד`` and ``קטגוריה``.
        target_col: Name of the target column.
        time_idx_col: Column copied into ``time_idx``.
        group_col: Column identifying one time series.
        date_col: Date column used for the train/validation split.
        cutoff_date: Rows dated before it go to training, the rest to validation.
        encoder_length: ``max_encoder_length`` of the datasets.
        prediction_length: ``max_prediction_length`` of the datasets.
        drop_desc_codes: ``clean_desc_encoded`` values removed before the split.
            Compared as strings so numeric and string-typed codes both match.

    Returns:
        Tuple ``(training, validation)`` of TimeSeriesDataSet objects.
    """
    # Step 1 - merge the lookup tables so items and categories get readable names
    desc_map = desc_map.rename(columns={
        "code": "clean_desc_encoded",
        "Unnamed: 0": "clean_desc_str",
    })
    cat_map = cat_map.rename(columns={
        "קוד": "category_encoded",
        "קטגוריה": "category_str",
    })
    df = df.merge(desc_map, on="clean_desc_encoded", how="left")
    df = df.merge(cat_map, on="category_encoded", how="left")

    # Step 2 - cleaning and type conversion
    df[date_col] = pd.to_datetime(df[date_col])
    for label_col in ("clean_desc_str", "category_str", "portion_type"):
        labels = df[label_col]
        # Decide what is missing on the ORIGINAL values: after astype(str) a
        # NaN is the literal string "nan" and fillna has nothing left to fill.
        df[label_col] = labels.astype(str).where(labels.notna(), "Unknown")
    df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
    df["time_idx"] = df[time_idx_col]

    # Remove excluded item codes once, before the split, so neither the
    # training nor the validation frame contains them.
    excluded_codes = {str(code) for code in (drop_desc_codes or ())}
    if excluded_codes:
        is_excluded = df["clean_desc_encoded"].astype(str).isin(excluded_codes)
        df = df[~is_excluded].copy()

    # Step 3 - split by date
    train_df = df[df[date_col] < cutoff_date].copy()
    val_df   = df[df[date_col] >= cutoff_date].copy()

    # Step 4 - column roles and the training TimeSeriesDataSet
    known_reals = [
        "time_idx", "Month", "Day", "Day_Name", "Is_Weekend",
        "is_christian_holiday", "is_jewish_holiday", "is_near_jewish_holiday",
        "is_day_before_new_year", "Season", "is_start_of_month", "is_end_of_month",
        "Day_Name_sin", "Day_Name_cos", "Month_sin", "Month_cos",
    ]
    known_categoricals  = ["portion_type"]
    static_categoricals = [group_col, "category_str"]
    observed_reals = [
        "avg_quantity_all_time", "std_quantity_all_time",
        "popularity_score", "num_days_sold",
    ]
    # The target goes first and appears exactly once in the unknown reals.
    unknown_reals = list(dict.fromkeys([target_col] + observed_reals))

    training = TimeSeriesDataSet(
        train_df,
        time_idx="time_idx",
        target=target_col,
        group_ids=[group_col],
        max_encoder_length=encoder_length,
        max_prediction_length=prediction_length,
        static_categoricals=static_categoricals,
        time_varying_known_reals=known_reals,
        time_varying_known_categoricals=known_categoricals,
        time_varying_unknown_reals=unknown_reals,
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True,
    )

    # Step 5 - report and drop validation rows holding categories that the
    # training encoders do not know. Encoders whose key is not a column of
    # val_df are skipped.
    categorical_encoders = training.get_parameters()["categorical_encoders"]
    before = len(val_df)
    keep_row = pd.Series(True, index=val_df.index)
    for col, encoder in categorical_encoders.items():
        if col not in val_df.columns:
            continue
        known = set(encoder.classes_)
        unknown = set(val_df[col].unique()) - known
        if unknown:
            print(f"⚠️ עמודה '{col}' מכילה ערכים לא מוכרים: {unknown}")
        keep_row &= val_df[col].isin(known)
    val_df = val_df[keep_row]
    after = len(val_df)
    print(f"סוננו {before - after} שורות מ-val_df עם ערכים לא חוקיים בעמודות שקודדו.")

    # Step 6 - derive the validation dataset from the training definition
    validation = TimeSeriesDataSet.from_dataset(training, val_df)

    return training, validation

In [None]:
# Build the datasets for the second experiment: rows before 2024-01-01 train,
# the rest validate; 30-step encoder window and 7-step prediction horizon.
training, validation = build_tft_datasets_from_raw(
    df,
    desc_map,
    cat_map,
    cutoff_date="2024-01-01",
    encoder_length=30,
    prediction_length=7)



⚠️ עמודה 'clean_desc_str' מכילה ערכים לא מוכרים: {'נקניקייה טבעונית', 'פיצה l קלאסית תוספת צמחונית משלוח po', '2 משפחתיות קלאסיות', '3 פיצות ללא גלוטן משפחתיות', '6 גבינות 14 דקה', 'עם אורגנו', 'קלפי ליגת האלופות', 'גבינת עיזים', '3 תוספות חינם', 'עם תבלין איטלקי', '6 גבינות 14', 'רולס פירות יער שמנת', 'עם רוטב פיסטוק', 'קרם שוקולד נוגט', 'רוטבים', 'רולס בייגלה שמנת ועוגיות', 'גרליק בייטס 24 יחידות', 'מונסטר אולטרה', 'פיצה מיוחדת ממהדורת חורף פיצה קלאסית', 'רולס מרשמלו שמנת ועוגיות', 'אצבעות פסטו קרונפלקס', 'קורנפלקס דליס', '30 קינוח', 'הבלקנית 14', 'עם רוטב בייגלה', 'רולס פצפוצי רושה ופיסטוק שמנת', 'עם רוטב מרשמלו', '5 גבינות', 'פיצה מיוחדת ממהדורת חורף 2 פחיות או מנת נלוות', '8 טבעות גאודה מצופות', 'קלאסית 12', 'ציפס', 'מונסטר מנגו לוקו', 'אצבעות גבינה פסח', 'עם רוטב פירות יער', 'קרונפלקס פטריות 14', 'פיצה l קלאסית מנה נלוות קינוח', '2 יח עוגיות מדלן שוקולדציפס', 'פיצה משפחתית קלאסית באיסוף בנק מזרחי', 'משפחתית קלאסית 2 פחיות', 'שקית לקוח', 'פיצה משפחתית מנה נלווית שתיה גדולה'}
סוננו

KeyError: "Unknown category '54' encountered. Set `add_nan=True` to allow unknown categories"

In [None]:
# debug: check exactly where 54 slips through
# NOTE(review): in the visible notebook `val_df` is assigned only inside
# build_tft_datasets_from_raw, so this cell needs a notebook-level `val_df`
# from somewhere else — confirm where it comes from before relying on it.
categorical_encoders = training.get_parameters()["categorical_encoders"]
for col, encoder in categorical_encoders.items():
    # only if this column actually exists in val_df
    if col in val_df.columns:
        val_values = set(val_df[col].unique())
        known = set(encoder.classes_)
        unknown = val_values - known
        if unknown:
            print(f"⚠️ עמודה '{col}' מכילה ערכים לא מוכרים: {unknown}")
