In [23]:
import numpy as np
import pandas as pd
import random

# ==================================================
# Config
# ==================================================
np.random.seed(42)
random.seed(42)

products = [
    "Air Mineral Galon", "Air Mineral Botol", "Air Mineral Gelas",
    "Beras Premium", "Beras Medium", "Beras Curah",
    "Minyak Goreng 1L", "Minyak Goreng 2L",
    "Gula Pasir", "Gula Merah",
    "Tepung Terigu", "Tepung Sagu",
    "Telur Ayam", "Mie Goreng", "Mie Kuah",
    "Kopi Kapal Api", "Susu Kental", "Teh Celup",
    "Sabun Cuci", "Deterjen",
    "Gas LPG 3kg", "Gas LPG 12kg",
    "Rokok Kretek", "Rokok Filter"
]

# --------------------------------------------------
def seasonal_weekly_factor(day):
    # weekend effect
    if day.weekday() >= 5:
        return 1.10
    return 1.00

# --------------------------------------------------
def holiday_factor(day):
    # Christmas
    if day.month == 12 and day.day > 15:
        return 1.15
    return 1.0

# --------------------------------------------------
def ramadhan_factor(day):
    # approximate – realistic Indonesian Ramadan 2025: March 1 – March 31
    if day.month == 3:
        return 1.25
    return 1.0

# --------------------------------------------------
def inflation_curve(t):
    # slow inflation
    return 1 + 0.005 * t

# --------------------------------------------------
def random_shock():
    # occasionally big event
    if random.random() < 0.03:
        return 1.35
    return 1.0

# --------------------------------------------------
def get_base(product_id):
    # base level per product roughly realistic
    base = {
        1: 40,   2: 35,   3: 30,    # water
        4: 32,   5: 34,   6: 20,    # rice
        7: 25,   8: 19,             # cooking oil
        9: 18,   10: 11,            # sugar
        11: 12,  12: 10,            # flour
        13: 20,                     # eggs
        14: 16,  15: 14,            # instant noodles
        16: 10,  17: 14,  18: 12,   # coffee tea dairy
        19: 9,   20: 9,             # soap detergent
        21: 13,  22: 6,             # LPG
        23: 38, 24: 35              # tobacco ≈ water
    }
    return base[product_id]

# --------------------------------------------------
def generate_product(product_id, product_name):
    dates = pd.date_range("2024-01-01", periods=365, freq="D")

    data = []
    for i, d in enumerate(dates):

        # baseline
        base = get_base(product_id)

        # trend
        value = base * inflation_curve(i)

        # weekly cycle
        value *= seasonal_weekly_factor(d)

        # season
        value *= holiday_factor(d)
        value *= ramadhan_factor(d)

        # event shock
        value *= random_shock()

        # product–specific noise (higher for bulk items)
        noise = np.random.normal(1, 0.08)
        value *= noise

        data.append([
            product_id,
            d.strftime("%Y-%m-%d"),
            round(max(0, value), 2)
        ])

    df = pd.DataFrame(data, columns=["product_id", "date", "sales"])
    return df

# ==================================================
# generate full dataset
# ==================================================
all_df = []
for i, name in enumerate(products):
    pid = i + 1
    df = generate_product(pid, name)
    all_df.append(df)

full = pd.concat(all_df, ignore_index=True)

# save
full.to_csv("sales.csv", index=False)
print("done,  sales.csv written! it's up to a damn year")


done,  sales.csv written! it's up to a damn year


In [24]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

# ======================
# load data
# ======================
df = pd.read_csv("sales.csv", parse_dates=["date"])

# where model files will be saved
MODEL_PATH = "./models/"
import os
os.makedirs(MODEL_PATH, exist_ok=True)

# ======================
# training per product
# ======================

products = df["product_id"].unique()

for p in products:
    print(f"---- training product {p} ----")

    # filter that product
    df_p = df[df.product_id == p].sort_values("date")
    values = df_p["sales"].values
    
    # ---------
    # sliding window (lag 7 → target next day)
    # ---------
    X, y = [], []
    window = 7
    
    for i in range(len(values) - window):
        X.append(values[i:i+window])         # last 7 days
        y.append(values[i+window])           # next day target

    X = np.array(X)
    y = np.array(y)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # train model
    model = RandomForestRegressor(
        n_estimators=200,
        random_state=42
    )
    model.fit(X_train, y_train)

    # simple evaluation
    score = model.score(X_test, y_test)
    print("r2 score:", score)

    # save per product
    filename = f"{MODEL_PATH}model_{p}.pkl"
    joblib.dump(model, filename)

    print(f"model saved: {filename}\n")

print("=== done ===")


---- training product 1 ----
r2 score: 0.8526074374493747
model saved: ./models/model_1.pkl

---- training product 2 ----
r2 score: 0.7471562930694509
model saved: ./models/model_2.pkl

---- training product 3 ----
r2 score: 0.8447817174911947
model saved: ./models/model_3.pkl

---- training product 4 ----
r2 score: 0.8358687596551794
model saved: ./models/model_4.pkl

---- training product 5 ----
r2 score: 0.84216020261927
model saved: ./models/model_5.pkl

---- training product 6 ----
r2 score: 0.7777731571287749
model saved: ./models/model_6.pkl

---- training product 7 ----
r2 score: 0.8161262302780062
model saved: ./models/model_7.pkl

---- training product 8 ----
r2 score: 0.8545094622782895
model saved: ./models/model_8.pkl

---- training product 9 ----
r2 score: 0.7905442661245958
model saved: ./models/model_9.pkl

---- training product 10 ----
r2 score: 0.7982097210227859
model saved: ./models/model_10.pkl

---- training product 11 ----
r2 score: 0.7589881322149851
model saved