In [1]:
import pandas as pd

# Contoh data
data = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2, 1, 1],
    'item_id': ['A', 'A', 'B', 'A', 'A', 'B', 'B', 'B'],
    'tanggal': pd.date_range(start="2024-01-01", periods=8, freq='D'),
    'jumlah_terjual': [10, 12, 5, 7, 9, 4, 6, 8]
})

# Mengelompokkan berdasarkan user_id dan item_id
grouped_data = { (user, item): df for (user, item), df in data.groupby(['user_id', 'item_id']) }

# Cek data untuk user_id=1, item_id=A
print(grouped_data[(1, 'A')])

   user_id item_id    tanggal  jumlah_terjual
0        1       A 2024-01-01              10
1        1       A 2024-01-02              12


In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pickle
import os

def train_xgboost(user_id, item_id, df):
    df = df.sort_values(by='tanggal')  # Pastikan data diurutkan berdasarkan waktu
    df['day'] = (df['tanggal'] - df['tanggal'].min()).dt.days  # Konversi tanggal ke angka

    X = df[['day']]
    y = df['jumlah_terjual']

    # Sliding window split: 90% training, 10% testing
    train_size = int(len(df) * 0.9)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Train model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    model.fit(X_train, y_train)

    # Buat folder jika belum ada
    directory = f"./models/{user_id}"
    os.makedirs(directory, exist_ok=True)

    # Simpan model ke file
    filename = f"./models/{user_id}/{item_id}.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

    print(f"Model untuk {user_id} - {item_id} disimpan di {filename}")

# Contoh training untuk user_id=1, item_id=A
train_xgboost(1, 'A', grouped_data[(1, 'A')])


Model untuk 1 - A disimpan di ./models/1/A.pkl


In [11]:
def predict_sales(user_id, item_id, df, future_days=3):
    filename = f'./models/{user_id}/{item_id}.pkl'
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Lastest Date
    df = df.sort_values(by='tanggal')  # Pastikan data diurutkan berdasarkan waktu
    df['day'] = (df['tanggal'] - df['tanggal'].min()).dt.days  # Konversi tanggal ke angka

    latest = df['day'].max()

    # Prediksi untuk hari ke depan
    future_X = [[i] for i in range(latest, latest + future_days)]  # Misalnya dari hari terakhir dari data hingga ke-future days
    predictions = model.predict(future_X)

    return predictions

# Contoh prediksi
prediksi = predict_sales(1, 'A', grouped_data[(1, 'A')], 3)
print(f"Prediksi penjualan untuk 3 hari kedepan : {prediksi}")

Prediksi penjualan untuk 3 hari kedepan : [11.99938    5.0024557  7.0005198]


In [10]:
def update_model(user_id, item_id, updated_data):
    filename = f'./models/{user_id}/{item_id}.pkl'

    # Load model lama
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    X = updated_data[['day']]
    y = updated_data['jumlah_terjual']

    # Train model dengan data baru (Warm Start)
    model.fit(X, y, xgb_model=model)  # Warm start training

    # Simpan ulang model
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

    print(f"Model {user_id} - {item_id} diperbarui!")

# Contoh update dengan data baru
new_sales_data = pd.DataFrame({
    'user_id': [1],
    'item_id': ['A'],
    'tanggal': [pd.Timestamp("2024-01-12")],
    'jumlah_terjual': [13]
})

# Update data (append data baru ke dataset lama)
updated_data = pd.concat([data, new_sales_data]).sort_values(by='tanggal')
updated_data['day'] = (updated_data['tanggal'] - updated_data['tanggal'].min()).dt.days

tes_data = { (user, item): df for (user, item), df in updated_data.groupby(['user_id', 'item_id']) }

update_model(1, 'A', tes_data[(1, 'A')])

Model 1 - A diperbarui!


In [7]:
tes_data = { (user, item): df for (user, item), df in updated_data.groupby(['user_id', 'item_id']) }
tes_data

{(1,
  'A'):    user_id item_id    tanggal  jumlah_terjual  day
 0        1       A 2024-01-01              10    0
 1        1       A 2024-01-02              12    1
 0        1       A 2024-01-12              13   11,
 (1,
  'B'):    user_id item_id    tanggal  jumlah_terjual  day
 2        1       B 2024-01-03               5    2
 6        1       B 2024-01-07               6    6
 7        1       B 2024-01-08               8    7,
 (2,
  'A'):    user_id item_id    tanggal  jumlah_terjual  day
 3        2       A 2024-01-04               7    3
 4        2       A 2024-01-05               9    4,
 (2,
  'B'):    user_id item_id    tanggal  jumlah_terjual  day
 5        2       B 2024-01-06               4    5}

In [8]:
tes_data[(1, 'A')]

Unnamed: 0,user_id,item_id,tanggal,jumlah_terjual,day
0,1,A,2024-01-01,10,0
1,1,A,2024-01-02,12,1
0,1,A,2024-01-12,13,11


In [32]:
import pandas as pd
import numpy as np
import os

def generate_dummy_data(num_users, num_items_per_user, num_records):
    users = np.random.randint(1, num_users + 1, num_records)
    items = np.array([chr(65 + (user - 1) * num_items_per_user + np.random.randint(0, num_items_per_user)) for user in users])
    dates = pd.to_datetime("2024-01-01") + pd.to_timedelta(np.random.randint(0, 300, num_records), unit='D')
    jumlah_terjual = np.random.randint(1, 36, num_records)

    df = pd.DataFrame({'user_id': users, 'item_id': items, 'tanggal': dates, 'jumlah_terjual': jumlah_terjual})
    df = df.drop_duplicates(subset=['user_id', 'item_id', 'tanggal']).sort_values(by=['tanggal', 'user_id', 'item_id']).reset_index(drop=True)

    return df

dummy_data = generate_dummy_data(num_users=1, num_items_per_user=2, num_records=300)
print(dummy_data)

directory = "data"
os.makedirs(directory, exist_ok=True)
dummy_data.to_csv(f"./{directory}/dummy.csv", index=False)
print("Data dummy telah disimpan di data/dummy.csv")


     user_id item_id    tanggal  jumlah_terjual
0          1       A 2024-01-03              34
1          1       B 2024-01-03              18
2          1       B 2024-01-05              24
3          1       A 2024-01-06              14
4          1       B 2024-01-07              27
..       ...     ...        ...             ...
220        1       B 2024-10-22              35
221        1       A 2024-10-23              20
222        1       A 2024-10-24               1
223        1       B 2024-10-24              10
224        1       B 2024-10-26              13

[225 rows x 4 columns]
Data dummy telah disimpan di data/dummy.csv


In [33]:
data = pd.read_csv('./data/dummy.csv')
data

Unnamed: 0,user_id,item_id,tanggal,jumlah_terjual
0,1,A,2024-01-03,34
1,1,B,2024-01-03,18
2,1,B,2024-01-05,24
3,1,A,2024-01-06,14
4,1,B,2024-01-07,27
...,...,...,...,...
220,1,B,2024-10-22,35
221,1,A,2024-10-23,20
222,1,A,2024-10-24,1
223,1,B,2024-10-24,10


In [35]:
grouped_data = { (user, item): df for (user, item), df in data.groupby(['user_id', 'item_id']) }
grouped_data[(1, 'B')]

Unnamed: 0,user_id,item_id,tanggal,jumlah_terjual
1,1,B,2024-01-03,18
2,1,B,2024-01-05,24
4,1,B,2024-01-07,27
10,1,B,2024-01-17,19
12,1,B,2024-01-18,33
...,...,...,...,...
213,1,B,2024-10-15,28
217,1,B,2024-10-19,23
220,1,B,2024-10-22,35
223,1,B,2024-10-24,10


In [36]:
print(len(grouped_data[(1, 'A')]))
print(len(grouped_data[(1, 'B')]))

124
101


In [2]:
import pandas as pd
import numpy as np
import os

def generate_custom_dummy_data(start_date='2025-02-27', num_days=120, item_id='MDA382', harga_item=5000):
    # Generate range tanggal dari start_date
    tanggal = pd.date_range(start=start_date, periods=num_days, freq='D')
    jumlah_terjual = np.random.randint(1, 51, size=num_days)  # random dari 1 sampai 50

    # Buat DataFrame
    df = pd.DataFrame({
        'item_id': [item_id] * num_days,
        'tanggal': tanggal,
        'jumlah_terjual': jumlah_terjual,
        'harga_item': [harga_item] * num_days
    })

    return df

# Generate data
dummy_data = generate_custom_dummy_data()

# Simpan ke file CSV
directory = "data"
os.makedirs(directory, exist_ok=True)
dummy_data.to_csv(f"./{directory}/dummy2.csv", index=False)
print("Data dummy telah disimpan di data/dummy.csv")


Data dummy telah disimpan di data/dummy.csv
