<a href="https://colab.research.google.com/github/DzakyN/insightify-web-sales-forecasting/blob/main/Model_TIKEXPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Forecasting metode agregasi model LSTM

In [None]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Attention, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from datetime import timedelta
import holidays
from itertools import product


# 1. Seed Python, NumPy and TensorFlow with the same value (42).
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(42)


# 2. Load the raw sales table and parse the "Date" column as datetimes.
file_path = r"C:\Users\mdzak\Downloads\Data_Final_Setelah_Hapus_Store_8.csv"
df = pd.read_csv(file_path)
df["Date"] = pd.to_datetime(df["Date"])

# 3. Sum "Weekly_Sales" over all rows sharing a date, ordered by date.
sales_by_date = df.groupby("Date")["Weekly_Sales"].sum()
df_total = sales_by_date.reset_index().sort_values("Date")

# 4. Baseline: trailing moving average of the aggregated sales.
# NOTE: the column is called "baseline_ma30" but the window is
# `window_baseline` (154) rows; the name is kept because later code
# refers to the column by it.
window_baseline = 154
df_total["baseline_ma30"] = df_total["Weekly_Sales"].rolling(window=window_baseline, min_periods=1).mean()


# 5. Seasonal and lag features.
df_total["dayofmonth"] = df_total["Date"].dt.day
# isocalendar() returns the nullable UInt32 dtype. Cast it to a plain NumPy
# integer so the derived sin/cos columns are ordinary float64; otherwise the
# feature matrix built from these columns falls back to dtype=object.
df_total["weekofyear"] = df_total["Date"].dt.isocalendar().week.astype(int)
df_total["is_month_start"] = df_total["Date"].dt.is_month_start.astype(int)
# Cyclical encoding of the ISO week number with a period of 52.
week_angle = 2 * np.pi * df_total["weekofyear"] / 52
df_total["sin_week"] = np.sin(week_angle)
df_total["cos_week"] = np.cos(week_angle)
df_total["lag_1"] = df_total["Weekly_Sales"].shift(1)
df_total["lag_52"] = df_total["Weekly_Sales"].shift(52)


# 6. Exogenous calendar features and short rolling statistics.
us_holidays = holidays.US()
df_total["is_holiday_us"] = df_total["Date"].apply(lambda x: 1 if x in us_holidays else 0)
# weekday() is 0=Monday ... 6=Sunday, so >= 5 marks Saturday and Sunday.
df_total["is_weekend"] = (df_total["Date"].dt.weekday >= 5).astype(int)
df_total["month"] = df_total["Date"].dt.month
df_total["quarter"] = df_total["Date"].dt.quarter
df_total["rolling_mean_7"] = df_total["Weekly_Sales"].rolling(window=7, min_periods=1).mean()
df_total["rolling_std_7"] = df_total["Weekly_Sales"].rolling(window=7, min_periods=1).std()


# Drop rows with missing values: the first 52 rows have no lag_52, and the
# very first row has neither lag_1 nor a defined rolling standard deviation.
df_total.dropna(inplace=True)
df_total.reset_index(drop=True, inplace=True)


# 7. Training target: sales minus the moving-average baseline, row by row.
df_total["residual_target"] = df_total["Weekly_Sales"].sub(df_total["baseline_ma30"])


# 8. Input feature columns (order defines the last axis of X).
features = ["Weekly_Sales", "lag_1", "lag_52", "dayofmonth", "is_month_start",
            "sin_week", "cos_week", "is_holiday_us", "is_weekend",
            "month", "quarter", "rolling_mean_7", "rolling_std_7"]


# Sliding windows: each sample pairs the `window_size` rows before row i
# with the `forecast_horizon` residuals starting at row i.
window_size = 84
forecast_horizon = 90
feature_matrix = df_total[features].values
residual_values = df_total["residual_target"].values
target_starts = range(window_size, len(df_total) - forecast_horizon + 1)
X = np.array([feature_matrix[i - window_size:i] for i in target_starts])
y = np.array([residual_values[i:i + forecast_horizon] for i in target_starts])


# Chronological split: first 67% train, next 8% validation, remainder test.
n = len(X)
cutoff_train = int(n * 0.67)
cutoff_valid = int(n * 0.75)
X_train, X_valid, X_test = np.split(X, [cutoff_train, cutoff_valid])
y_train, y_valid, y_test = np.split(y, [cutoff_train, cutoff_valid])


# Scale targets; the scaler is fitted on the training targets only.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_valid_scaled = y_scaler.transform(y_valid)
y_test_scaled = y_scaler.transform(y_test)


# Scale inputs: flatten the windows to rows of features, scale per feature
# (fitted on training rows only), then restore the 3-D window shape.
n_features = len(features)
flat_shape = (-1, n_features)
window_shape = (-1, window_size, n_features)
x_scaler = MinMaxScaler()
X_train_scaled = x_scaler.fit_transform(X_train.reshape(flat_shape)).reshape(window_shape)
X_valid_scaled = x_scaler.transform(X_valid.reshape(flat_shape)).reshape(window_shape)
X_test_scaled = x_scaler.transform(X_test.reshape(flat_shape)).reshape(window_shape)


# Model stacked LSTM + Attention
def create_model(input_shape, units=128, dropout=0.2, lr=0.001):
    """Build and compile a two-layer LSTM network with an attention branch.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        units: Number of units in each of the two LSTM layers.
        dropout: Dropout rate used inside both LSTM layers and in the
            ``Dropout`` layer after the hidden dense layer.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with mean-squared-error loss whose output
        layer has ``forecast_horizon`` units (read from module scope).
    """
    sequence_in = Input(shape=input_shape)
    encoded = LSTM(units, return_sequences=True, dropout=dropout)(sequence_in)
    encoded = LSTM(units, return_sequences=True, dropout=dropout)(encoded)
    # The LSTM output is passed twice to the Attention layer, i.e. the
    # sequence attends to itself.
    attended = Attention()([encoded, encoded])
    merged = Concatenate()([encoded, attended])
    # Only the last time step of the concatenated sequence feeds the head.
    hidden = Dense(256, activation='relu')(merged[:, -1, :])
    hidden = Dropout(dropout)(hidden)
    forecast = Dense(forecast_horizon)(hidden)
    network = Model(sequence_in, forecast)
    network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    return network


# Hyperparameter tuning: exhaustive grid search scored on the validation split.
param_grid = {
    "units": [64, 128],
    "dropout": [0.1, 0.2],
    "lr": [0.001, 0.0005],
    "batch_size": [16, 32]
}
param_combinations = list(product(param_grid["units"], param_grid["dropout"],
                                  param_grid["lr"], param_grid["batch_size"]))


results = []


best_r2 = -np.inf
best_model = None
best_params = None


# The validation ground truth does not depend on the hyperparameters, so it
# is computed once instead of on every iteration.
y_valid_real = y_scaler.inverse_transform(y_valid_scaled)

# The residual target is defined row by row (sales - baseline of the SAME
# row), so converting a horizon of residuals back to sales needs the baseline
# of every target row, not one baseline value repeated across the horizon.
# Sample k of X has its first target at dataframe row k + window_size.
baseline_values = df_total["baseline_ma30"].to_numpy()
baseline_valid = np.array([
    baseline_values[start:start + forecast_horizon]
    for start in range(cutoff_train + window_size, cutoff_valid + window_size)
])
y_valid_real_final = y_valid_real + baseline_valid


for units, dropout, lr, batch_size in param_combinations:
    print(f"🔍 Training: units={units}, dropout={dropout}, lr={lr}, batch_size={batch_size}")
    model = create_model((window_size, len(features)), units=units, dropout=dropout, lr=lr)
    # Stop after 5 epochs without improvement and roll back to the best weights.
    es = EarlyStopping(patience=5, restore_best_weights=True)
    model.fit(X_train_scaled, y_train_scaled,
               validation_data=(X_valid_scaled, y_valid_scaled),
               epochs=50, batch_size=batch_size, verbose=0, callbacks=[es])
    y_valid_pred_scaled = model.predict(X_valid_scaled, verbose=0)
    y_valid_pred = y_scaler.inverse_transform(y_valid_pred_scaled)
    y_valid_pred_final = y_valid_pred + baseline_valid
    mape = mean_absolute_percentage_error(y_valid_real_final, y_valid_pred_final) * 100
    mae = mean_absolute_error(y_valid_real_final, y_valid_pred_final)
    rmse = np.sqrt(mean_squared_error(y_valid_real_final, y_valid_pred_final))
    r2 = r2_score(y_valid_real_final.flatten(), y_valid_pred_final.flatten())
    results.append({
        "units": units,
        "dropout": dropout,
        "lr": lr,
        "batch_size": batch_size,
        "MAPE": round(mape, 2),
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
        "R2": round(r2, 4)
    })
    print(f"➡️ Valid Metrics: MAPE={mape:.2f}%, MAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.4f}")
    # Keep the model with the highest validation R².
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_params = (units, dropout, lr, batch_size)


# Log every grid-search result as a table.
df_results = pd.DataFrame(results)
print("\n📝 All Hyperparameter Combinations and Validation Metrics:")
print(df_results)


# Report the combination that achieved the best validation R².
best_units, best_dropout, best_lr, best_batch = best_params
print("\n✅ Best Hyperparameters (R² Based):")
print(f"Units: {best_units}, Dropout: {best_dropout}, LR: {best_lr}, Batch: {best_batch}")


# Final evaluation of the selected model on the held-out test split.
y_test_pred_scaled = best_model.predict(X_test_scaled)
y_test_pred = y_scaler.inverse_transform(y_test_pred_scaled)
y_test_real = y_scaler.inverse_transform(y_test_scaled)

# Rebuild sales from residuals with the baseline of each target row.
# Test sample k is X[cutoff_valid + k], whose first target is dataframe row
# cutoff_valid + k + window_size. Taking the last len(y_test) baseline rows
# instead is shifted by forecast_horizon - 1 rows and ignores that the
# baseline changes along the horizon.
baseline_values = df_total["baseline_ma30"].to_numpy()
first_target_row = cutoff_valid + window_size
baseline_test = np.array([
    baseline_values[start:start + forecast_horizon]
    for start in range(first_target_row, first_target_row + len(y_test))
])
y_test_pred_final = y_test_pred + baseline_test
y_test_real_final = y_test_real + baseline_test


test_mape = mean_absolute_percentage_error(y_test_real_final, y_test_pred_final) * 100
test_mae = mean_absolute_error(y_test_real_final, y_test_pred_final)
test_rmse = np.sqrt(mean_squared_error(y_test_real_final, y_test_pred_final))
test_r2 = r2_score(y_test_real_final.flatten(), y_test_pred_final.flatten())


print("\n🎯 Final Test Metrics (Best Model - R² Based):")
print("✅ MAPE:", round(test_mape, 2), "%")
print("✅ MAE :", round(test_mae, 2))
print("✅ RMSE:", round(test_rmse, 2))
print("✅ R2  :", round(test_r2, 4))


# Forecast the next `forecast_horizon` steps with a single prediction from
# the most recent `window_size` rows.
last_window_real = df_total[features].values[-window_size:]
last_window_scaled = x_scaler.transform(last_window_real).reshape(1, window_size, len(features))
forecast_scaled = best_model.predict(last_window_scaled)[0]
forecast_residual = y_scaler.inverse_transform(forecast_scaled.reshape(1, -1)).flatten()
# No baseline exists for future rows, so the last known baseline value is
# held constant over the whole horizon.
baseline_future = df_total["baseline_ma30"].iloc[-1]
forecast_final = forecast_residual + baseline_future


# Save & visualise
last_date = df_total["Date"].max()
# NOTE(review): future dates advance one calendar day per step although the
# source column is named "Weekly_Sales" — confirm the data frequency.
future_dates = [last_date + timedelta(days=i) for i in range(1, forecast_horizon + 1)]
df_forecast = pd.DataFrame({"Date": future_dates, "Forecast": forecast_final})
df_actual = df_total[["Date", "Weekly_Sales"]].rename(columns={"Weekly_Sales": "Actual"})
# The exported file stacks history and forecast in one "Actual" column;
# forecast rows are the ones dated after `last_date`.
df_final = pd.concat([df_actual, df_forecast.rename(columns={"Forecast": "Actual"})], ignore_index=True)
df_final.to_csv("Total_Sales_LSTM_Hybrid_USA_Forecast.csv", index=False)


plt.figure(figsize=(12, 5))
plt.plot(df_actual["Date"], df_actual["Actual"], label="Actual")
plt.plot(df_forecast["Date"], df_forecast["Forecast"], label="Forecast", linestyle="--")
plt.axvline(x=last_date, color='red', linestyle=':', label="Forecast Start")
# The title reports the real horizon instead of a hard-coded "30D".
plt.title(f"Hybrid LSTM+Attention {forecast_horizon}D Forecast (MAPE: {test_mape:.2f}%)")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


print("🎉 Final hybrid residual forecast & hyperparameter tuning selesai!")


# Forecasting metode Granularitas model Prophet (105 Kombinasi)

In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the actual daily sales and shape them the way Prophet expects:
# a datetime column "ds" and a target column "y".
raw_sales = pd.read_csv(r"C:\Semester 6\Setsuna\Data_Aktual_Tanpa_Forecast.csv")
raw_sales['ds'] = pd.to_datetime(raw_sales['Date'])
df = (
    raw_sales
    .rename(columns={'Daily_Sales': 'y'})
    [['ds', 'Store', 'Dept', 'y']]
    .sort_values('ds')
)

# Distinct store and department identifiers present in the data.
stores = df['Store'].unique()
depts = df['Dept'].unique()


# Grid-search values for the two Prophet prior scales.
param_grid = {
    'changepoint_prior_scale': [0.001, 0.01, 0.1,0.2, 0.5, 1.0,10.0],
    'seasonality_prior_scale': [0.001,0.01, 0.1, 1.0,0.2,0.5, 10.0]
}

# Final results, one entry per (Store, Dept) series that was modelled.
evaluasi_list = []
forecast_list = []


def _make_prophet(cps, sps):
    """Return an unfitted Prophet model with daily/weekly/yearly seasonality."""
    return Prophet(
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=True,
        changepoint_prior_scale=cps,
        seasonality_prior_scale=sps
    )


def _mape_percent(y_true, y_pred):
    """Return MAPE in percent, flooring |y_true| at 1e-5 to avoid division by zero."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-5))) * 100


for store in stores:
    for dept in depts:
        subset = df[(df['Store'] == store) & (df['Dept'] == dept)].copy()
        # Skip series with fewer than 100 observations.
        if len(subset) < 100:
            continue

        # Chronological train / validation / test split (70% / 17% / 13%).
        n = len(subset)
        cutoff_train = int(n * 0.7)
        cutoff_valid = int(n * 0.87)

        df_train = subset.iloc[:cutoff_train].copy()
        df_valid = subset.iloc[cutoff_train:cutoff_valid].copy()
        df_test = subset.iloc[cutoff_valid:].copy()

        # Tuning: fit on train, score every prior-scale pair on validation.
        tuning_results = []
        for cps in param_grid['changepoint_prior_scale']:
            for sps in param_grid['seasonality_prior_scale']:
                try:
                    model = _make_prophet(cps, sps)
                    model.fit(df_train)
                    forecast_valid = model.predict(df_valid[['ds']])
                except Exception as exc:
                    # Best effort: a failing combination is skipped, but the
                    # failure is reported instead of silently swallowed, and
                    # KeyboardInterrupt / SystemExit are no longer caught.
                    print(f"Skipping cps={cps}, sps={sps} for Store={store}, Dept={dept}: {exc!r}")
                    continue
                tuning_results.append({
                    'cps': cps,
                    'sps': sps,
                    'mape': _mape_percent(df_valid['y'].values, forecast_valid['yhat'].values)
                })

        if not tuning_results:
            continue

        # Lowest validation MAPE wins; ties keep the first combination tried.
        best_param = min(tuning_results, key=lambda x: x['mape'])
        best_cps = best_param['cps']
        best_sps = best_param['sps']

        # Rolling one-step forecast on the test set: refit on all data seen
        # so far, predict the next row, then add that row's actual value to
        # the training data. This refits once per test row by design.
        rolling_train = pd.concat([df_train, df_valid])
        y_true_all = []
        y_pred_all = []

        for i in range(len(df_test)):
            next_row = df_test.iloc[i:i+1]
            model = _make_prophet(best_cps, best_sps)
            model.fit(rolling_train)
            forecast = model.predict(next_row[['ds']])

            y_true_all.append(next_row['y'].iloc[0])
            y_pred_all.append(forecast['yhat'].values[0])
            rolling_train = pd.concat([rolling_train, next_row], ignore_index=True)

        # Evaluation on the rolling test forecasts.
        rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
        mae = mean_absolute_error(y_true_all, y_pred_all)
        mape = _mape_percent(y_true_all, y_pred_all)

        evaluasi_list.append({
            'Store': store,
            'Dept': dept,
            'Best CPS': best_cps,
            'Best SPS': best_sps,
            'RMSE': round(rmse, 3),
            'MAE': round(mae, 3),
            'MAPE': round(mape, 6)
        })

        # Final forecast: refit on the full series and predict 90 periods
        # beyond its last date.
        full_df = pd.concat([df_train, df_valid, df_test])
        model_final = _make_prophet(best_cps, best_sps)
        model_final.fit(full_df)
        future = model_final.make_future_dataframe(periods=90)
        forecast = model_final.predict(future)
        last_date = full_df['ds'].max()
        # Keep only the rows after the last observed date.
        forecast_future = forecast[forecast['ds'] > last_date].copy()
        forecast_future['Store'] = store
        forecast_future['Dept'] = dept
        forecast_list.append(forecast_future[['ds', 'Store', 'Dept', 'yhat']])


# Save the per-series evaluation table and the combined forecasts.
df_evaluasi = pd.DataFrame(evaluasi_list)
df_evaluasi.to_csv("evaluasi_model2.csv", index=False)


# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no series produced a forecast.
if forecast_list:
    df_forecast_all = pd.concat(forecast_list, ignore_index=True)
else:
    df_forecast_all = pd.DataFrame(columns=['ds', 'Store', 'Dept', 'yhat'])
df_forecast_all = df_forecast_all.rename(columns={'ds': 'Date', 'yhat': 'Daily_Sales'})
df_forecast_all.to_csv("Forecast Final.csv", index=False)


# The message now names the files that are actually written above.
print("✅ Selesai: evaluasi_model2.csv & Forecast Final.csv berhasil dibuat.")





# Analisis Sentimen

In [None]:

!pip install nltk==3.8.1 -q
!pip install -U imbalanced-learn -q

import shutil
# Delete any existing NLTK data directories; ignore_errors=True makes this a
# no-op when a directory does not exist. Presumably done so the NLTK
# downloads later in this cell start from a clean copy — TODO confirm.
shutil.rmtree('/content/nltk_data', ignore_errors=True)
shutil.rmtree('/root/nltk_data', ignore_errors=True)

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

import joblib  # untuk menyimpan model/vectorizer

# Download the NLTK resources used below: the stop-word lists and the
# "punkt" tokenizer models required by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')



# 1. Load the labelled review dataset.
df = pd.read_csv("/content/Data Sentimen.csv")

# 2. Start from NLTK's default Indonesian stop-word list, as a set.
indonesian_stopwords = stopwords.words('indonesian')
stop_words = set(indonesian_stopwords)

# 3. Important words that must NOT be removed as stop words because they
#    carry sentiment or context.
kata_penting = {
    # Negation / emphasis
    'tidak', 'kurang', 'belum', 'hanya', 'seharusnya',

    # Temporal / speed / service-quality words
    'lama', 'cepat', 'lambat', 'terlambat', 'lelet',

    # Neutral / ambiguous words
    # (the trailing comma matters: without it 'saja' and the next literal
    # 'buruk' are concatenated into the single string 'sajaburuk')
    'biasa', 'lumayan', 'cukup', 'standar', 'saja',

    # Common negative words
    'buruk', 'jelek', 'rusak', 'cacat', 'telat',

    # Common positive words
    'bagus', 'baik', 'puas', 'mantap', 'ramah',

    # Expressive words
    'wah', 'wow', 'parah', 'sumpah', 'love', 'suka', 'senang',

    # Service / product words
    'kasir', 'produk', 'barang', 'layanan', 'pengiriman', 'kurir',
}

# 4. Take the important words out of the stop-word set (in place) so the
#    filtering step keeps them.
stop_words.difference_update(kata_penting)

# 5. Fungsi preprocessing
def preprocess_nltk(text):
    """Normalise a review string and drop stop words.

    Steps: lowercase, collapse any character repeated three or more times in
    a row to a single occurrence (e.g. "baaagus" -> "bagus"), delete every
    character that is not an ASCII letter or whitespace, tokenize with
    ``word_tokenize``, and remove tokens found in the module-level
    ``stop_words`` set.

    Args:
        text: The raw review text (must be a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    squeezed = re.sub(r'(.)\1{2,}', r'\1', lowered)
    # Non-letters are deleted, not replaced by a space, so words separated
    # only by punctuation ("a,b") end up joined.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', squeezed)
    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# 6. Apply preprocessing to every review.
df['clean_text'] = df['review_text'].astype(str).apply(preprocess_nltk)

y = df['label_sentimen']

# Split the cleaned text BEFORE vectorising. Fitting TF-IDF on the whole
# corpus would let the test reviews influence the vocabulary and the IDF
# weights (data leakage) and make the reported scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, stratify=y, random_state=42
)

tfidf = TfidfVectorizer(
    max_features=7000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2)
)

# Learn vocabulary / IDF on the training text only, then reuse them.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)
# Full-corpus matrix, kept so the name `X` remains defined as before.
X = tfidf.transform(df['clean_text'])

# Oversample minority classes with SMOTE on the training split only.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
model.fit(X_train_smote, y_train_smote)

y_pred = model.predict(X_test)

print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nLaporan klasifikasi:\n", classification_report(y_test, y_pred))


# Persist the fitted classifier and vectorizer for later inference.
joblib.dump(model, 'model_rf.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Akurasi: 0.9325581395348838

Laporan klasifikasi:
               precision    recall  f1-score   support

     negatif       0.90      0.91      0.91       956
      netral       0.98      0.93      0.96       722
     positif       0.93      0.95      0.94      1332

    accuracy                           0.93      3010
   macro avg       0.94      0.93      0.93      3010
weighted avg       0.93      0.93      0.93      3010



['tfidf_vectorizer.pkl']

In [None]:
# ================================
# 🎤 UJI INPUT MANUAL
# ================================

def prediksi_komentar(text_input):
    """Predict and print the sentiment label for one raw comment.

    The comment is cleaned with ``preprocess_nltk``, vectorised with the
    already-fitted module-level ``tfidf`` (transform only, never re-fitted),
    and classified by the module-level ``model``. The raw input, the cleaned
    text and the predicted label are printed; nothing is returned.

    Args:
        text_input: The raw comment text.
    """
    cleaned_text = preprocess_nltk(text_input)
    feature_row = tfidf.transform([cleaned_text])
    predicted_label = model.predict(feature_row)[0]

    report_lines = (
        f"\n🗣️ Input: {text_input}",
        f"🧹 Cleaned: {cleaned_text}",
        f"📊 Prediksi Sentimen: {predicted_label}",
    )
    print(*report_lines, sep="\n")

# Sample trials: run the predictor over a fixed list of example comments.
contoh_komentar = [
    "pengiriman sangat cepat dan pelayanan ramah sekali",
    "barang rusak, tidak sesuai dengan gambar",
    "lumayan, tapi pengemasannya biasa saja",
    "barang rusak",
    "Kasirnya lama",
    " barangnya jelek",
    "Pengirimannya lama banget anjerrrr",
    "kasirnya ga ramah",
    "Antrean kasir di Walmart terlalu panjang dan lambat.",
    "asli disituh mahal bangett",
]
for komentar in contoh_komentar:
    prediksi_komentar(komentar)



🗣️ Input: pengiriman sangat cepat dan pelayanan ramah sekali
🧹 Cleaned: pengiriman cepat pelayanan ramah
📊 Prediksi Sentimen: positif

🗣️ Input: barang rusak, tidak sesuai dengan gambar
🧹 Cleaned: barang rusak tidak sesuai gambar
📊 Prediksi Sentimen: negatif

🗣️ Input: lumayan, tapi pengemasannya biasa saja
🧹 Cleaned: lumayan pengemasannya biasa
📊 Prediksi Sentimen: netral

🗣️ Input: barang rusak
🧹 Cleaned: barang rusak
📊 Prediksi Sentimen: negatif

🗣️ Input: Kasirnya lama
🧹 Cleaned: kasirnya lama
📊 Prediksi Sentimen: negatif

🗣️ Input:  barangnya jelek
🧹 Cleaned: barangnya jelek
📊 Prediksi Sentimen: negatif

🗣️ Input: Pengirimannya lama banget anjerrrr
🧹 Cleaned: pengirimannya lama banget anjer
📊 Prediksi Sentimen: negatif

🗣️ Input: kasirnya ga ramah
🧹 Cleaned: kasirnya ga ramah
📊 Prediksi Sentimen: positif

🗣️ Input: Antrean kasir di Walmart terlalu panjang dan lambat.
🧹 Cleaned: antrean kasir walmart lambat
📊 Prediksi Sentimen: negatif

🗣️ Input: asli disituh mahal bangett
🧹 Clean