# Coffee Shop Sales Prediction - Improved Model

Notebook ini berisi training model yang sudah ditingkatkan dengan:
- ✅ Feature engineering yang lebih baik (holidays, seasonality)
- ✅ Actual unit price per product
- ✅ Model evaluation yang lebih komprehensif
- ✅ Confidence interval estimation

## 1. Import Libraries

In [1]:
# Library utama untuk manipulasi data
import pandas as pd
import numpy as np
from datetime import datetime

# Library untuk model regresi
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Library untuk evaluasi model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Untuk menyimpan model
import joblib

# Untuk visualisasi
import matplotlib.pyplot as plt
import seaborn as sns

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


## 2. Load Dataset

In [2]:
# Read the "Transactions" sheet of the Excel workbook into a DataFrame.
# When running on Google Colab, upload the workbook first with:
#   from google.colab import files
#   uploaded = files.upload()

df = pd.read_excel(io="Coffee_Shop.xlsx", sheet_name="Transactions")

# Report the size and the column names, then preview the first rows.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()

Dataset shape: (149116, 11)

Columns: ['transaction_id', 'transaction_date', 'transaction_time', 'transaction_qty', 'store_id', 'store_location', 'product_id', 'unit_price', 'product_category', 'product_type', 'product_detail']


Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,1,2023-01-01,07:06:11,2,5,Lower Manhattan,32,3.0,Coffee,Gourmet brewed coffee,Ethiopia Rg
1,2,2023-01-01,07:08:56,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
2,3,2023-01-01,07:14:04,2,5,Lower Manhattan,59,4.5,Drinking Chocolate,Hot chocolate,Dark chocolate Lg
3,4,2023-01-01,07:20:24,1,5,Lower Manhattan,22,2.0,Coffee,Drip coffee,Our Old Time Diner Blend Sm
4,5,2023-01-01,07:22:41,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg


## 3. Data Preprocessing & Feature Engineering

In [3]:
# Columns kept for the rest of the notebook; transaction_qty is the prediction target.
selected_columns = [
    'transaction_id', 'transaction_date', 'transaction_time',
    'transaction_qty',      # TARGET
    'product_id', 'product_category', 'product_type', 'product_detail',
    'unit_price',
]

# Narrow the frame to those columns and discard every row with a missing value.
df = df[selected_columns].dropna()

print(f"Data after removing NaN: {df.shape}")
df.head()

Data after removing NaN: (149116, 9)


Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,product_id,product_category,product_type,product_detail,unit_price
0,1,2023-01-01,07:06:11,2,32,Coffee,Gourmet brewed coffee,Ethiopia Rg,3.0
1,2,2023-01-01,07:08:56,2,57,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg,3.1
2,3,2023-01-01,07:14:04,2,59,Drinking Chocolate,Hot chocolate,Dark chocolate Lg,4.5
3,4,2023-01-01,07:20:24,1,22,Coffee,Drip coffee,Our Old Time Diner Blend Sm,2.0
4,5,2023-01-01,07:22:41,2,57,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg,3.1


In [4]:
# Parse the date and time columns.
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['transaction_time'] = pd.to_datetime(df['transaction_time'], format='%H:%M:%S', errors='coerce')

# errors='coerce' turns unparsable times into NaT instead of raising. The dropna()
# applied earlier ran before this conversion, so remove such rows here and say so;
# otherwise 'hour' would silently contain NaN and end up in the feature matrix.
unparsed_time_mask = df['transaction_time'].isna()
if unparsed_time_mask.any():
    print(f"Dropping {int(unparsed_time_mask.sum())} rows with unparsable transaction_time")
    df = df.loc[~unparsed_time_mask].copy()

# Time-derived features
df['hour'] = df['transaction_time'].dt.hour
df['day_of_week'] = df['transaction_date'].dt.dayofweek  # 0=Monday, 6=Sunday
df['month'] = df['transaction_date'].dt.month
df['day_of_month'] = df['transaction_date'].dt.day
df['week_of_year'] = df['transaction_date'].dt.isocalendar().week
df['quarter'] = df['transaction_date'].dt.quarter

# Weekend flag: Saturday (5) and Sunday (6)
df['weekend'] = (df['day_of_week'] >= 5).astype(int)

# Seasonality (season codes: 1=Winter, 2=Spring, 3=Summer, 4=Fall)
def get_season(month):
    """Return the season code for a month number.

    Months 12, 1, 2 map to 1 (Winter); 3-5 to 2 (Spring); 6-8 to 3 (Summer).
    Every other value, including 9-11, maps to 4 (Fall).
    """
    months_by_season = (
        (1, (12, 1, 2)),  # Winter
        (2, (3, 4, 5)),   # Spring
        (3, (6, 7, 8)),   # Summer
    )
    for season_code, season_months in months_by_season:
        if month in season_months:
            return season_code
    return 4  # Fall

# Season code for each row, derived from the month via get_season.
df['season'] = df['month'].apply(get_season)

# Month start/end indicators: first 7 days of the month / day 24 onward.
df['is_month_start'] = (df['day_of_month'] <= 7).astype(int)
df['is_month_end'] = (df['day_of_month'] >= 24).astype(int)

# Simple US holiday list for 2023 (adjust as needed).
us_holidays_2023 = [
    '2023-01-01',  # New Year's Day
    '2023-07-04',  # Independence Day
    '2023-11-23',  # Thanksgiving (fourth Thursday of November 2023)
    '2023-12-25',  # Christmas Day
]

# Compare as datetimes instead of strings: the string form of a datetime column
# depends on whether any value carries a time component, so a string comparison
# can silently stop matching. normalize() sets the time part to midnight before
# matching against the holiday dates.
holiday_dates = pd.to_datetime(us_holidays_2023)
df['is_holiday'] = df['transaction_date'].dt.normalize().isin(holiday_dates).astype(int)

print("‚úÖ Time features extracted")
print(f"Hour range: {df['hour'].min()} - {df['hour'].max()}")
print(f"Date range: {df['transaction_date'].min()} to {df['transaction_date'].max()}")

‚úÖ Time features extracted
Hour range: 6 - 20
Date range: 2023-01-01 00:00:00 to 2023-06-30 00:00:00


In [5]:
# Fit a LabelEncoder on product_type, then store the integer codes in a new column.
le_product = LabelEncoder()
le_product.fit(df['product_type'])
df['product_type_encoded'] = le_product.transform(df['product_type'])

# Show how many distinct product types were found and what they are.
n_product_types = len(le_product.classes_)
print(f"Product types: {n_product_types}")
print(f"Classes: {le_product.classes_}")

Product types: 29
Classes: ['Barista Espresso' 'Biscotti' 'Black tea' 'Brewed Black tea'
 'Brewed Chai tea' 'Brewed Green tea' 'Brewed herbal tea' 'Chai tea'
 'Clothing' 'Drinking Chocolate' 'Drip coffee' 'Espresso Beans'
 'Gourmet Beans' 'Gourmet brewed coffee' 'Green beans' 'Green tea'
 'Herbal tea' 'Hot chocolate' 'House blend Beans' 'Housewares'
 'Organic Beans' 'Organic Chocolate' 'Organic brewed coffee' 'Pastry'
 'Premium Beans' 'Premium brewed coffee' 'Regular syrup' 'Scone'
 'Sugar free syrup']


## 4. Feature Selection & Target Preparation

In [6]:
# Model inputs, listed in the exact column order the estimators are fitted with.
feature_cols = (
    ['hour', 'day_of_week', 'month', 'weekend']           # time of day / weekday
    + ['product_type_encoded', 'unit_price']              # product attributes
    + ['day_of_month', 'quarter', 'season']               # position in the calendar
    + ['is_month_start', 'is_month_end', 'is_holiday']    # calendar flags
)

X = df.loc[:, feature_cols]

# Target: log1p of the transaction quantity (inverted later with expm1).
y_log = np.log1p(df['transaction_qty'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y_log.shape}")
print(f"\nFeature columns: {feature_cols}")

Features shape: (149116, 12)
Target shape: (149116,)

Feature columns: ['hour', 'day_of_week', 'month', 'weekend', 'product_type_encoded', 'unit_price', 'day_of_month', 'quarter', 'season', 'is_month_start', 'is_month_end', 'is_holiday']


## 5. Train-Test Split

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

print(
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    sep="\n",
)

Training set: (119292, 12)
Test set: (29824, 12)


## 6. Model Training & Evaluation

In [8]:
# Model 1: Linear Regression, fitted on the log1p-transformed target.
print("Training Linear Regression...")
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Hold-out metrics; all three are measured on the log1p scale of the target.
mae_lr, rmse_lr, r2_lr = (
    mean_absolute_error(y_test, y_pred_lr),
    np.sqrt(mean_squared_error(y_test, y_pred_lr)),
    r2_score(y_test, y_pred_lr),
)

print("‚úÖ Linear Regression trained")
print(f"   MAE: {mae_lr:.4f}")
print(f"   RMSE: {rmse_lr:.4f}")
print(f"   R¬≤: {r2_lr:.4f}")

Training Linear Regression...
‚úÖ Linear Regression trained
   MAE: 0.1984
   RMSE: 0.2075
   R¬≤: 0.0369


In [9]:
# Model 2: Random Forest, fitted on the log1p-transformed target.
print("Training Random Forest...")
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Hold-out metrics; all three are measured on the log1p scale of the target.
mae_rf, rmse_rf, r2_rf = (
    mean_absolute_error(y_test, y_pred_rf),
    np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    r2_score(y_test, y_pred_rf),
)

print("‚úÖ Random Forest trained")
print(f"   MAE: {mae_rf:.4f}")
print(f"   RMSE: {rmse_rf:.4f}")
print(f"   R¬≤: {r2_rf:.4f}")

Training Random Forest...
‚úÖ Random Forest trained
   MAE: 0.1693
   RMSE: 0.1923
   R¬≤: 0.1732


In [10]:
# Model 3: Gradient Boosting (bonus), fitted on the log1p-transformed target.
print("Training Gradient Boosting...")
gb_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 42,
}
gb_model = GradientBoostingRegressor(**gb_params).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Hold-out metrics; all three are measured on the log1p scale of the target.
mae_gb, rmse_gb, r2_gb = (
    mean_absolute_error(y_test, y_pred_gb),
    np.sqrt(mean_squared_error(y_test, y_pred_gb)),
    r2_score(y_test, y_pred_gb),
)

print("‚úÖ Gradient Boosting trained")
print(f"   MAE: {mae_gb:.4f}")
print(f"   RMSE: {rmse_gb:.4f}")
print(f"   R¬≤: {r2_gb:.4f}")

Training Gradient Boosting...
‚úÖ Gradient Boosting trained
   MAE: 0.1717
   RMSE: 0.1932
   R¬≤: 0.1653


In [11]:
# Comparison table: one row per model, sorted so the highest R² comes first.
results = pd.DataFrame(
    [
        ('Linear Regression', mae_lr, rmse_lr, r2_lr),
        ('Random Forest', mae_rf, rmse_rf, r2_rf),
        ('Gradient Boosting', mae_gb, rmse_gb, r2_gb),
    ],
    columns=['Model', 'MAE', 'RMSE', 'R¬≤ Score'],
).sort_values('R¬≤ Score', ascending=False)

separator = "=" * 60
print("\n" + separator)
print("MODEL COMPARISON")
print(separator)
print(results.to_string(index=False))
print(separator)


MODEL COMPARISON
            Model      MAE     RMSE  R¬≤ Score
    Random Forest 0.169302 0.192282  0.173243
Gradient Boosting 0.171696 0.193204  0.165297
Linear Regression 0.198446 0.207528  0.036945


## 7. Feature Importance (untuk Random Forest/GB)

In [12]:
# Pick the top row of the comparison table (highest R²) as the best model.
best_row = results.iloc[0]
best_model_name = best_row['Model']

# Map the name back to the fitted estimator; any other name falls back to gb_model.
best_model = {
    'Linear Regression': lr_model,
    'Random Forest': rf_model,
}.get(best_model_name, gb_model)

print(f"üèÜ Best Model: {best_model_name}")
print(f"   R¬≤ Score: {best_row['R¬≤ Score']:.4f}")

# Feature importance is only reported when the winner is not the linear model.
if best_model_name != 'Linear Regression':
    importance_table = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': best_model.feature_importances_,
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance.to_string(index=False))

üèÜ Best Model: Random Forest
   R¬≤ Score: 0.1732

Feature Importance:
             Feature  Importance
          unit_price    0.421836
product_type_encoded    0.372318
        day_of_month    0.100996
                hour    0.072158
         day_of_week    0.011585
               month    0.007951
      is_month_start    0.006293
              season    0.003294
             weekend    0.001263
             quarter    0.001228
        is_month_end    0.001029
          is_holiday    0.000047


## 8. Save Model & Encoder

In [13]:
# Persist the best model and the product-type encoder with joblib.
for artifact, artifact_path in (
    (best_model, "model_prediksi_penjualan.pkl"),
    (le_product, "label_encoder.pkl"),
):
    joblib.dump(artifact, artifact_path)

print("‚úÖ Model dan encoder berhasil disimpan!")
print("   - model_prediksi_penjualan.pkl")
print("   - label_encoder.pkl")

‚úÖ Model dan encoder berhasil disimpan!
   - model_prediksi_penjualan.pkl
   - label_encoder.pkl


## 9. Test Prediction dengan Sample Data

In [14]:
# Reload the saved artifacts to check that they round-trip through disk.
loaded_model = joblib.load("model_prediksi_penjualan.pkl")
loaded_encoder = joblib.load("label_encoder.pkl")

# One hand-written example row; keys follow the training feature order.
sample_product_code = loaded_encoder.transform(['Gourmet brewed coffee'])[0]
sample_features = {
    'hour': 9,
    'day_of_week': 1,  # Tuesday
    'month': 1,
    'weekend': 0,
    'product_type_encoded': sample_product_code,
    'unit_price': 3.0,
    'day_of_month': 15,
    'quarter': 1,
    'season': 1,  # Winter
    'is_month_start': 0,
    'is_month_end': 0,
    'is_holiday': 0,
}
sample_data = pd.DataFrame([sample_features])

# The model predicts log1p(quantity); expm1 converts back to a quantity.
pred_log = loaded_model.predict(sample_data)
pred_qty = np.expm1(pred_log)[0]

print("Sample Prediction:")
print("  Product: Gourmet brewed coffee")
print("  Time: Tuesday, 9 AM, January")
print(f"  Predicted Quantity: {pred_qty:.2f} cups")

Sample Prediction:
  Product: Gourmet brewed coffee
  Time: Tuesday, 9 AM, January
  Predicted Quantity: 1.45 cups


## 10. Save Preprocessed Dataset (Optional)

In [15]:
# Save the processed dataset for further analysis.
# transaction_time was parsed with the time-only format '%H:%M:%S', so its values
# carry the placeholder date 1900-01-01. Export it as a plain HH:MM:SS string so
# the CSV does not contain that bogus date. df itself is left unchanged.
export_df = df
if pd.api.types.is_datetime64_any_dtype(df['transaction_time']):
    export_df = df.assign(transaction_time=df['transaction_time'].dt.strftime('%H:%M:%S'))

export_df.to_csv("dataset_preprocessing_final.csv", index=False)
print("‚úÖ Dataset yang sudah diproses disimpan ke dataset_preprocessing_final.csv")

‚úÖ Dataset yang sudah diproses disimpan ke dataset_preprocessing_final.csv


## Download Files (untuk Google Colab)

In [16]:
# Uncomment jika menggunakan Google Colab
# from google.colab import files
# files.download("model_prediksi_penjualan.pkl")
# files.download("label_encoder.pkl")
# files.download("dataset_preprocessing_final.csv")