<a href="https://colab.research.google.com/github/221230003-coder/221230003-pengantar-ML/blob/main/week-02/latihan_praktikum_3_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# 1. Build a synthetic retail dataset: 1000 transactions dated within Q1 2023.
rng = np.random.default_rng(42)  # fixed seed keeps the sample reproducible
tanggal = pd.date_range(start="2023-01-01", end="2023-03-31", freq="D")

# NOTE: keep the draws in this order; the seeded generator hands out values
# sequentially, so reordering the keys would change every column.
data = {
    "date": rng.choice(tanggal, size=1000),
    "product_id": rng.integers(1, 11, size=1000),     # products 1-10
    "quantity": rng.integers(1, 10, size=1000),       # 1-9 (upper bound is exclusive)
    "price": rng.uniform(10, 100, size=1000),
    "customer_id": rng.integers(1, 101, size=1000),   # 100 customers
}
df = pd.DataFrame(data)

# Revenue of each transaction row.
df["revenue"] = df["quantity"] * df["price"]

# 1. Revenue per Product
# Select the aggregated column explicitly instead of calling .squeeze():
# squeeze() would collapse the result to a scalar if only one product
# existed, whereas column selection always yields a Series.
revenue_per_product = (
    df.groupby("product_id")
      .agg(total_revenue=("revenue", "sum"))["total_revenue"]
      .sort_values(ascending=False)
)

# 2. Top 5 Customers (by summed revenue, largest first)
top_customers = (
    df.groupby("customer_id")["revenue"]
      .sum()
      .nlargest(5)
)

# 3. Daily Revenue (one-column DataFrame indexed by date)
daily_revenue = df.pivot_table(values="revenue", index="date", aggfunc="sum")
# 4. Deteksi anomali quantity
def anomaly_detector(series, z=2):
    """Return the z-scores of the entries that lie beyond a threshold.

    Each value is standardized as ``(value - mean) / std`` using the
    ``mean()`` and ``std()`` methods of *series* itself (for a pandas
    Series, ``std()`` is the sample standard deviation).

    Args:
        series: Numeric data to screen.
        z: Absolute z-score above which an entry is reported (strictly
            greater than). Defaults to 2.

    Returns:
        The subset of standardized values whose absolute value exceeds
        *z*. The returned values are z-scores, not the original data.
    """
    centered = series - series.mean()
    standardized = centered / series.std()
    # NaN z-scores (e.g. from a zero spread) compare False and are dropped.
    beyond_threshold = np.abs(standardized) > z
    return standardized[beyond_threshold]

# Standardize quantity and keep the z-score of every row on the frame, then
# flag the rows whose |z| exceeds 2.
# NOTE(review): quantity is drawn uniformly from the integers 1-9 when the
# dataset is built, so the largest attainable |z| is roughly 1.5 and this
# threshold is expected to flag nothing for this sample.
df["q_zscore"] = (df["quantity"] - df["quantity"].mean()) / df["quantity"].std()
anomalies = df.loc[np.abs(df["q_zscore"]) > 2, ["date", "product_id", "quantity", "q_zscore"]]

# OUTPUT
print("=== RETAIL DATA ANALYSIS ===")

print("\n1. Revenue per Product:")
print(revenue_per_product)

print("\n2. Top 5 Customers by Spending:")
print(top_customers)

print("\n3. Daily Revenue (first 10 days):")
# Select the column explicitly rather than calling .squeeze(), which would
# turn a single-row result into a bare scalar.
print(daily_revenue["revenue"].head(10))

print(f"\n4. Quantity Anomalies detected: {len(anomalies)}")
print("Anomalies sample:")
print(anomalies.head())

# BONUS
print("\n=== BONUS ANALYSIS ===")

# Monthly Revenue (adds a calendar-month column to df as a side effect)
df["month"] = df["date"].dt.month
monthly_revenue = df.groupby("month")["revenue"].sum()
print("Monthly Revenue:")
print(monthly_revenue)

# Customer Statistics: total and mean revenue, transaction count and mean
# quantity per customer, rounded to two decimals.
customer_stats = (
    df.groupby("customer_id")
      .agg(
          revenue_sum=("revenue", "sum"),
          revenue_mean=("revenue", "mean"),
          transaksi=("revenue", "count"),
          qty_mean=("quantity", "mean"),
      )
      .round(2)
)
print("\nCustomer Statistics (first 5):")
print(customer_stats.head())

# Sanity checks on the aggregates computed earlier in the script.
assert revenue_per_product.shape[0] <= 10   # at most one row per product id
assert top_customers.shape[0] == 5
assert not daily_revenue.empty

print("\nSemua assertions berhasil!")


=== RETAIL DATA ANALYSIS ===

1. Revenue per Product:
product_id
9     32291.089134
4     29238.074459
1     28797.303326
10    28582.900472
3     28422.121411
2     26919.385381
8     26527.116103
6     25596.419205
7     25307.624342
5     23850.015506
Name: total_revenue, dtype: float64

2. Top 5 Customers by Spending:
customer_id
25    5586.683852
53    5453.190749
92    4718.510616
26    4650.925000
55    4525.282227
Name: revenue, dtype: float64

3. Daily Revenue (first 10 days):
date
2023-01-01    2561.232596
2023-01-02    3504.388925
2023-01-03    2217.126506
2023-01-04    2879.696562
2023-01-05    4010.232610
2023-01-06    1872.300564
2023-01-07    1008.276507
2023-01-08    3428.786914
2023-01-09    7166.167796
2023-01-10    2412.544279
Name: revenue, dtype: float64

4. Quantity Anomalies detected: 0
Anomalies sample:
Empty DataFrame
Columns: [date, product_id, quantity, q_zscore]
Index: []

=== BONUS ANALYSIS ===
Monthly Revenue:
month
1    93394.126349
2    89288.482801
3   