In [1]:
# 01_preprocess.ipynb

import os
import pandas as pd
import numpy as np

# Project-relative locations; keep these in sync with the folder layout on disk.
BASE_DIR = "."
PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")
RAW_PATH = os.path.join(BASE_DIR, "data", "raw", "fashion_boutique_dataset.csv")

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Echo both paths as the cell output for a quick visual check.
RAW_PATH, PROCESSED_DIR


('./data/raw/fashion_boutique_dataset.csv', './data/processed')

In [2]:
# Read the raw CSV into memory; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer=RAW_PATH)

# Print (rows, columns), then leave the first five rows as the cell's rich output.
print(df.shape)
df.head(n=5)


(2176, 14)


Unnamed: 0,product_id,category,brand,season,size,color,original_price,markdown_percentage,current_price,purchase_date,stock_quantity,customer_rating,is_returned,return_reason
0,FB000001,Outerwear,Zara,Spring,XL,Red,196.01,0.0,196.01,2025-07-05,37,3.0,False,
1,FB000002,Tops,Uniqlo,Winter,L,Pink,119.64,0.0,119.64,2025-08-06,2,2.5,False,
2,FB000003,Accessories,Uniqlo,Winter,,Black,33.8,0.0,33.8,2025-08-06,22,4.3,False,
3,FB000004,Shoes,Uniqlo,Spring,XL,Black,75.36,0.0,75.36,2025-07-07,48,2.6,False,
4,FB000005,Tops,Banana Republic,Winter,XL,Black,105.02,0.0,105.02,2025-08-06,10,,False,


In [3]:
# Clean-up pass: parse the date column and fill the missing values that matter.

# Parse purchase_date into datetimes so the .dt accessor can be used later.
df["purchase_date"] = pd.to_datetime(df["purchase_date"])

# Fill missing values in a single call, one replacement per column:
#   customer_rating     -> median of the observed ratings
#   markdown_percentage -> 0.0
#   size                -> the literal label "Unknown"
df = df.fillna(
    {
        "customer_rating": df["customer_rating"].median(),
        "markdown_percentage": 0.0,
        "size": "Unknown",
    }
)

# Summarise dtypes and non-null counts to confirm the fills took effect.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2176 entries, 0 to 2175
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   product_id           2176 non-null   object        
 1   category             2176 non-null   object        
 2   brand                2176 non-null   object        
 3   season               2176 non-null   object        
 4   size                 2176 non-null   object        
 5   color                2176 non-null   object        
 6   original_price       2176 non-null   float64       
 7   markdown_percentage  2176 non-null   float64       
 8   current_price        2176 non-null   float64       
 9   purchase_date        2176 non-null   datetime64[ns]
 10  stock_quantity       2176 non-null   int64         
 11  customer_rating      2176 non-null   float64       
 12  is_returned          2176 non-null   bool          
 13  return_reason        320 non-null

In [4]:
# Feature engineering on the row-level frame.
df = df.assign(
    # Per-row revenue estimate: current price multiplied by stock_quantity.
    # NOTE(review): stock_quantity reads like inventory on hand rather than
    # units sold -- confirm it is the intended quantity for this estimate.
    line_revenue=df["current_price"] * df["stock_quantity"],
    # Calendar features from the purchase date (Monday=0 ... Sunday=6).
    dayofweek=df["purchase_date"].dt.dayofweek,
    # Weekend flag as 0/1; reads the dayofweek column assigned just above.
    is_weekend=lambda frame: (frame["dayofweek"] >= 5).astype(int),
    month=df["purchase_date"].dt.month,
    year=df["purchase_date"].dt.year,
)


In [5]:
# Collapse the row-level data to one record per purchase date.
# Dates that have no rows in `df` produce no record, so the series can have gaps.
daily = (
    df.groupby("purchase_date")
    .agg(
        # Demand figure: sum of stock_quantity across the day's rows.
        total_qty=pd.NamedAgg(column="stock_quantity", aggfunc="sum"),
        total_revenue=pd.NamedAgg(column="line_revenue", aggfunc="sum"),
        avg_discount=pd.NamedAgg(column="markdown_percentage", aggfunc="mean"),
        avg_rating=pd.NamedAgg(column="customer_rating", aggfunc="mean"),
    )
    .reset_index()
    .sort_values("purchase_date")
    # Rebuild the calendar features from the date, since the row-level ones
    # are not carried through the aggregation.
    .assign(
        dayofweek=lambda day: day["purchase_date"].dt.dayofweek,
        is_weekend=lambda day: (day["dayofweek"] >= 5).astype(int),
        month=lambda day: day["purchase_date"].dt.month,
        year=lambda day: day["purchase_date"].dt.year,
    )
)

daily.head()


Unnamed: 0,purchase_date,total_qty,total_revenue,avg_discount,avg_rating,dayofweek,is_weekend,month,year
0,2024-08-06,36,1324.8,43.0,3.9,1,0,8,2024
1,2024-08-07,44,2412.92,16.7,3.2,2,0,8,2024
2,2024-08-08,44,5467.44,0.0,4.0,3,0,8,2024
3,2024-08-10,29,738.05,0.0,3.7,5,1,8,2024
4,2024-08-12,115,12813.79,14.866667,2.233333,0,0,8,2024


In [6]:
# Persist the daily aggregate as CSV inside the processed-data folder.
# index=False keeps the DataFrame's row index out of the file.
OUT_PATH = os.path.join(PROCESSED_DIR, "daily_store_sales.csv")
daily.to_csv(path_or_buf=OUT_PATH, index=False)

# Echo the destination path as the cell output.
OUT_PATH


'./data/processed/daily_store_sales.csv'