In [4]:
# 1. IMPORT
import pandas as pd
from datetime import timedelta
from sklearn.model_selection import train_test_split

# 2. LOAD DATA
# Read the three raw tables used below from the local data directory.
orders = pd.read_csv('../data/olist_orders_dataset.csv')
customers = pd.read_csv('../data/olist_customers_dataset.csv')
payments = pd.read_csv('../data/olist_order_payments_dataset.csv')

# Join orders with customers. The left join keeps every order row even when
# no customer record matches; the purchase timestamp is then parsed into
# datetimes so it can be compared and subtracted later.
orders_customers = pd.merge(orders, customers, how='left', on='customer_id')
orders_customers = orders_customers.assign(
    order_purchase_timestamp=pd.to_datetime(
        orders_customers['order_purchase_timestamp']
    )
)

# Join payments. Only the order key and the paid amount are carried over.
# Because this is a left join, an order with several payment rows appears
# once per payment row, and an order without payments keeps a missing value.
full_data = pd.merge(
    orders_customers,
    payments[['order_id', 'payment_value']],
    how='left',
    on='order_id',
)

# 3. SPLIT BY CUSTOMER
# The split is drawn over unique customer ids rather than over rows, so all
# rows belonging to one customer land on the same side of the split.
unique_ids = full_data['customer_unique_id'].unique()
train_ids, test_ids = train_test_split(unique_ids, random_state=42, test_size=0.2)

# Materialise independent copies of each subset.
train_data, test_data = (
    full_data[full_data['customer_unique_id'].isin(ids)].copy()
    for ids in (train_ids, test_ids)
)

# 4. USE A CUTOFF DERIVED FROM THE TRAINING SET
# The reference date is one day after the latest purchase in the training data.
cutoff = train_data['order_purchase_timestamp'].max()
cutoff += timedelta(days=1)


In [5]:
# 5. COMPUTE RFM (FROM THE TRAINING SET ONLY)
# Named aggregation binds every output column to its source column and
# reducer explicitly, so the feature names cannot drift out of sync with the
# aggregation the way a positional `.columns = [...]` assignment can.
rfm_train = train_data.groupby('customer_unique_id').agg(
    last_purchase=('order_purchase_timestamp', 'max'),  # latest purchase per customer
    frequency=('order_id', 'nunique'),                  # number of distinct orders
    monetary=('payment_value', 'sum'),                  # total of all payment values
).reset_index()

# Recency: whole days between the cutoff and the customer's latest purchase.
# It is computed once over the whole column instead of through a Python
# lambda invoked per group. Popping the helper column and inserting the
# result at position 1 gives the column order
# customer_unique_id, recency, frequency, monetary.
rfm_train.insert(
    1,
    'recency',
    (cutoff - rfm_train.pop('last_purchase')).dt.days,
)

# 6. EXPORT
# Write the per-customer training features without the row index.
rfm_train.to_csv('../data/rfm_features_train.csv', index=False)