# In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------
# Step 1: Simulate sample data
# ---------------------------
# Fixed seed so every run produces the same synthetic customers and orders.
np.random.seed(42)

# Simulate 1000 customers with zero-padded IDs (CUST0000 ... CUST0999)
n_customers = 1000
customer_ids = [f"CUST{i:04d}" for i in range(n_customers)]

# All simulated orders fall within 365 days starting at this date.
first_date = datetime(2022, 1, 1)

# Create transaction data: each customer gets 1-9 orders, each with a random
# day offset and a random amount between 20 and 500 (rounded to cents).
# The draw order (order count, then day offset, then amount per order) is
# kept fixed so the seeded random stream yields the same data.
transactions = []
for customer_id in customer_ids:
    order_count = np.random.randint(1, 10)
    transactions.extend(
        [
            customer_id,
            first_date + timedelta(days=np.random.randint(0, 365)),
            round(np.random.uniform(20, 500), 2),
        ]
        for _ in range(order_count)
    )

df_txn = pd.DataFrame(
    transactions,
    columns=['CustomerID', 'TransactionDate', 'Amount'],
)

# ---------------------------
# Step 2: Feature Engineering
# ---------------------------
# Snapshot date against which recency and customer age are measured.
today = datetime(2023, 1, 1)

# Group and calculate features.
# Named aggregation binds every output column to its source column and
# function, so the column names no longer depend on the positional order of
# a flattened MultiIndex.
df_features = (
    df_txn.groupby('CustomerID')
    .agg(
        # Days since the most recent transaction
        Recency=('TransactionDate', lambda x: (today - x.max()).days),
        # Days since the first transaction
        CustomerAge=('TransactionDate', lambda x: (today - x.min()).days),
        # Number of transactions
        Frequency=('CustomerID', 'count'),
        # Total spend and average order value
        TotalSpend=('Amount', 'sum'),
        AOV=('Amount', 'mean'),
    )
    .reset_index()
)

# Simulated LTV (normally you would calculate future spend — here we simulate
# it as total spend scaled by a random factor between 0.8 and 1.2)
df_features['LTV'] = df_features['TotalSpend'] * np.random.uniform(0.8, 1.2, size=len(df_features))

# ---------------------------
# Step 3: Train/Test Split
# ---------------------------
# Predictor columns; the target is the simulated LTV column.
feature_columns = ['Recency', 'Frequency', 'AOV', 'CustomerAge', 'TotalSpend']
X = df_features[feature_columns]
y = df_features['LTV']

# Hold out 20% of customers for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------
# Step 4: Train XGBoost Model
# ---------------------------
# Regressor with library-default hyperparameters, fitted on the training split.
model = XGBRegressor()
model.fit(X_train, y_train)

# Predictions on the held-out split, used for evaluation below.
preds = model.predict(X_test)

# ---------------------------
# Step 5: Evaluate Model
# ---------------------------
# Mean absolute error on the held-out split.
mae = mean_absolute_error(y_test, preds)

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# ---------------------------
# Step 6: Predict & Segment
# ---------------------------
# Score every customer in the feature table (this includes the rows the model
# was fitted on, not only the held-out split).
df_features['Predicted_LTV'] = model.predict(X)

# Segment customers by quartile of predicted LTV, lowest to highest
segment_labels = ['Low', 'Medium', 'High', 'Top']
df_features['Segment'] = pd.qcut(
    df_features['Predicted_LTV'],
    q=4,
    labels=segment_labels,
)

# ---------------------------
# Step 7: Visualizations
# ---------------------------
# Histogram (30 bins) of predicted LTV with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df_features['Predicted_LTV'], bins=30, kde=True, ax=ax)
ax.set_title("Predicted LTV Distribution")
ax.set_xlabel("Predicted LTV")
plt.show()

# ---------------------------
# Step 8: Export CSV
# ---------------------------
# Write one row per customer with the predicted LTV and its quartile segment;
# the DataFrame index is omitted from the file.
export_columns = ['CustomerID', 'Predicted_LTV', 'Segment']
df_export = df_features[export_columns]
df_export.to_csv("customer_ltv_predictions.csv", index=False)
print("✅ CSV exported: customer_ltv_predictions.csv")
