In [35]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5


In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
N = 25000 #Total number of rows
print(np.random.seed(42))

None


In [3]:
age = np.random.randint(18, 75, size=N)
print(age)
print(len(age))

[56 69 46 ... 74 54 73]
25000


In [4]:
income = np.random.normal(60000, 30000, size=N).clip(5000, 300000)
print(income)
print(len(income))


[ 86669.63247551 111915.53855484  76198.52702265 ... 106024.36710153
  70050.40440022  66755.774414  ]
25000


In [5]:
location = np.random.choice(['Urban','Suburban','Rural'], size=N, p=[0.5,0.35,0.15])
print(location)
print(len(location))

['Urban' 'Urban' 'Rural' ... 'Urban' 'Rural' 'Urban']
25000


In [6]:
segment = np.random.choice(['SME','Retail','Corporate'], size=N, p=[0.2,0.6,0.2])

In [7]:
logins_per_month = np.random.gamma(2, 3, size=N)

In [8]:
avg_order_size = np.random.exponential(100, size=N).clip(5, 2000)

In [9]:
purchase_frequency_per_month = np.random.poisson(1.5, size=N).astype(float)

In [10]:
promo_response_rate = np.random.beta(2,6, size=N)

In [11]:
tenure_months = np.random.randint(1, 120, size=N)

In [12]:
total_spend_last_year = (avg_order_size * purchase_frequency_per_month * 12) * (1 + np.random.normal(0,0.3,size=N))

In [13]:
outstanding_balance = np.random.exponential(200, size=N)


In [14]:
discount_usage_rate = np.random.beta(2,5, size=N)

In [15]:
customer_service_interactions = np.random.poisson(0.7, size=N)

In [16]:
app_activity_score = np.random.normal(50, 20, size=N).clip(0,100)

In [17]:
churn_risk_score = np.clip(0.6 - (app_activity_score - 50)/200 - (tenure_months/240) + np.random.normal(0,0.05,size=N), 0,1)

In [18]:
base_clv = (
    0.002 * income +
    1.5 * total_spend_last_year +
    50 * (segment == 'Corporate').astype(float) +
    30 * (segment == 'SME').astype(float) +
    20 * app_activity_score +
    -400 * churn_risk_score +
    2 * tenure_months -
    10 * discount_usage_rate * total_spend_last_year
)


In [19]:
noise = np.random.normal(0, base_clv.std() * 0.25, size=N)
CLV_12m = (base_clv + noise).clip(0)

In [20]:
df = pd.DataFrame({
    'age': age,
    'income': income,
    'location': location,
    'segment': segment,
    'logins_per_month': logins_per_month,
    'avg_order_size': avg_order_size,
    'purchase_frequency_per_month': purchase_frequency_per_month,
    'promo_response_rate': promo_response_rate,
    'tenure_months': tenure_months,
    'total_spend_last_year': total_spend_last_year,
    'outstanding_balance': outstanding_balance,
    'discount_usage_rate': discount_usage_rate,
    'customer_service_interactions': customer_service_interactions,
    'app_activity_score': app_activity_score,
    'churn_risk_score': churn_risk_score,
    'CLV_12m': CLV_12m
})


In [21]:
df.head()

Unnamed: 0,age,income,location,segment,logins_per_month,avg_order_size,purchase_frequency_per_month,promo_response_rate,tenure_months,total_spend_last_year,outstanding_balance,discount_usage_rate,customer_service_interactions,app_activity_score,churn_risk_score,CLV_12m
0,56,86669.632476,Urban,Retail,2.119058,415.586555,0.0,0.166604,55,0.0,158.126026,0.345317,1,38.404764,0.430753,0.0
1,69,111915.538555,Urban,Retail,3.07099,42.247099,2.0,0.112674,110,596.023611,371.90324,0.411775,0,60.453654,0.037685,2006.294805
2,46,76198.527023,Rural,SME,3.027881,24.772901,1.0,0.321444,37,508.639169,77.956338,0.417457,1,47.01306,0.39364,0.0
3,32,47100.257742,Urban,Retail,8.785846,47.512154,0.0,0.314821,69,0.0,289.671398,0.432579,0,17.165017,0.445838,1567.938558
4,60,99700.227675,Urban,Retail,4.043159,61.533833,1.0,0.401783,88,674.23465,66.475289,0.47205,2,72.57969,0.143687,0.0


In [22]:
# -----------------------------
# 3. Train/test split
# -----------------------------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [23]:
# -----------------------------
# 4. Preprocessing & model pipeline
# -----------------------------
num_cols = ['age','income','logins_per_month','avg_order_size','purchase_frequency_per_month',
            'tenure_months','total_spend_last_year','outstanding_balance','discount_usage_rate',
            'customer_service_interactions','app_activity_score','promo_response_rate','churn_risk_score']
cat_cols = ['location','segment']

In [24]:
# Transformers
num_transformer = Pipeline([('scaler', StandardScaler())])
cat_transformer = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])


In [25]:
# XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)
pipe = Pipeline([('pre', preprocessor), ('model', xgb_model)])

In [26]:
# Hyperparameter grid (simple example)
param_grid = {
    'model__n_estimators': [100, 300],
    'model__max_depth': [4, 8],
    'model__learning_rate': [0.05, 0.1]
}

cv = KFold(n_splits=3, shuffle=True, random_state=42)

gs = GridSearchCV(pipe, param_grid, cv=cv, scoring='r2', n_jobs=-1, verbose=1)
gs.fit(train_df[num_cols + cat_cols], train_df['CLV_12m'])

best_model = gs.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [27]:
# -----------------------------
# 5. Predict on test data
# -----------------------------
preds = best_model.predict(test_df[num_cols + cat_cols])

r2 = r2_score(test_df['CLV_12m'], preds)
# rmse = mean_squared_error(test_df['CLV_12m'], preds, squared=False)
rmse = np.sqrt(mean_squared_error(test_df['CLV_12m'], preds))
print('RMSE:', rmse)

mae = mean_absolute_error(test_df['CLV_12m'], preds)

print('R²:', r2)
print('RMSE:', rmse)
print('MAE:', mae)

RMSE: 1166.7580650263262
R²: 0.5457574276764399
RMSE: 1166.7580650263262
MAE: 833.6646333563045


In [28]:

# -----------------------------
# 6. Segment into CLV tiers
# -----------------------------
# Quantile-based segmentation: Low=bottom50%, Medium=50-80%, High=top20%
pred_tiers = pd.qcut(preds, q=[0,0.5,0.8,1.0], labels=['Low','Medium','High'])
test_df['predicted_CLV_tier'] = pred_tiers

print(test_df[['CLV_12m','predicted_CLV_tier']].head())

           CLV_12m predicted_CLV_tier
6868      0.000000                Low
24016   435.682089             Medium
9668      0.000000                Low
13640  2387.756480               High
14018  3602.651240               High


In [29]:
# -----------------------------
# 7. Optional: business recommendation (example)
# -----------------------------
def recommend_action(row):
    if row['predicted_CLV_tier']=='High' and row['churn_risk_score']>0.5:
        return 'Retention Priority'
    elif row['predicted_CLV_tier'] in ['High','Medium']:
        return 'Upsell/Cross-sell'
    else:
        return 'Low-cost Nurture'

test_df['action'] = test_df.apply(recommend_action, axis=1)
print(test_df[['predicted_CLV_tier','churn_risk_score','action']].head())

      predicted_CLV_tier  churn_risk_score             action
6868                 Low          0.383933   Low-cost Nurture
24016             Medium          0.287611  Upsell/Cross-sell
9668                 Low          0.208142   Low-cost Nurture
13640               High          0.094464  Upsell/Cross-sell
14018               High          0.288104  Upsell/Cross-sell


In [34]:
# -----------------------------
# Train Linear Regression
# -----------------------------
lr_pipe = Pipeline([('pre', preprocessor),
                    ('model', LinearRegression())])
lr_pipe.fit(train_df[num_cols + cat_cols], train_df['CLV_12m'])
preds_lr = lr_pipe.predict(test_df[num_cols + cat_cols])

# -----------------------------
# Train Random Forest
# -----------------------------
rf_pipe = Pipeline([('pre', preprocessor),
                    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
rf_pipe.fit(train_df[num_cols + cat_cols], train_df['CLV_12m'])
preds_rf = rf_pipe.predict(test_df[num_cols + cat_cols])

# -----------------------------
# Compare all three models
# -----------------------------
def evaluate(name, y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    # rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"\n📊 {name}")
    print(f"R²: {r2:.3f}")
    # print(f"RMSE: {rmse:,.2f}")
    rmse = np.sqrt(mean_squared_error(test_df['CLV_12m'], preds))
    print('RMSE:', rmse)
    print(f"MAE: {mae:,.2f}")

evaluate("XGBoost", test_df['CLV_12m'], preds)
evaluate("Linear Regression", test_df['CLV_12m'], preds_lr)
evaluate("Random Forest", test_df['CLV_12m'], preds_rf)


📊 XGBoost
R²: 0.546
RMSE: 1166.7580650263262
MAE: 833.66

📊 Linear Regression
R²: 0.153
RMSE: 1166.7580650263262
MAE: 1,127.39

📊 Random Forest
R²: 0.531
RMSE: 1166.7580650263262
MAE: 836.85
