# Modelling: High Value and Satisfaction Prediction

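This notebook fits a logistic regression and an XGBoost classifier to predict whether a customer is high value, and a logistic regression classifier to predict whether a customer is satisfied. For each model it reports accuracy, ROC–AUC, and a confusion matrix. **TODO:** display feature importances.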

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Read the cleaned customer table; `purchase_date` is parsed as a datetime column.
df = pd.read_csv(
    '../data/processed/customers_clean.csv',
    parse_dates=['purchase_date'],
)

# Feature groups shared by every model in this notebook.
categorical_cols = ['gender', 'city', 'membership_type', 'discount_applied']
numeric_cols = [
    'age',
    'total_spend',
    'items_purchased',
    'avg_rating',
    'days_since_last_purchase',
]

# One-hot encode the categorical columns (dropping the first level of each)
# and forward the numeric columns unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

In [None]:
# High‑value classification
# Features are the shared categorical + numeric columns; the target is the
# `is_high_value` column of the cleaned customer table.
X = df[categorical_cols + numeric_cols]
y = df['is_high_value']

# Stratify on the target so both splits keep the same class proportions;
# on a small dataset an unstratified 20% hold-out can be badly unbalanced.
# NOTE: stratification requires at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# clone() gives this pipeline its own unfitted copy of the shared
# `preprocess` transformer. Pipeline fits its steps in place, so passing the
# same object to several pipelines would let a later fit overwrite the
# encoder this model was trained with.
log_reg = Pipeline([
    ('preprocess', clone(preprocess)),
    ('model', LogisticRegression(max_iter=1000))
])

log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
# Column 1 holds the predicted probability of the second entry in `classes_`.
y_prob = log_reg.predict_proba(X_test)[:,1]
print('High‑value accuracy:', accuracy_score(y_test, y_pred))
print('High‑value ROC‑AUC:', roc_auc_score(y_test, y_prob))
# The newline is written as an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

In [None]:
# XGBoost model for high‑value classification
# Trains and evaluates on the X_train / X_test / y_train / y_test variables
# currently in the kernel; run this cell directly after the high-value
# logistic regression cell so they refer to the `is_high_value` split.
#
# clone() gives this pipeline its own unfitted copy of the shared
# `preprocess` transformer. Pipeline fits its steps in place, so reusing the
# same object across pipelines would let one fit overwrite another's encoder.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost
# releases — confirm it is still needed for the pinned version.
xgb = Pipeline([
    ('preprocess', clone(preprocess)),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
# Column 1 holds the predicted probability of the second entry in `classes_`.
y_prob_xgb = xgb.predict_proba(X_test)[:,1]
print('XGBoost accuracy:', accuracy_score(y_test, y_pred_xgb))
print('XGBoost ROC‑AUC:', roc_auc_score(y_test, y_prob_xgb))
# The newline is written as an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_xgb))

In [None]:
# Satisfaction classification
# Target is True where `satisfaction_level` equals 'Satisfied', else False.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train and y_test. Any
# earlier cell re-run after this one will see the satisfaction split, so
# use Restart & Run All rather than re-running cells out of order.
X = df[categorical_cols + numeric_cols]
y = df['satisfaction_level'].eq('Satisfied')

# Stratify on the target so both splits keep the same class proportions;
# on a small dataset an unstratified 20% hold-out can be badly unbalanced.
# NOTE: stratification requires at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# clone() gives this pipeline its own unfitted copy of the shared
# `preprocess` transformer. Pipeline fits its steps in place, so reusing the
# same object would refit the encoder held by previously trained pipelines.
log_reg_s = Pipeline([
    ('preprocess', clone(preprocess)),
    ('model', LogisticRegression(max_iter=1000))
])

log_reg_s.fit(X_train, y_train)
y_pred_s = log_reg_s.predict(X_test)
# Column 1 holds the predicted probability of the second entry in `classes_`.
y_prob_s = log_reg_s.predict_proba(X_test)[:,1]
print('Satisfaction accuracy:', accuracy_score(y_test, y_pred_s))
print('Satisfaction ROC‑AUC:', roc_auc_score(y_test, y_prob_s))
# The newline is written as an escape sequence: a raw line break inside a
# single-quoted string literal is a syntax error.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_s))

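The logistic regression and XGBoost classifiers achieve very high accuracy and ROC–AUC scores on this small dataset. This suggests that the classes are easy to separate given the provided features. Near-perfect scores can also be a sign of target leakage (for example, a label that was derived from one of the input features), so it is worth checking how the labels were constructed. In a production setting you would validate the models on a larger, independent dataset to check for overfitting, and consider additional features.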