In [1]:
# 03_churn_modeling.ipynb
from pathlib import Path

import joblib
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read the cleaned customer-behaviour table into a DataFrame
df = pd.read_csv(
    '../data/customer_behavior_clean.csv',
)

In [3]:
# Peek at the first five raw target labels before encoding
df['churned'].head(5)

0    Yes
1     No
2     No
3    Yes
4     No
Name: churned, dtype: object

In [4]:
# Encode target
# Map the 'Yes'/'No' labels to 1/0.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column through the same dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['churned']):
    churn_encoded = df['churned'].map({'Yes':1,'No':0})
    # .map() yields NaN for any label outside the dict; fail loudly instead of
    # letting NaN targets reach the split / fit steps.
    if churn_encoded.isna().any():
        unexpected = df.loc[churn_encoded.isna(), 'churned'].unique().tolist()
        raise ValueError(f"Unexpected values in 'churned': {unexpected}")
    df['churned'] = churn_encoded.astype('int64')
df['churned'].head()

0    1
1    0
2    0
3    1
4    0
Name: churned, dtype: int64

In [5]:
# Separate the churn label (y) from the predictor columns (X)
y = df['churned']
X = df.drop(columns=['churned'])

In [6]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time)
preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'annual_income', 'spending_score']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender', 'purchase_history']),
    ]
)

In [12]:
# Candidate classifiers, keyed by the label shown in the results table.
# Seeds are fixed where the estimator is stochastic.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=200),
    KNN=KNeighborsClassifier(n_neighbors=5),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42),
)

In [13]:
# Hold out 25% of the rows for testing; stratify on y so both splits keep
# the same churn ratio, and fix the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [15]:
# Train & evaluate
# Fit one pipeline per candidate model on the training split and record
# test-set metrics for the positive (churned == 1) class, plus the confusion
# matrix of each model keyed by its name.
results = []
cms = {}
for name, model in models.items():
    # clone() hands every pipeline its own unfitted preprocessor and
    # classifier, so candidates do not share fitted state through the
    # module-level `preprocess` / `models` objects.
    pipe = Pipeline([('prep', clone(preprocess)), ('clf', clone(model))])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    cms[name] = confusion_matrix(y_test, y_pred)
    # zero_division=0 reports precision/F1 as 0.0 when a model never predicts
    # churn, rather than emitting an UndefinedMetricWarning for each average.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    # With output_dict=True the per-class metrics are keyed by the label's
    # string form, hence '1' for the encoded churn class.
    churn_metrics = report['1']
    results.append([name, report['accuracy'], churn_metrics['precision'],
                    churn_metrics['recall'], churn_metrics['f1-score']])
# Results: one row per model, best churn-class F1 first
results_df = pd.DataFrame(results, columns=['Model','Accuracy','Precision','Recall','F1']).sort_values('F1', ascending=False)
results_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
2,DecisionTree,0.666667,0.388889,0.333333,0.358974
3,RandomForest,0.706667,0.444444,0.190476,0.266667
1,KNN,0.706667,0.4,0.095238,0.153846
0,LogisticRegression,0.72,0.0,0.0,0.0


In [21]:
# Name of the top-ranked model (results_df is sorted by F1, best first),
# shown together with its test-set confusion matrix
best_model_name = results_df['Model'].iloc[0]
best_model_name, cms[best_model_name]

('DecisionTree',
 array([[43, 11],
        [14,  7]]))

In [18]:
# Feature importance/coefficients for best model if available

# Get the best model from previous evaluation
best_model = models[best_model_name]

# Refit it inside a two-step pipeline ('prep' = preprocessing, 'clf' = the
# classifier) so both fitted steps can be inspected below.
pipe = Pipeline([('prep', preprocess), ('clf', best_model)])
pipe.fit(X_train, y_train)

# Ask the fitted ColumnTransformer for its output column names instead of
# re-typing the column lists here: this keeps the names in the exact order of
# the matrix the classifier saw, even if the preprocessing cell changes.
# Names come back as '<transformer>__<feature>' (e.g. 'num__age'); strip the
# transformer prefix to get the plain feature name.
prep = pipe.named_steps['prep']
all_features = [name.split('__', 1)[-1] for name in prep.get_feature_names_out()]

# Get the trained classifier model from the pipeline
model = pipe.named_steps['clf']

# Tree-based models expose feature_importances_
if hasattr(model, 'feature_importances_'):
    values = model.feature_importances_

# Linear models expose coef_; use the absolute coefficient as importance
elif hasattr(model, 'coef_'):
    values = np.abs(model.coef_[0])

# Neither attribute exists (e.g. KNN): no importance available, fill with NaN
else:
    values = np.full(len(all_features), np.nan)

# One row per feature, most important first
feature_ranking = (
    pd.DataFrame({'Feature': all_features, 'Importance': values})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# Display the top 5 features by importance
feature_ranking.head(5)

Unnamed: 0,Feature,Importance
0,annual_income,0.315873
1,age,0.281497
2,spending_score,0.265041
3,purchase_history_Groceries,0.039123
4,purchase_history_Electronics,0.032104


In [23]:
#Saving Models
# Refit the winning configuration as a self-contained pipeline
# (preprocessing + classifier) and persist it with joblib.
# NOTE: joblib files are pickles; only load them from trusted sources.
model_path = '../models/churn_model.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Build the pipeline from fresh clones of the selected classifier and the
# preprocessor: the saved artifact then always matches `best_model_name`
# (rather than a hard-coded estimator) and shares no fitted state with the
# objects used in the earlier cells.
best_model = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', clone(models[best_model_name]))
])

best_model.fit(X_train, y_train)

joblib.dump(best_model, model_path)


['../models/churn_model.pkl']

In [None]:
"""
3. Churn Modeling 
The objective of this notebook is to predict customer churn using supervised machine learning models.
Data Preparation:
    •	The cleaned dataset was loaded, and the target column churned was encoded as 1 (Yes) and 0 (No).
    •	Features included age, annual_income, spending_score, gender, and purchase_history.
    •	Numeric features were standardized, and categorical features were one-hot encoded.
    •	Data was split into training (75%) and test (25%) sets with stratification to preserve class distribution.
Model Training and Evaluation:
    •	Four models were trained: Logistic Regression, K-Nearest Neighbors (KNN), Decision Tree, and Random Forest.
    •	Models were evaluated using Accuracy, Precision, Recall, and F1-score for the churned class.
Results Summary:
    •	Decision Tree: F1 = 0.36, Precision = 0.39, Recall = 0.33 – best balance for detecting churned customers.
    •	Random Forest: Higher overall accuracy (0.71) but low F1 (0.27) for churned class.
    •	KNN: Low recall (0.095), missing most churned cases.
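    •	Logistic Regression: Failed to identify churned customers (F1 = 0). Its accuracy of 0.72 equals the share of non-churned customers in the test set (54 of 75), i.e. it predicted 'No' for every customer. This is likely due to class imbalance combined with weak or non-linear feature relationships.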
Conclusion:
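The Decision Tree is the most suitable of the four models for churn prediction in this dataset: it can capture non-linear patterns and achieves the best F1 for the churned class. However, its recall of 0.33 means it flags only about one in three churned customers in the test set (7 of 21), so it should be treated as a baseline for identifying at-risk customers for retention strategies rather than a finished model.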

"""