# Predicting Customer Churn

**Business Use Cases:**

- Growth Planning: Identifying potential churn enables proactive efforts to retain customers, ensuring a steady customer base for business expansion.

- Demand Shaping: Predicting churn helps tailor incentives and experiences to maintain demand for products or services.

- Sales Planning: Churn prediction allows sales teams to engage with at-risk customers, offering solutions to retain them.

- Offers Rollout: Businesses can target churn-prone customers with special deals to encourage them to stay.


In [1]:
from datetime import datetime
import pandas as pd
from feast import FeatureStore

In [None]:
# Initialize a Feast FeatureStore; every retrieval call below goes through `store`.
# NOTE(review): repo_path="." presumably means the Feast repo configuration is
# looked up in the current working directory — confirm the notebook is launched
# from the feature repository root.
store = FeatureStore(repo_path=".")


# Entity

In [42]:
# Build the entity frame: one (customer_id, event_timestamp) pair per row for
# which historical features will be looked up. The id column is cast to int32
# as part of the same expression.
entity_df = pd.DataFrame(
    {
        "customer_id": [1, 2, 3, 4, 230],
        "event_timestamp": [
            datetime(1992, 4, 2, 10, 59, 42),
            datetime(1992, 4, 2, 8, 12, 10),
            datetime(1992, 4, 2, 16, 40, 26),
            datetime(1992, 4, 2, 15, 1, 12),
            datetime(1992, 4, 4, 15, 1, 12),
        ],
    }
).astype({"customer_id": "int32"})


In [43]:
# Eyeball the entity rows before handing them to the feature store.
entity_df.head()

Unnamed: 0,customer_id,event_timestamp
0,1,1992-04-02 10:59:42
1,2,1992-04-02 08:12:10
2,3,1992-04-02 16:40:26
3,4,1992-04-02 15:01:12
4,230,1992-04-04 15:01:12


In [None]:
# Check column dtypes, in particular that 'customer_id' was cast to int32.
entity_df.info()

# Offline Retrieval

In [45]:
# Features to join onto the entity rows, written as "<feature_view>:<feature>".
feature_refs = [
    "customer_hourly_stats:category",
    "customer_hourly_stats:order_gmv",
    "customer_hourly_stats:sex",
]

# Ask the offline store for the historical values of those features for every
# (customer_id, event_timestamp) row, then materialise the result as a DataFrame.
historical_job = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=feature_refs,
)
training_df = historical_job.to_df()

In [46]:
# Inspect the joined result: entity columns plus one column per requested feature.
training_df.head()

Unnamed: 0,event_timestamp,customer_id,customer_hourly_stats__category,customer_hourly_stats__order_gmv,customer_hourly_stats__sex
0,1992-04-02 08:12:10+00:00,2,1,71.283302,female
1,1992-04-02 10:59:42+00:00,1,3,7.25,male
2,1992-04-02 15:01:12+00:00,4,1,53.099998,female
3,1992-04-02 16:40:26+00:00,3,3,7.925,female
4,1992-04-04 15:01:12+00:00,230,3,25.4667,female


# Online Retrieval

In [None]:
# Online (low-latency) lookup of the latest feature values per customer.

# Same three features as the offline example above.
feature_refs = [
    "customer_hourly_stats:category",
    "customer_hourly_stats:order_gmv",
    "customer_hourly_stats:sex",
]

# One entity row per customer to look up.
entity_rows = [{"customer_id": customer_id} for customer_id in (3, 2, 82323)]

# Query the online store and convert the response to a plain dict of lists.
online_response = store.get_online_features(
    feature_refs=feature_refs,
    entity_rows=entity_rows,
)
feature_vector = online_response.to_dict()


In [48]:
# Show the raw online response (a dict mapping each column name to a list of values).
feature_vector

{'customer_id': [3, 2, 82323],
 'customer_hourly_stats__sex': ['female', 'female', None],
 'customer_hourly_stats__category': ['3', '1', None],
 'customer_hourly_stats__order_gmv': [7.925000190734863,
  71.2833023071289,
  None]}

# Training Data Generation

In [49]:
# Build the entity frame for training-data generation: customer ids
# 1..n_customers, all looked up as of the same snapshot timestamp.
n_customers = 891
snapshot_time = datetime(1992, 5, 1, 0, 0, 0)

entity_df = pd.DataFrame.from_dict(
    {
        "customer_id": list(range(1, n_customers + 1)),
        # Every customer shares the same point-in-time cut-off.
        "event_timestamp": [snapshot_time] * n_customers,
    }
)

# Ensure that the 'customer_id' column is of type int32
entity_df["customer_id"] = entity_df["customer_id"].astype("int32")


In [50]:
# Confirm the number of entity rows that will be sent to the feature store.
entity_df.shape

(891, 2)

In [51]:
# Build the training set from the offline store.

# Model inputs plus the 'churned' label, all from the same feature view.
feature_refs = [
    "customer_hourly_stats:category",
    "customer_hourly_stats:order_gmv",
    "customer_hourly_stats:sex",
    "customer_hourly_stats:age",
    "customer_hourly_stats:credit_type",
    "customer_hourly_stats:churned",
]

# Join the features onto every entity row and materialise as a DataFrame.
historical_job = store.get_historical_features(
    entity_df=entity_df,
    feature_refs=feature_refs,
)
training_df = historical_job.to_df()

# Discard any row that has a missing value in any column.
training_df.dropna(inplace=True)


In [None]:
# Inspect the first rows of the training set (inputs and the 'churned' label).
training_df.head(10)

In [None]:
# Row count after dropna(): shows how many entity rows survived the missing-value filter.
training_df.shape

# Model Training

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

In [55]:
# Import necessary libraries and modules
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Define a function to calculate AUC (Area Under the Curve) for ROC (Receiver Operating Characteristic)
def get_auc(labels, scores):
    """Compute the ROC curve for ``scores`` against ``labels`` and its area.

    Args:
        labels: True labels, passed straight to ``roc_curve``.
        scores: Predicted scores, passed straight to ``roc_curve``.

    Returns:
        A ``(fpr, tpr, auc_score)`` tuple: the false-positive and true-positive
        rates returned by ``roc_curve`` and the area under that curve.
    """
    # The thresholds returned by roc_curve are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, scores)
    return false_pos_rate, true_pos_rate, auc(false_pos_rate, true_pos_rate)

# Define a function to plot a metric
def plot_metric(ax, x, y, x_label, y_label, plot_label, style="-"):
    """Draw one labelled curve on ``ax`` and caption both axes.

    Args:
        ax: Object to draw on; must provide ``plot``, ``legend``,
            ``set_ylabel`` and ``set_xlabel``.
        x: X coordinates of the curve.
        y: Y coordinates of the curve.
        x_label: Caption for the x axis.
        y_label: Caption for the y axis.
        plot_label: Legend entry for this curve.
        style: Format string forwarded to ``ax.plot`` (default ``"-"``).
    """
    ax.plot(x, y, style, label=plot_label)
    ax.legend()  # called after plotting so the new curve is listed
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)

# Define a function to summarize predictions and plot ROC curve
def prediction_summary(labels, predicted_score, info, plot_baseline=True, axes=None):
    """Plot the ROC curve for a set of predictions and report its AUC.

    Args:
        labels: True labels.
        predicted_score: Predicted scores for the same rows as ``labels``.
        info: Text placed in front of "AUC = ..." in the legend entry.
        plot_baseline: When true, also draw the dashed diagonal baseline.
        axes: Optional list whose first element is the axes to draw on; when
            omitted, a new subplot in the left half of a 1x2 grid is created.

    Returns:
        ``(axes, auc_score)``: the list of axes used and the ROC AUC value.
    """
    axes = [plt.subplot(1, 2, 1)] if axes is None else axes
    roc_ax = axes[0]
    x_caption, y_caption = "False positive rate", "True positive rate"

    fpr, tpr, auc_score = get_auc(labels, predicted_score)
    curve_label = "{} AUC = {:.4f}".format(info, auc_score)
    plot_metric(roc_ax, fpr, tpr, x_caption, y_caption, curve_label)

    if plot_baseline:
        # Diagonal from (0, 0) to (1, 1), drawn as a red dashed line.
        plot_metric(roc_ax, [0, 1], [0, 1], x_caption, y_caption,
                    "baseline AUC = 0.5", "r--")

    plt.show()
    return axes, auc_score

# Define a function to create a figure for plotting
def figure():
    """Create a new matplotlib figure twice as wide as it is tall (9 x 4.5).

    The figure becomes pyplot's current figure; nothing is returned.
    """
    height = 4.5
    new_fig = plt.figure()
    new_fig.set_figheight(height)
    new_fig.set_figwidth(height * 2)


In [56]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
pdf_train, pdf_test = train_test_split(
    training_df,
    test_size=0.2,
    random_state=123,
)

# Report (rows, columns) for each partition.
print(pdf_train.shape, pdf_test.shape)


(711, 8) (178, 8)


In [57]:
# Model input columns, spelled as they appear in `training_df`
# ("<feature_view>__<feature>"). The label column
# 'customer_hourly_stats__churned' is not listed here; it is passed separately
# as the target when the models are fitted.
features = [
    'customer_hourly_stats__category',
    'customer_hourly_stats__order_gmv',
    'customer_hourly_stats__sex',
    'customer_hourly_stats__age',
    'customer_hourly_stats__credit_type'
]


In [None]:
# Select the model inputs and tag each row with the partition it came from.
# `.assign` returns a new DataFrame, so nothing is written onto a slice of
# pdf_train / pdf_test (the pattern that raises pandas' SettingWithCopyWarning).
X_train = pdf_train[features].assign(is_train=1)
X_test = pdf_test[features].assign(is_train=0)

# Encode train and test together so both end up with the same dummy columns,
# even if a category value occurs in only one of the two partitions.
X = pd.get_dummies(data=pd.concat([X_train, X_test]))

# Split back by the partition flag and drop the flag. A non-inplace `drop`
# returns fresh frames instead of mutating a filtered view of X.
X_train = X[X.is_train == 1].drop(columns=["is_train"])
X_test = X[X.is_train == 0].drop(columns=["is_train"])


In [None]:
# Random-forest classifier for churn.
scikit_rf = RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    random_state=1234,  # Fixed seed for reproducible results
    max_depth=6,  # Maximum depth of each tree
    n_jobs=-1,  # Use all available CPU cores for parallel processing
)

# Fit on the encoded training inputs. The target is passed as a 1-D Series
# (single brackets); a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning about a column-vector y.
scikit_rf.fit(X_train, pdf_train["customer_hourly_stats__churned"])

# Class probabilities for the held-out rows, one column per class.
predictions_scikit_rf = scikit_rf.predict_proba(X_test)

# Keep the second probability column as the score.
# NOTE(review): this is presumably P(churned == 1); confirm via scikit_rf.classes_.
pdf_test['p1'] = predictions_scikit_rf[:, 1]


In [None]:
# Create a figure for plotting
figure()

# Plot the ROC curve for the random-forest scores currently stored in 'p1'.
# The curve is named in the legend so it can be told apart from other models'
# plots (an empty label rendered as " AUC = ...").
axes, auc_score = prediction_summary(
    pdf_test["customer_hourly_stats__churned"],
    pdf_test["p1"],
    "Random forest",
)


In [None]:
# Gradient-boosting classifier for churn.
scikit_gb = GradientBoostingClassifier(
    n_estimators=50,  # Number of boosting stages (trees)
    random_state=1234,  # Fixed seed for reproducible results
    max_depth=6,  # Maximum depth of each tree in the ensemble
)

# Fit on the encoded training inputs. The target is passed as a 1-D Series
# (single brackets); a one-column DataFrame makes scikit-learn emit a
# DataConversionWarning about a column-vector y.
scikit_gb.fit(X_train, pdf_train["customer_hourly_stats__churned"])

# Class probabilities for the held-out rows, one column per class.
predictions_scikit_gb = scikit_gb.predict_proba(X_test)

# Keep the second probability column as the score. This overwrites the 'p1'
# column written by the random-forest cell.
# NOTE(review): this is presumably P(churned == 1); confirm via scikit_gb.classes_.
pdf_test['p1'] = predictions_scikit_gb[:, 1]


In [None]:
# Create a figure for plotting
figure()

# Plot the ROC curve for the gradient-boosting scores currently stored in 'p1'.
# The curve is named in the legend so it can be told apart from other models'
# plots (an empty label rendered as " AUC = ...").
axes, auc_score = prediction_summary(
    pdf_test["customer_hourly_stats__churned"],
    pdf_test["p1"],
    "Gradient boosting",
)


# Real-Time Prediction

In [None]:
# Look up the latest feature values for a handful of customers in the online store.
online_response = store.get_online_features(
    feature_refs=[
        "customer_hourly_stats:category",
        "customer_hourly_stats:order_gmv",
        "customer_hourly_stats:sex",
        "customer_hourly_stats:age",
        "customer_hourly_stats:credit_type",
        "customer_hourly_stats:churned",
    ],
    entity_rows=[{"customer_id": customer_id} for customer_id in (23, 2, 6, 7, 1, 4, 10)],
)
pred_df = online_response.to_df()

# Discard any row that has a missing value in any column.
pred_df.dropna(inplace=True)


In [None]:
# Inspect the online rows that remain after dropping incomplete ones.
pred_df.head(10)

In [None]:
# One-hot encode the model inputs, then align the result to the exact columns
# (and column order) the models were fitted on. Encoding only these few rows
# yields dummy columns just for the category values that happen to be present,
# so categories absent here are added as all-zero columns, and any column the
# model never saw is dropped.
pred_df = pd.get_dummies(data=pred_df[features]).reindex(
    columns=X_train.columns,
    fill_value=0,
)


In [66]:
# Check the encoded columns that will be fed to the model.
pred_df.head()

Unnamed: 0,customer_hourly_stats__order_gmv,customer_hourly_stats__age,customer_hourly_stats__category_1,customer_hourly_stats__category_2,customer_hourly_stats__category_3,customer_hourly_stats__sex_female,customer_hourly_stats__sex_male,customer_hourly_stats__credit_type_C,customer_hourly_stats__credit_type_Q,customer_hourly_stats__credit_type_S
0,8.0292,15,0,0,1,1,0,0,1,0
1,71.283302,38,1,0,0,1,0,1,0,0
2,8.4583,46,0,0,1,0,1,0,1,0
3,51.862499,54,1,0,0,0,1,0,0,1
4,7.25,22,0,0,1,0,1,0,0,1


In [None]:
# Predict class labels (via `predict`, not `predict_proba`) for the encoded online
# rows with the random-forest model; the gradient-boosting model is not used here.
predictions = scikit_rf.predict(pred_df)


---