## Load the segmented data

In [1]:
# Import necessary libraries
import pandas as pd

# Load the segmented dataset
data_path = '/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/data/processed/segmented_data.csv'
data = pd.read_csv(data_path)

# Display the first few rows of the dataset
print(data.head())

     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  AcceptedCmp2  Complain  Z_CostContact  \
0  04-09-2012       58       635  ...             0         0              3   
1  08-03-2014       38        11  ...             0         0              3   
2  21-08-2013       26       426  ...             0         0              3   
3  10-02-2014       26        11  ...             0         0              3   
4  19-01-2014       94       173  ...             0         0              3   

   Z_Revenue  Response  Age  Total_Children  Total

## Prepare data for modelling

In [2]:
from sklearn.model_selection import train_test_split

# Define features and target variable
features = data[['Age', 'Total_Spend', 'Total_Children', 'Income', 'Recency', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']]
target = data['Response']

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=['Education', 'Marital_Status'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

## Develop and Evaluate the Propensity Model

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import joblib

# Define the model and parameters
model = LogisticRegression(max_iter=10000)
param_grid = {'C': [0.1, 1, 10, 100]}

# Perform GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'AUC-ROC: {auc}')

# Save the trained propensity model
model_path = '/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/models/propensity_model_updated.pkl'
joblib.dump(best_model, model_path)


Accuracy: 0.84375
Precision: 0.4782608695652174
Recall: 0.15942028985507245
AUC-ROC: 0.5638790103628925


['/Users/agalyaayyadurai/PycharmProjects/Customer-Value-Insights/models/propensity_model_updated.pkl']