In [1]:
import numpy as np
import pandas as pd
import json
from flask import Flask, jsonify
from flask_cors import CORS
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as XGB

In [26]:
# Read in the tuned hyperparameters
try:
    # Open the JSON file
    with open('Models.json', 'r') as file:
        # Load the models
        models = json.load(file)
    # File is successfully opened and loaded
    print(models)
except FileNotFoundError:
    # Handle the FileNotFoundError
    print("The file 'Models.json' does not exist or cannot be opened.")

[[{'C': 1000, 'penalty': 'l1', 'solver': 'saga'}, {'C': 10, 'penalty': 'l2', 'solver': 'saga'}, {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}, {'C': 1, 'penalty': 'l1', 'solver': 'saga'}], [{'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 100, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'max_depth': 6, 'min_child_weight': 10, 'gamma': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8, 'lambda': None, 'alpha': None}, {'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 100, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'max_depth': 6, 'min_child_weight': 10, 'gamma': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8, 'lambda': None, 'alpha': None}, {'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 100, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'max_depth': 6, 'min_child_weight': 10, 'gamma': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8, 'lambda': None, 'alpha': None}, {'booster': 'gbtree', 'learning_rate': 0.1, 'n_est

In [3]:
# Read in the list of columns that were not used to train
try:
    # Open the JSON file
    with open('FeatureDropped.json', 'r') as file:
        # Load the models
        features_dropped = json.load(file)
    # File is successfully opened and loaded
    print(features_dropped)
except FileNotFoundError:
    # Handle the FileNotFoundError
    print("The file 'FeatureDropped.json' does not exist or cannot be opened.")

['Gender', 'ChurnDate', 'FirstPersona', 'SecondPersona', 'ThirdPersona', 'CombinedPersonas', 'CurrLifecycle', 'SocialInfluencer']


In [4]:
# Read in the train and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
input_features = [column for column in train_data.columns if column not in features_dropped]
print("Train: ", train_data.head())
print("\n")
print("Test: ", test_data.head())

Train:     Gender  Age  Tenure    Balance  NumOfProducts  EstimatedSalary   ChurnDate  \
0    Male   38      15  158584.28              1         76640.29         NaN   
1  Female   36       3   94216.05              1        179128.69  2024-03-28   
2  Female   35      11       0.00              2         19842.91         NaN   
3  Female   26      11  145980.23              1        131804.86  2023-09-08   
4    Male   34       1       0.00              2         10063.75         NaN   

   TransactionFreq  TransactionAmt  ServiceSupportFrequency  ...  \
0        15.092344   145925.493884                        8  ...   
1         9.192966    73541.740488                       35  ...   
2        29.023334    68936.936250                        7  ...   
3        29.562389    53097.327051                        3  ...   
4        53.452905     4434.315367                        8  ...   

   CurrLifecycle  Happiness Savings Savant Digital Dynamos Trustee Tribe  \
0         Active    

In [5]:
# Remove this after then has added this into his code
train_data['Active'] = (train_data['CurrLifecycle'] == 'Active').astype(int)
train_data['Reactivated'] = (train_data['CurrLifecycle'] == 'Reactivated').astype(int)
train_data['Dormant'] = (train_data['CurrLifecycle'] == 'Dormant').astype(int)
train_data['Churned'] = (train_data['CurrLifecycle'] == 'Churned').astype(int)
test_data['Active'] = (test_data['CurrLifecycle'] == 'Active').astype(int)
test_data['Reactivated'] = (test_data['CurrLifecycle'] == 'Reactivated').astype(int)
test_data['Dormant'] = (test_data['CurrLifecycle'] == 'Dormant').astype(int)
test_data['Churned'] = (test_data['CurrLifecycle'] == 'Churned').astype(int)

In [6]:
# Encode categorical variables for logistic regression
label_encoder = LabelEncoder()

for column in train_data.columns:
    if train_data[column].dtype == 'object':
        train_data[column] = label_encoder.fit_transform(train_data[column])
        test_data[column] = label_encoder.fit_transform(test_data[column])

In [7]:
# Copy data sets to do encoding and scaling for logistic regression
train_lgr = train_data.copy()
test_lgr = test_data.copy()

In [8]:
# Scale dataset for logistic regression
scaler = StandardScaler()

train_lgr_features = train_lgr[[column for column in train_lgr.columns if column not in ['Active', 'Reactivated', 'Dormant', 'Churned']]]
train_lgr_labels = train_lgr[[column for column in train_lgr.columns if column in ['Active', 'Reactivated', 'Dormant', 'Churned']]]

train_lgr_features = pd.DataFrame(scaler.fit_transform(train_lgr_features), columns = train_lgr_features.columns)
train_lgr = pd.concat([train_lgr_features, train_lgr_labels], axis = 1)


test_lgr_features = test_lgr[[column for column in test_lgr.columns if column not in ['Active', 'Reactivated', 'Dormant', 'Churned']]]
test_lgr_labels = test_lgr[[column for column in test_lgr.columns if column in ['Active', 'Reactivated', 'Dormant', 'Churned']]]

test_lgr_features = pd.DataFrame(scaler.fit_transform(test_lgr_features), columns = test_lgr_features.columns)
test_lgr = pd.concat([test_lgr_features, test_lgr_labels], axis = 1)

In [27]:
# Separate the lgr, xgb, and rf parameters into
# a list of 4 dictionaries with key value pairs of hyperparamters for Active, Reactivated, Dormant, and Churned Classification
lgr = models[0]
xgb = models[1]
rf = models[2]

In [10]:
y_train_all_lgr = [train_lgr['Active'], train_lgr['Reactivated'], train_lgr['Dormant'], train_lgr['Churned']]
y_test_all_lgr = [test_lgr['Active'], test_lgr['Reactivated'], test_lgr['Dormant'], test_lgr['Churned']]

y_train_all = [train_data['Active'], train_data['Reactivated'], train_data['Dormant'], train_data['Churned']]
y_test_all = [test_data['Active'], test_data['Reactivated'], test_data['Dormant'], test_data['Churned']]

In [18]:
np.unique(y_train_all_lgr[3])

array([0, 1])

In [28]:
lgr_models_list = []
xgb_models_list = []
rf_models_list = []
for i in range(4):
    # Train the lgr models and store in the list
    lgr_model = LogisticRegression(**lgr[i])
    lgr_model.fit(train_lgr[input_features], y_train_all_lgr[i])
    lgr_models_list.append(lgr_model)
    
    # Train the xgb models and store in the list
#     dtrain = XGB.DMatrix(data=train_data[input_features], label=y_train_all[i], enable_categorical=True)
#     xgb_model = XGB.train(xgb[i], dtrain)
    print(i)
    xgb_model = XGB.XGBClassifier(**xgb[i])
    xgb_model.fit(train_data[input_features], y_train_all[i])
    xgb_models_list.append(xgb_model)
    
    # Train the rf models and store in the list
    rf_model = RandomForestClassifier(**rf[i])
    rf_model.fit(train_data[input_features], y_train_all[i])
    rf_models_list.append(rf_model)

0
1


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [None]:
# Define the function to generate predictions from the trained models
def prediction(lgr, xgb, rf, lgr_data, features): 
    # Generate predictions for data
    for i in range(len(lgr)):
        # lgr[0], xgb[0], rf[0] is the model with binary label Active/Non-Active
        if i == 0:
            data['lgr_Active_proba'] = lgr[0].predict_proba(features)
            data['xgb_Active_proba'] = xgb[0].predict_proba(features)
            data['rf_Active_proba'] = rf[0].predict_proba(features)
            
        # lgr[1], xgb[1], rf[1] is the model with binary label Reactivated/Non-Reactivated
        elif i == 1:
            data['lgr_Reactivated_proba'] = lgr[1].predict_proba(features)
            data['xgb_Reactivated_proba'] = xgb[1].predict_proba(features)
            data['rf_Reactivated_proba'] = rf[1].predict_proba(features)
            
        # lgr[2], xgb[2], rf[2] is the model with binary label Dormant/Non-Dormant
        elif i == 2:
            data['lgr_Dormant_proba'] = lgr[2].predict_proba(features)
            data['xgb_Dormant_proba'] = xgb[2].predict_proba(features)
            data['rf_Dormant_proba'] = rf[2].predict_proba(features)
           
        # lgr[3], xgb[3], rf[3] is the model with binary label Churned/Non-Churned
        elif i == 3:
            data['lgr_Churned_proba'] = lgr[3].predict_proba(features)
            data['xgb_Churned_proba'] = xgb[3].predict_proba(features)
            data['rf_Churned_proba'] = rf[3].predict_proba(features)
    
    # Calculate the average probability from the probabilities generated by each model (Ensemble Learning)
    data['average_Active_proba'] = data[['lgr_Active_proba', 'xgb_Active_proba', 'rf_Active_proba']].agg(mean, axis = 1)
    data['average_Reactivated_proba'] = data[['lgr_Reactivated_proba', 'xgb_Reactivated_proba', 'rf_Reactivated_proba']].agg(mean, axis = 1)
    data['average_Dormant_proba'] = data[['lgr_Dormant_proba', 'xgb_Dormant_proba', 'rf_Dormant_proba']].agg(mean, axis = 1)
    data['average_Churned_proba'] = data[['lgr_Churned_proba', 'xgb_Churned_proba', 'rf_Churned_proba']].agg(mean, axis = 1)
    
    # Based on the definition of lifecycle, it is not possible for a customer to have the below stated transitions
    # Active -> Reactivated, Dormant -> Dormant, Dormant -> Active, Reactivated -> Reactivated
    # Hence, set the probabilities of these cases to 0
    data['average_Active_proba'] = np.where((data['Lifecycle'] == 'Dormant') & (data['average_Active_proba'] > 0), 0, data['average_Active_proba'])
    data['average_Reactivated_proba'] = np.where((data['Lifecycle'] in ('Active', 'Reactivated')) & (data['average_Reactivated_proba'] > 0), 0, data['average_Reactivated_proba'])
    data['average_Dormant_proba'] = np.where((data['Lifecycle'] == 'Dormant') & (data['average_Dormant_proba'] > 0), 0, data['average_Dormant_proba'])
    
    # The lifecycle with the highest probability will be the predicted lifecycle
    max_proba = data[['average_Active_proba', 'average_Reactivated_proba', 'average_Dormant_proba', 'average_Churned_proba']].agg(max, axis = 1)
    data['PredictedLifecycle'] = np.where(data['average_Active_proba'] == max_proba, 'Active', \
                                         +np.where(data['average_Reactivated_proba'] == max_proba, 'Reactivated', \
                                         +np.where(data['average_Dormant_proba'] == max_proba, 'Dormant', \
                                         +np.where(data['average_Churned_proba'] == max_proba, 'Churned'))))
    return data

In [None]:
# Filter the existing active customers from the train_data
train_drop_index = train_data[train_data['CurrLifecycle'] in ('Dormant', 'Churned')].index
train_active = train_data.drop(train_drop_index)

# Returns the df with predictions for train_active (This train data only consists of existing active customers)
train_active_prediction = prediction(lgr_models_list, xgb_models_list, rf_models_list, train_active, input_features)

# Returns the df with predictions for test_data 
# This test data consists of both active and churned customers to generate classification report for model performance evaluation
test_prediction = prediction(gr_models_list, xgb_models_list, rf_models_list, test_data, input_features)

# Generate classification report using the test predictions
report = classification_report(test_prediction['CurrLifecycle'], test_prediction['PredictedLifecycle'])

# Filter out the existing active customers from the test data
test_drop_index = test_prediction[test_prediction['CurrLifecycle'] in ('Dormant', 'Churned')].index
test_active_prediction = test_prediction.drop(test_drop_index)

# Concat the train_active_prediction and test_active_prediction data
predicted_data = pd.concat([train_active_prediction, test_active_prediction], ignore_index=True)    

In [None]:
report

In [None]:
predicted_data.head()

In [None]:
# Flask API to send report and predicted data to frontend
app = Flask(name)
CORS(app)

@app.route('/api/model')
def get_model():
    return jsonify(report)

@app.route('/api/data')
def get_data():
    return jsonify(predicted_data.to_dict())

if name == 'main':
    app.run(debug=True)