In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report, confusion_matrix

# Load Data from PostgreSQL
import os
from urllib.parse import quote_plus

# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the notebook. The fallbacks are the values
# this notebook originally hard-coded, which keeps existing setups working.
# TODO: drop the password fallback once LIBERTY_DB_PASSWORD is provisioned.
db_config = {
    'host': os.environ.get('LIBERTY_DB_HOST', 'localhost'),
    'database': os.environ.get('LIBERTY_DB_NAME', 'Liberty'),
    'user': os.environ.get('LIBERTY_DB_USER', 'postgres'),
    'password': os.environ.get('LIBERTY_DB_PASSWORD', 'abc'),
    'port': os.environ.get('LIBERTY_DB_PORT', '5432')
}

# Percent-encode user and password: characters such as '@', ':' or '/' inside
# them would otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Read the whole policy table into memory; column selection happens in the next cell.
query = 'SELECT * FROM public.policydata_with_fb_cc_pc_newfea_opti_correct;'
data = pd.read_sql(query, con=engine)

In [2]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, log_loss, roc_curve
import matplotlib.pyplot as plt

# Columns kept from the raw table. 'Policy Status' is the target; every other
# column (after the date columns are expanded below) becomes a model feature.
# NOTE(review): identifier-like columns ('customerid', 'policy no', chassis /
# engine / registration numbers) are kept as features — confirm this is intended.
selected_columns = [
                    'rto_risk_factor', 'ncb % previous year', 'state_risk_score', 'retention_rate_pct', 'total od premium_max', 'applicable discount with ncb',
                    'policy_wise_purchase', 'manufacturer_risk_rate', 'days_between_renewals', 'retention_streak', 'total od premium_mean', 'total od premium',
                    'firstpolicyyear', 'lag_1_tp_premium', 'total od premium_min', 'avg_premium_hist', 'lag_1_ncb', 'age', 'total tp premium_max', 'total tp premium_mean',
                    'total tp premium', 'total tp premium_min', 'lag_1_premium', 'previous_year_premium_ratio', 'total premium payable', 'total_revenue', 'gst',
                    'fuel_type_risk_factor', 'lag_1_od_premium', 'Customer_APV', 'segment_risk_score', 'vehicle idv', 'Policy Tenure', 'Number of claims', 'approved',
                    'claim_approval_rate', 'Customer Tenure', 'before gst add-on gwp', 'od_tp_ratio', 'add_on_adoption', 'CLV', 'idv_premium_ratio', 'Customer_APF',
                    'days_gap_prev_end_to_curr_start', 'customerid', 'Claim Happaned/Not', 'Cleaned Branch Name 2', 'Cleaned Chassis Number', 'Cleaned Engine Number',
                    'Cleaned Reg no', 'Cleaned State2', 'Cleaned Zone 2', 'biztype', 'corrected_name', 'make_clean', 'model_clean', 'product name', 'policy no',
                    'policy end date', 'policy start date', 'decline', 'tie up', 'variant', 'Policy Status'

]

# Date columns that are replaced by <col>_YEAR / <col>_MONTH / <col>_DAY features.
date_columns = ['policy start date', 'policy end date']


def _add_date_parts(frame, columns):
    """Return a copy of `frame` with each date column replaced by its parts.

    Each column in `columns` is parsed with `pd.to_datetime(errors='coerce')`
    (unparseable values become NaT), three columns `<col>_YEAR`, `<col>_MONTH`
    and `<col>_DAY` are appended, and the original date columns are dropped.
    """
    result = frame.copy()
    for col in columns:
        parsed = pd.to_datetime(result[col], errors='coerce')
        result[f'{col}_YEAR'] = parsed.dt.year
        result[f'{col}_MONTH'] = parsed.dt.month
        result[f'{col}_DAY'] = parsed.dt.day
    return result.drop(columns=columns)


def _fill_missing(frame):
    """Return a copy of `frame` with missing values filled.

    Object-dtype columns get the string 'missing'; every other column gets 0.
    """
    result = frame.copy()
    for column in result.columns:
        fill_value = 'missing' if result[column].dtype == 'object' else 0
        result[column] = result[column].fillna(fill_value)
    return result


# .copy() so the assignments below write to an independent frame rather than
# to a slice of the raw table (avoids SettingWithCopyWarning).
data = data[selected_columns].copy()

# Convert Policy End Date to datetime (needed for the open-customer filter)
data['policy end date'] = pd.to_datetime(data['policy end date'], errors='coerce')

# Separate Open Customers: status 'Open' with a policy end date in January–June 2025
open_customers = data[
    (data['Policy Status'] == 'Open') &
    (data['policy end date'].dt.year == 2025) &
    (data['policy end date'].dt.month.isin([1, 2, 3, 4, 5, 6]))
].copy()

# Keep only policies with a known outcome for training
data = data[data['Policy Status'].isin(['Renewed', 'Not Renewed'])].copy()

# Map Policy Status to binary: 1 = 'Not Renewed' (churn), 0 = 'Renewed'
data['Policy Status'] = (data['Policy Status'] == 'Not Renewed').astype('int64')

# Expand the date columns first, then fill missing values. The training and the
# scoring frame go through exactly the same steps in the same order, so a
# missing date is encoded identically in both (previously the training frame
# was filled before the dates were parsed and the scoring frame afterwards).
data = _fill_missing(_add_date_parts(data, date_columns))
open_customers = _fill_missing(_add_date_parts(open_customers, date_columns))

# Separate features and target variable for training
features = [col for col in data.columns if col != 'Policy Status']
missing_features = [col for col in features if col not in open_customers.columns]
assert not missing_features, f"open_customers is missing feature columns: {missing_features}"

X = data[features]
y = data['Policy Status']

# Balance the classes by randomly duplicating minority-class rows.
# NOTE: every metric computed later on (X, y) is therefore measured on the
# resampled training data, not on a held-out set.
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)

# Human-readable copy of the scoring frame, taken before label encoding;
# prediction columns are attached to this copy later.
open_customers_without_encoded = open_customers.copy()

In [3]:
# Apply label encoding to categorical features.
# Each object-dtype column of X is encoded with its own LabelEncoder (kept in
# `label_encoders`), and the same mapping is applied to open_customers.
label_encoders = {}
for column in X.columns:
    if X[column].dtype != 'object':
        continue

    label_encoder = LabelEncoder()
    X[column] = label_encoder.fit_transform(X[column].astype(str))
    label_encoders[column] = label_encoder

    # class label (string) -> integer code learned on the training data
    mapping_dict = {str(label): code for code, label in enumerate(label_encoder.classes_)}
    # Values never seen during training share one extra code after the last class.
    next_unique_value = len(mapping_dict)

    # The encoder was fitted on str-cast values, so the scoring values must be
    # cast the same way before the lookup; otherwise non-string objects
    # would never match a key and would all fall into the "unseen" code.
    open_customers[column] = (
        open_customers[column]
        .astype(str)
        .map(mapping_dict)
        .fillna(next_unique_value)
        .astype('int64')
    )

In [None]:
# Write the un-encoded open-customer frame to CSV (row index omitted).
# NOTE(review): in top-to-bottom order this cell runs before any prediction
# column has been added to the frame — confirm this export is still wanted.
rancat_csv_path = 'future prediction rancat.csv'
open_customers_without_encoded.to_csv(rancat_csv_path, index=False)

In [4]:
from sklearn.tree import DecisionTreeClassifier

# XGBoost model
import xgboost as xgb

# Baseline gradient-boosted classifier fitted on the resampled training data.
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
model.fit(X, y)

# Score the open customers (encoded frame, training feature order).
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Attach the results to the human-readable copy: class 1 means 'Not Renewed'.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

# Training-set predictions (same data the model was fitted on).
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

# Training-set metrics.
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = diagonal of the confusion matrix / row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy[0]
class_1_accuracy_train = per_class_accuracy[1]

# Report.
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

Predicted Renewed: 106676
Predicted Not Renewed: 252197
Train Accuracy: 0.8548907916314036
Train Log Loss: 0.3430168189956369
Train ROC AUC: 0.9285334925831624
Train Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.86      0.85      0.85   1895276
weighted avg       0.86      0.85      0.85   1895276

Class 0 Train Accuracy: 0.8457396178709592
Class 1 Train Accuracy: 0.864041965391848


In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: default 0.5
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

threshold = 0.5

# Training set evaluation (same data the model was fitted on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Default Threshold (0.5) ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status Default'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability Default'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Default Threshold (0.5) ===
Threshold: 0.5000
Train Accuracy: 0.8549
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.86      0.85      0.85   1895276
weighted avg       0.86      0.85      0.85   1895276


Predicted Renewed: 106676
Predicted Not Renewed: 252197


In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: sensitivity = specificity
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# ROC curve: sensitivity is tpr, specificity is 1 - fpr
fpr, tpr, thresholds = roc_curve(y, y_pred_proba_train)

# Pick the ROC threshold where the two are closest to each other
sens_spec_diff = np.abs(tpr - (1 - fpr))
threshold = thresholds[np.argmin(sens_spec_diff)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Sensitivity = Specificity Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status SensSpec'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability SensSpec'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Sensitivity = Specificity Threshold ===
Threshold: 0.5222
Train Accuracy: 0.8547
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85    947638
           1       0.85      0.85      0.85    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276


Predicted Renewed: 117530
Predicted Not Renewed: 241343


In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: max (sensitivity + specificity)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# ROC curve: sensitivity is tpr, specificity is 1 - fpr
fpr, tpr, thresholds = roc_curve(y, y_pred_proba_train)

# Pick the ROC threshold that maximises their sum
sens_plus_spec = tpr + (1 - fpr)
threshold = thresholds[np.argmax(sens_plus_spec)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Max Sensitivity + Specificity Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status MaxSensSpec'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability MaxSensSpec'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())


=== Max Sensitivity + Specificity Threshold ===
Threshold: 0.5020
Train Accuracy: 0.8550
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.86      0.85      0.85   1895276
weighted avg       0.86      0.85      0.85   1895276


Predicted Renewed: 107617
Predicted Not Renewed: 251256


In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: max percent correctly classified (100-point grid)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# Evaluate training accuracy on an evenly spaced grid of candidate thresholds
threshold_grid = np.linspace(0, 1, 100)
accuracy_per_threshold = [
    accuracy_score(y, (y_pred_proba_train >= t).astype(int))
    for t in threshold_grid
]

# Choose threshold with max accuracy (first one on ties)
threshold = threshold_grid[np.argmax(accuracy_per_threshold)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Max Percent Correctly Classified Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# NOTE(review): the 500- and 50-point grid cells write to these same two
# columns, so the exported values depend on which of them ran last.
y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status MaxPCC'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability MaxPCC'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Max Percent Correctly Classified Threshold ===
Threshold: 0.5051
Train Accuracy: 0.8549
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276


Predicted Renewed: 109102
Predicted Not Renewed: 249771


In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: max percent correctly classified (500-point grid)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# Evaluate training accuracy on an evenly spaced grid of candidate thresholds
threshold_grid = np.linspace(0, 1, 500)
accuracy_per_threshold = [
    accuracy_score(y, (y_pred_proba_train >= t).astype(int))
    for t in threshold_grid
]

# Choose threshold with max accuracy (first one on ties)
threshold = threshold_grid[np.argmax(accuracy_per_threshold)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Max Percent Correctly Classified Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# NOTE(review): the 100- and 50-point grid cells write to these same two
# columns, so the exported values depend on which of them ran last.
y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status MaxPCC'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability MaxPCC'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Max Percent Correctly Classified Threshold ===
Threshold: 0.5030
Train Accuracy: 0.8549
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.86      0.85      0.85   1895276
weighted avg       0.86      0.85      0.85   1895276


Predicted Renewed: 108045
Predicted Not Renewed: 250828


In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: max percent correctly classified (50-point grid)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# Evaluate training accuracy on an evenly spaced grid of candidate thresholds
threshold_grid = np.linspace(0, 1, 50)
accuracy_per_threshold = [
    accuracy_score(y, (y_pred_proba_train >= t).astype(int))
    for t in threshold_grid
]

# Choose threshold with max accuracy (first one on ties)
threshold = threshold_grid[np.argmax(accuracy_per_threshold)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Max Percent Correctly Classified Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# NOTE(review): the 100- and 500-point grid cells write to these same two
# columns, so the exported values depend on which of them ran last.
y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status MaxPCC'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability MaxPCC'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Max Percent Correctly Classified Threshold ===
Threshold: 0.4898
Train Accuracy: 0.8548
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85    947638
           1       0.85      0.87      0.86    947638

    accuracy                           0.85   1895276
   macro avg       0.86      0.85      0.85   1895276
weighted avg       0.86      0.85      0.85   1895276


Predicted Renewed: 102110
Predicted Not Renewed: 256763


In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb


def _shift_prior(probabilities, train_prior, target_prior):
    """Re-express probabilities learned under `train_prior` for `target_prior`.

    Applies the odds-ratio correction
        odds' = odds * [target_prior / (1 - target_prior)] / [train_prior / (1 - train_prior)]
    written in the closed form  p' = p*r / (p*r + (1 - p)),  which stays finite
    for p == 0 and p == 1 (the odds form divides by zero there).

    Parameters: `probabilities` array-like in [0, 1]; both priors strictly
    between 0 and 1. Returns a float64 ndarray. Raises ValueError on a prior
    outside (0, 1).
    """
    if not (0 < train_prior < 1 and 0 < target_prior < 1):
        raise ValueError("train_prior and target_prior must lie strictly between 0 and 1")
    p = np.asarray(probabilities, dtype=float)
    odds_ratio = (target_prior / (1 - target_prior)) / (train_prior / (1 - train_prior))
    return (p * odds_ratio) / (p * odds_ratio + (1 - p))


# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Compute thresholds
# -------------------------------
# Predicted probabilities on training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# ROC Curve: sensitivity is tpr, specificity is 1 - fpr
fpr, tpr, thresholds = roc_curve(y, y_pred_proba_train)

# Sensitivity = Specificity Threshold
sens_spec_diff = np.abs(tpr - (1 - fpr))
threshold = thresholds[np.argmin(sens_spec_diff)]

# -------------------------------
# Training set evaluation
# -------------------------------
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Sensitivity = Specificity Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Raw predictions at selected threshold
y_open_pred = (y_open_pred_proba >= threshold).astype(int)

# Store raw predictions
open_customers_without_encoded['Predicted Status SensSpec'] = np.where(
    y_open_pred == 1, 'Not Renewed', 'Renewed'
)
open_customers_without_encoded['Churn Probability SensSpec'] = y_open_pred_proba

# Raw prediction counts
print("\n=== Raw Predictions ===")
print("Predicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

# -------------------------------
# Recalibration
# -------------------------------
# Churn prevalence of the data the model was fitted on (after oversampling)
train_churn = y.mean()
print("\nTraining Churn Prevalence:", train_churn)

# Churn prevalence expected in production.
# NOTE(review): hard-coded assumption — confirm the value and where it comes from.
real_churn = 0.63  # Use your production churn

# Shift the scored probabilities from the training prior to the real prior
adjusted_probs = _shift_prior(y_open_pred_proba, train_churn, real_churn)

# Store recalibrated probabilities
open_customers_without_encoded["Recalibrated Probability"] = adjusted_probs

# Show recalibration summary
print("\n=== Recalibrated Probabilities ===")
print("Mean Raw Probability: {:.4f}".format(y_open_pred_proba.mean()))
print("Mean Recalibrated Probability: {:.4f}".format(adjusted_probs.mean()))

# -------------------------------
# Binarize recalibrated probabilities (Threshold = 0.5)
# -------------------------------
recalibrated_pred = (adjusted_probs >= 0.5).astype(int)

# Store recalibrated predicted status
open_customers_without_encoded["Predicted Status Recalibrated"] = np.where(
    recalibrated_pred == 1, "Not Renewed", "Renewed"
)

# Recalibrated prediction counts
print("\n=== Recalibrated Predictions (Threshold = 0.5) ===")
print("Predicted Renewed:", (recalibrated_pred == 0).sum())
print("Predicted Not Renewed:", (recalibrated_pred == 1).sum())

=== Sensitivity = Specificity Threshold ===
Threshold: 0.5222
Train Accuracy: 0.8547
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85    947638
           1       0.85      0.85      0.85    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276


=== Raw Predictions ===
Predicted Renewed: 117530
Predicted Not Renewed: 241343

Training Churn Prevalence: 0.5

=== Recalibrated Probabilities ===
Mean Raw Probability: 0.6128
Mean Recalibrated Probability: 0.7097

=== Recalibrated Predictions (Threshold = 0.5) ===
Predicted Renewed: 58137
Predicted Not Renewed: 300736


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: predicted prevalence = observed prevalence (100-point grid)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# Share of positives in the training target
observed_prevalence = y.mean()

# Share of rows predicted positive at each candidate threshold
threshold_grid = np.linspace(0, 1, 100)
predicted_prevalences = [(y_pred_proba_train >= t).mean() for t in threshold_grid]

# Find threshold where predicted prevalence is closest to observed prevalence
prevalence_diff = np.abs(np.array(predicted_prevalences) - observed_prevalence)
threshold = threshold_grid[np.argmin(prevalence_diff)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Predicted Prevalence = Observed Prevalence Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# NOTE(review): the 50-point grid cell writes to these same two columns, so
# the exported values depend on which of the two ran last.
y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status PredPrev'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability PredPrev'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Predicted Prevalence = Observed Prevalence Threshold ===
Threshold: 0.5253
Train Accuracy: 0.8546
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85    947638
           1       0.86      0.85      0.85    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276


Predicted Renewed: 119137
Predicted Not Renewed: 239736


In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_curve, accuracy_score, classification_report,
    log_loss, roc_auc_score, confusion_matrix
)
import xgboost as xgb

# -------------------------------
# Train the model
# -------------------------------
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X, y)

# -------------------------------
# Threshold: predicted prevalence = observed prevalence (50-point grid)
# -------------------------------
# Predicted churn probabilities on the (resampled) training data
y_pred_proba_train = model.predict_proba(X)[:, 1]

# Share of positives in the training target
observed_prevalence = y.mean()

# Share of rows predicted positive at each candidate threshold
threshold_grid = np.linspace(0, 1, 50)
predicted_prevalences = [(y_pred_proba_train >= t).mean() for t in threshold_grid]

# Find threshold where predicted prevalence is closest to observed prevalence
prevalence_diff = np.abs(np.array(predicted_prevalences) - observed_prevalence)
threshold = threshold_grid[np.argmin(prevalence_diff)]

# Training set evaluation (same data the threshold was tuned on)
y_train_pred = (y_pred_proba_train >= threshold).astype(int)

accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_pred_proba_train)
logloss = log_loss(y, y_pred_proba_train)
report = classification_report(y, y_train_pred)

print("=== Predicted Prevalence = Observed Prevalence Threshold ===")
print(f"Threshold: {threshold:.4f}")
print(f"Train Accuracy: {accuracy:.4f}")
print(f"Train ROC AUC: {roc_auc:.4f}")
print(f"Train Log Loss: {logloss:.4f}")
print("Classification Report:\n", report)

# -------------------------------
# Predict on open_customers
# -------------------------------
# Score with the model fitted in this cell instead of reusing whatever
# `y_open_pred_proba` another cell last left in the kernel.
X_open_customers = open_customers[features]
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# NOTE(review): the 100-point grid cell writes to these same two columns, so
# the exported values depend on which of the two ran last.
y_open_pred = (y_open_pred_proba >= threshold).astype(int)
open_customers_without_encoded['Predicted Status PredPrev'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability PredPrev'] = y_open_pred_proba

print("\nPredicted Renewed:", (y_open_pred == 0).sum())
print("Predicted Not Renewed:", (y_open_pred == 1).sum())

=== Predicted Prevalence = Observed Prevalence Threshold ===
Threshold: 0.5306
Train Accuracy: 0.8546
Train ROC AUC: 0.9285
Train Log Loss: 0.3430
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.86    947638
           1       0.86      0.85      0.85    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276


Predicted Renewed: 121990
Predicted Not Renewed: 236883


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# XGBoost model
import xgboost as xgb

# Two XGBoost classifiers with the same tree settings; the second one also
# sets scale_pos_weight to the ratio of class-0 rows to class-1 rows in y.
clf_xgb1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

negative_rows = len(y[y == 0])
positive_rows = len(y[y == 1])
clf_xgb2 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=negative_rows / positive_rows,
    random_state=42
)

# Soft voting averages the predicted class probabilities of both members.
model = VotingClassifier(
    voting='soft',
    estimators=[('xgb1', clf_xgb1), ('xgb2', clf_xgb2)]
)
model.fit(X, y)

# Score the open customers (encoded frame, training feature order).
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Attach the results to the human-readable copy: class 1 means 'Not Renewed'.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

# Training-set predictions (same data the ensemble was fitted on).
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

# Training-set metrics.
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = diagonal of the confusion matrix / row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy[0]
class_1_accuracy_train = per_class_accuracy[1]

# Report.
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

In [None]:

# Export the unencoded open-customer frame, including whatever prediction
# columns are currently attached to it, without the index column.
open_customers_without_encoded.to_csv(
    path_or_buf='future prediction XGB test.csv',
    index=False,
)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# --- scikit-learn gradient boosting (depth 6, 100 trees) ---------------------
gbm_params = {
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbm_params)
model.fit(X, y)

# Score open_customers on the encoded feature columns.
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Write the results onto the unencoded frame: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
predicted_not_renewed = (y_open_pred == 1)
open_customers_without_encoded['Predicted Status'] = np.where(predicted_not_renewed, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {predicted_not_renewed.sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (diagonal of the confusion matrix over its row sums).
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy_train = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy_train[0]
class_1_accuracy_train = per_class_accuracy_train[1]

# Report the metrics
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

Predicted Renewed: 125102
Predicted Not Renewed: 233771
Train Accuracy: 0.8557096697262034
Train Log Loss: 0.3421041235205226
Train ROC AUC: 0.9292134744340229
Train Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85    947638
           1       0.85      0.86      0.86    947638

    accuracy                           0.86   1895276
   macro avg       0.86      0.86      0.86   1895276
weighted avg       0.86      0.86      0.86   1895276

Class 0 Train Accuracy: 0.8487196587726537
Class 1 Train Accuracy: 0.8626996806797532


In [17]:
# Save predictions: export the unencoded open-customer frame, including the
# prediction columns currently attached to it, without the index column.
open_customers_without_encoded.to_csv(
    path_or_buf='future prediction GBM1.csv',
    index=False,
)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# --- scikit-learn gradient boosting (depth 5, 200 trees, 80% row subsample) --
gbm_params = {
    'max_depth': 5,
    'learning_rate': 0.05,
    'n_estimators': 200,
    'subsample': 0.8,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbm_params)
model.fit(X, y)

# Score open_customers on the encoded feature columns.
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Write the results onto the unencoded frame: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
predicted_not_renewed = (y_open_pred == 1)
open_customers_without_encoded['Predicted Status'] = np.where(predicted_not_renewed, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {predicted_not_renewed.sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (diagonal of the confusion matrix over its row sums).
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy_train = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy_train[0]
class_1_accuracy_train = per_class_accuracy_train[1]

# Report the metrics
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

Predicted Renewed: 56226
Predicted Not Renewed: 302647
Train Accuracy: 0.8511409420052805
Train Log Loss: 0.35329877381012564
Train ROC AUC: 0.9244575007855427
Train Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85    947638
           1       0.84      0.86      0.85    947638

    accuracy                           0.85   1895276
   macro avg       0.85      0.85      0.85   1895276
weighted avg       0.85      0.85      0.85   1895276

Class 0 Train Accuracy: 0.8419079859608838
Class 1 Train Accuracy: 0.8603738980496772


In [None]:
import xgboost as xgb

# --- Single XGBoost classifier with row/column subsampling -------------------
# Positive-class weight: ratio of label-0 rows to label-1 rows in y.
class_weight_ratio = len(y[y == 0]) / len(y[y == 1])

model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    scale_pos_weight=class_weight_ratio,
    random_state=42,
)
model.fit(X, y)

# Score open_customers on the encoded feature columns.
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Write the results onto the unencoded frame: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
predicted_not_renewed = (y_open_pred == 1)
open_customers_without_encoded['Predicted Status'] = np.where(predicted_not_renewed, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {predicted_not_renewed.sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (diagonal of the confusion matrix over its row sums).
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy_train = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy_train[0]
class_1_accuracy_train = per_class_accuracy_train[1]

# Report the metrics
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

In [None]:
from catboost import CatBoostClassifier

# --- CatBoost classifier (depth 10, 500 iterations, silent training) ---------
catboost_params = dict(
    depth=10,
    learning_rate=0.1,
    iterations=500,
    random_seed=42,
    verbose=0,
)
model = CatBoostClassifier(**catboost_params)
model.fit(X, y)

# Score open_customers on the encoded feature columns.
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Write the results onto the unencoded frame: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
predicted_not_renewed = (y_open_pred == 1)
open_customers_without_encoded['Predicted Status'] = np.where(predicted_not_renewed, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {predicted_not_renewed.sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (diagonal of the confusion matrix over its row sums).
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy_train = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy_train[0]
class_1_accuracy_train = per_class_accuracy_train[1]

# Report the metrics
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

In [None]:

from sklearn.ensemble import RandomForestClassifier

# --- Random forest classifier (trees capped at depth 10) ---------------------
model = RandomForestClassifier(max_depth=10, random_state=42)
model.fit(X, y)

# Score open_customers on the encoded feature columns.
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Write the results onto the unencoded frame: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
predicted_not_renewed = (y_open_pred == 1)
open_customers_without_encoded['Predicted Status'] = np.where(predicted_not_renewed, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {predicted_not_renewed.sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (diagonal of the confusion matrix over its row sums).
conf_matrix_train = confusion_matrix(y, y_pred)
per_class_accuracy_train = conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
class_0_accuracy_train = per_class_accuracy_train[0]
class_1_accuracy_train = per_class_accuracy_train[1]

# Report the metrics
for metric_label, metric_value in (
    ("Train Accuracy", train_accuracy),
    ("Train Log Loss", train_log_loss),
    ("Train ROC AUC", train_roc_auc),
):
    print(f"{metric_label}: {metric_value}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

In [None]:

# Import the estimator this cell actually instantiates. The cell previously
# imported RandomForestClassifier and relied on DecisionTreeClassifier having
# been imported by another cell, which raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# --- Single decision tree classifier (depth capped at 5) ---------------------
model = DecisionTreeClassifier(random_state=42, max_depth=5)

model.fit(X, y)

# Predict using encoded data
X_open_customers = open_customers[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]

# Store prediction results in unencoded data: label 1 -> 'Not Renewed',
# anything else -> 'Renewed'; the probability is that of the second class.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

# In-sample evaluation: the metrics below are computed on the same X, y the
# model was fitted on, so they are optimistic.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy = correctly predicted rows of a class / all true rows of
# that class (confusion-matrix diagonal entry over its row sum).
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train = conf_matrix_train[0, 0] / conf_matrix_train[0].sum()
class_1_accuracy_train = conf_matrix_train[1, 1] / conf_matrix_train[1].sum()

# Print the metrics
print(f"Train Accuracy: {train_accuracy}")
print(f"Train Log Loss: {train_log_loss}")
print(f"Train ROC AUC: {train_roc_auc}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")