In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report, confusion_matrix

# Database connection setup
#
# Credentials are taken from the standard libpq-style environment variables
# instead of being committed in the notebook. Only the password is mandatory;
# the other settings fall back to the values this notebook previously used.
import os
from urllib.parse import quote_plus

db_password = os.environ.get('PGPASSWORD')
if not db_password:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this notebook."
    )

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': db_password,
    'port': os.environ.get('PGPORT', '5432')
}

# User and password are URL-quoted so characters such as '@', ':' or '/'
# inside them cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Load data from PostgreSQL (the full table is read into memory as a DataFrame)
query = 'SELECT * FROM public.overall_cleaned_base_and_pr_ef_policyef;'
data = pd.read_sql(query, con=engine)

In [2]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, log_loss, roc_curve
import matplotlib.pyplot as plt

# Columns kept from the raw table.
# NOTE(review): identifier-like columns (e.g. 'policy no', 'customerid',
# 'chassis_engine_key') end up as model features below - confirm this is intended.
selected_columns = [
    'policy no', 'renewal type', 'product name', 'product name 2', 'biztype',
    'policy end date', 'policy start date',
    'age', 'manufacturer/make', 'model', 'variant', 'vehicle segment', 'fuel type',
    'rto location', 'vehicle idv', 'ncb amount', 'Cleaned Reg no',
    'before gst add-on gwp', 'total od premium', 'total tp premium', 'gst',
    'total premium payable',
    'ncb % previous year', 'applicable discount with ncb', 'Cleaned Branch Name 2',
    'Cleaned State2', 'Cleaned Zone 2', 'tie up',
    'Number of claims', 'approved', 'denied', 'corrected_name', 'customerid',
    'Policy Status', 'Policy Tenure', 'Customer Tenure', 'New Customers',
    'Claim Happaned/Not',
    'Renewal Rate Status', 'withdrawn', 'chassis_engine_key', 'policy_wise_purchase',
]

# Date columns that are expanded into year / month / day features.
date_columns = ['policy start date', 'policy end date']

# Scoring window for open policies, expressed on 'policy end date'.
# NOTE(review): an earlier comment described this window as January-March,
# but the filter has always selected months 1-6 - confirm the intended window.
OPEN_YEAR = 2025
OPEN_MONTHS = [1, 2, 3, 4, 5, 6]


def _prepare_features(frame, date_cols):
    """Return a copy of ``frame`` with date parts extracted and nulls filled.

    The same steps, in the same order, are applied to the training frame and
    to the open-customer frame so that missing values are encoded identically
    in both. Previously the training frame was null-filled *before* date
    parsing and the open frame *after*, so a missing date produced different
    feature values at fit time and at scoring time.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to prepare; must contain every column named in ``date_cols``.
    date_cols : list of str
        Columns parsed with ``pd.to_datetime(..., errors='coerce')`` and
        replaced by ``<col>_YEAR``, ``<col>_MONTH`` and ``<col>_DAY``.

    Returns
    -------
    pandas.DataFrame
        New frame without the original date columns; object columns have
        nulls replaced by ``'missing'``, all other columns by ``0``.
    """
    prepared = frame.copy()
    for col in date_cols:
        parsed = pd.to_datetime(prepared[col], errors='coerce')
        prepared[f'{col}_YEAR'] = parsed.dt.year
        prepared[f'{col}_MONTH'] = parsed.dt.month
        prepared[f'{col}_DAY'] = parsed.dt.day
    prepared = prepared.drop(columns=list(date_cols))

    # Unparseable / missing dates surface here as NaN date parts and become 0.
    for column in prepared.columns:
        filler = 'missing' if prepared[column].dtype == 'object' else 0
        prepared[column] = prepared[column].fillna(filler)
    return prepared


# .copy() so the assignments below never write into a view of the raw table.
data = data[selected_columns].copy()

# Convert Policy End Date to datetime (needed for the scoring-window filter)
data['policy end date'] = pd.to_datetime(data['policy end date'], errors='coerce')

# Separate Open Customers whose policy ends in January-June 2025
open_mask = (
    (data['Policy Status'] == 'Open')
    & (data['policy end date'].dt.year == OPEN_YEAR)
    & (data['policy end date'].dt.month.isin(OPEN_MONTHS))
)
open_customers = _prepare_features(data[open_mask], date_columns)

# Human-readable copy kept aside; predictions are attached to it later.
open_customers_without_encoded = open_customers.copy()

# Keep only policies with a known outcome for training.
# No date cut-off is applied here - rows are selected by status alone.
data = data[data['Policy Status'].isin(['Renewed', 'Not Renewed'])].copy()

# Map Policy Status to binary: 1 = 'Not Renewed' (churn), 0 = 'Renewed'
data['Policy Status'] = (data['Policy Status'] == 'Not Renewed').astype(int)

# Same date expansion and null handling as the open-customer frame
data = _prepare_features(data, date_columns)

# Separate features and target variable for training
features = [col for col in data.columns if col != 'Policy Status']
X = data[features]
y = data['Policy Status']

# Balance the classes by duplicating minority-class rows.
# NOTE(review): the whole dataset is oversampled before any validation split,
# so a split made from X / y afterwards would contain duplicated rows on both sides.
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)

In [3]:
# Apply label encoding to categorical features.
#
# Each object-typed column of X gets its own LabelEncoder (kept in
# `label_encoders`). The matching open_customers column is translated with the
# same label -> code table; labels never seen in X share one extra code equal
# to the number of known classes.
label_encoders = {}
for column in X.columns:
    if X[column].dtype != 'object':
        continue

    label_encoder = LabelEncoder()
    X[column] = label_encoder.fit_transform(X[column].astype(str))
    label_encoders[column] = label_encoder

    mapping_dict = {label: i for i, label in enumerate(label_encoder.classes_)}
    # len() equals max(code) + 1 and is also safe when there are no classes.
    next_unique_value = len(mapping_dict)

    # The encoder was fit on str-cast values, so the lookup must use str-cast
    # values too; otherwise non-string objects would all be treated as unseen.
    open_customers[column] = (
        open_customers[column]
        .astype(str)
        .map(mapping_dict)
        .fillna(next_unique_value)
        .astype(int)
    )

In [4]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.40
weight_2 = 0.60

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7465
Train Log Loss: 0.5109
Train ROC AUC: 0.8264
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7891
Class 1 Train Accuracy: 0.7039
Predicted Renewed: 124070
Predicted Not Renewed: 234803


In [None]:
# Write the open-customer frame (with whatever prediction columns it currently
# holds) to CSV in the working directory, without the index column.
predictions_path = 'JFM_predictions_xgb 1.csv'
open_customers_without_encoded.to_csv(predictions_path, index=False)

In [5]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices
# (equal weighting in this run).
weight_1 = 0.50
weight_2 = 0.50

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7426
Train Log Loss: 0.5163
Train ROC AUC: 0.8222
Train Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75    947638
           1       0.76      0.70      0.73    947638

    accuracy                           0.74   1895276
   macro avg       0.74      0.74      0.74   1895276
weighted avg       0.74      0.74      0.74   1895276

Class 0 Train Accuracy: 0.7847
Class 1 Train Accuracy: 0.7006
Predicted Renewed: 115251
Predicted Not Renewed: 243622


In [6]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.40
weight_2 = 0.60

# model_1 is an XGBoost classifier, model_2 a CatBoost classifier; both use
# learning rate 0.1 and seed 42.
model_1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7483
Train Log Loss: 0.5066
Train ROC AUC: 0.8283
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7927
Class 1 Train Accuracy: 0.7038
Predicted Renewed: 212172
Predicted Not Renewed: 146701


In [7]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.45
weight_2 = 0.55

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7447
Train Log Loss: 0.5135
Train ROC AUC: 0.8243
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.73    947638

    accuracy                           0.74   1895276
   macro avg       0.75      0.74      0.74   1895276
weighted avg       0.75      0.74      0.74   1895276

Class 0 Train Accuracy: 0.7870
Class 1 Train Accuracy: 0.7024
Predicted Renewed: 119789
Predicted Not Renewed: 239084


In [8]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.35
weight_2 = 0.65

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7483
Train Log Loss: 0.5084
Train ROC AUC: 0.8282
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7911
Class 1 Train Accuracy: 0.7055
Predicted Renewed: 128076
Predicted Not Renewed: 230797


In [9]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices
# (model_1 carries the larger weight in this run).
weight_1 = 0.60
weight_2 = 0.40

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7383
Train Log Loss: 0.5220
Train ROC AUC: 0.8173
Train Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75    947638
           1       0.76      0.70      0.73    947638

    accuracy                           0.74   1895276
   macro avg       0.74      0.74      0.74   1895276
weighted avg       0.74      0.74      0.74   1895276

Class 0 Train Accuracy: 0.7797
Class 1 Train Accuracy: 0.6970
Predicted Renewed: 105311
Predicted Not Renewed: 253562


In [10]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.30
weight_2 = 0.70

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7498
Train Log Loss: 0.5059
Train ROC AUC: 0.8300
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7927
Class 1 Train Accuracy: 0.7068
Predicted Renewed: 131956
Predicted Not Renewed: 226917


In [11]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.25
weight_2 = 0.75

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7512
Train Log Loss: 0.5036
Train ROC AUC: 0.8315
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7943
Class 1 Train Accuracy: 0.7081
Predicted Renewed: 135587
Predicted Not Renewed: 223286


In [12]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# Blend weights applied to the model_1 / model_2 probability matrices.
weight_1 = 0.20
weight_2 = 0.80

# Two CatBoost classifiers with the same learning rate and seed; model_2 is
# deeper and runs more iterations than model_1.
model_1 = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=0,
)
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Fit both classifiers on the complete X / y (no hold-out split in this cell).
for estimator in (model_1, model_2):
    estimator.fit(X, y)

# Weighted average of the in-sample class probabilities.
probs_1, probs_2 = (fitted.predict_proba(X) for fitted in (model_1, model_2))
weighted_probs = weight_1 * probs_1 + weight_2 * probs_2
y_pred_proba = weighted_probs[:, 1]
y_pred = weighted_probs.argmax(axis=1)

# In-sample metrics: everything below is scored on the rows used for fitting.
conf_matrix_train = confusion_matrix(y, y_pred)
train_report = classification_report(y, y_pred)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_log_loss = log_loss(y, y_pred_proba)
train_accuracy = accuracy_score(y, y_pred)

# Per-class accuracy: diagonal count divided by that row's total.
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train[k, k] / conf_matrix_train[k].sum() for k in (0, 1)
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = weight_1 * probs_1_open + weight_2 * probs_2_open
y_open_pred_proba = weighted_probs_open[:, 1]
y_open_pred = weighted_probs_open.argmax(axis=1)

# Attach the predicted label and class-1 score to the un-encoded frame.
predicted_churn = y_open_pred == 1
open_customers_without_encoded['Predicted Status'] = np.where(predicted_churn, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per class.
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7525
Train Log Loss: 0.5013
Train ROC AUC: 0.8330
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.80      0.76    947638
           1       0.78      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7957
Class 1 Train Accuracy: 0.7092
Predicted Renewed: 139177
Predicted Not Renewed: 219696


In [13]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# ---- Ensemble members ----
# model_1: XGBoost. scale_pos_weight is the ratio of class-0 rows to class-1 rows in y.
model_1 = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
    random_state=42,
)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.40, 0.60

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7472
Train Log Loss: 0.5098
Train ROC AUC: 0.8270
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7914
Class 1 Train Accuracy: 0.7030
Predicted Renewed: 181676
Predicted Not Renewed: 177197


In [14]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# ---- Ensemble members ----
# model_1: XGBoost. scale_pos_weight is the ratio of class-0 rows to class-1 rows in y.
model_1 = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
    random_state=42,
)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.30, 0.70

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7502
Train Log Loss: 0.5052
Train ROC AUC: 0.8304
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7945
Class 1 Train Accuracy: 0.7060
Predicted Renewed: 171300
Predicted Not Renewed: 187573


In [15]:
# Write open_customers_without_encoded (including the prediction columns the
# ensemble cells assign to it) to CSV, leaving out the row index.
open_customers_without_encoded.to_csv(
    path_or_buf='JFMAMJ_predictions_xgbcat_combinedweighted.csv',
    index=False,
)

In [16]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# ---- Ensemble members ----
# model_1: XGBoost. scale_pos_weight is the ratio of class-0 rows to class-1 rows in y.
model_1 = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
    random_state=42,
)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.25, 0.75

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7515
Train Log Loss: 0.5030
Train ROC AUC: 0.8319
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.80      0.76    947638
           1       0.78      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7957
Class 1 Train Accuracy: 0.7073
Predicted Renewed: 166943
Predicted Not Renewed: 191930


In [17]:
# Write open_customers_without_encoded (including the prediction columns the
# ensemble cells assign to it) to CSV, leaving out the row index.
open_customers_without_encoded.to_csv(
    path_or_buf='JFMAMJ_predictions_xgbcat1_combinedweighted.csv',
    index=False,
)

In [18]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# ---- Ensemble members ----
# model_1: XGBoost. scale_pos_weight is the ratio of class-0 rows to class-1 rows in y.
model_1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
    random_state=42,
)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.40, 0.60

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7483
Train Log Loss: 0.5066
Train ROC AUC: 0.8283
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7927
Class 1 Train Accuracy: 0.7038
Predicted Renewed: 212172
Predicted Not Renewed: 146701


In [19]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier

# ---- Ensemble members ----
# model_1: XGBoost. scale_pos_weight is the ratio of class-0 rows to class-1 rows in y.
model_1 = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
    random_state=42,
)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.20, 0.80

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7530
Train Log Loss: 0.4995
Train ROC AUC: 0.8336
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.80      0.76    947638
           1       0.78      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7971
Class 1 Train Accuracy: 0.7088
Predicted Renewed: 176955
Predicted Not Renewed: 181918


In [20]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# ---- Ensemble members ----
# model_1: random forest with tree depth capped at 10.
model_1 = RandomForestClassifier(max_depth=10, random_state=42)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.40, 0.60

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7491
Train Log Loss: 0.5168
Train ROC AUC: 0.8282
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7918
Class 1 Train Accuracy: 0.7065
Predicted Renewed: 125256
Predicted Not Renewed: 233617


In [21]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# ---- Ensemble members ----
# model_1: random forest with tree depth capped at 10.
model_1 = RandomForestClassifier(max_depth=10, random_state=42)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.30, 0.70

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7517
Train Log Loss: 0.5100
Train ROC AUC: 0.8316
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.78      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7950
Class 1 Train Accuracy: 0.7085
Predicted Renewed: 133873
Predicted Not Renewed: 225000


In [22]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# ---- Ensemble members ----
# model_1: random forest with tree depth capped at 10.
model_1 = RandomForestClassifier(max_depth=10, random_state=42)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.50, 0.50

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7455
Train Log Loss: 0.5242
Train ROC AUC: 0.8239
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.70      0.73    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7874
Class 1 Train Accuracy: 0.7037
Predicted Renewed: 114047
Predicted Not Renewed: 244826


In [23]:
# Write open_customers_without_encoded (including the prediction columns the
# ensemble cells assign to it) to CSV, leaving out the row index.
open_customers_without_encoded.to_csv(
    path_or_buf='JFMAMJ_predictions_rancat1_combinedweighted.csv',
    index=False,
)

In [24]:
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# ---- Ensemble members ----
# model_1: random forest with tree depth capped at 10.
model_1 = RandomForestClassifier(max_depth=10, random_state=42)

# model_2: CatBoost with training output silenced (verbose=0).
model_2 = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=10,
    random_seed=42,
    verbose=0,
)

# Blend weights: weight_1 scales model_1's probabilities, weight_2 scales model_2's.
weight_1, weight_2 = 0.30, 0.70

# Fit both members on the whole of X / y (this cell holds nothing out).
model_1.fit(X, y)
model_2.fit(X, y)

# Per-class probabilities on the same rows the models were fitted on.
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Weighted average of the two probability matrices.
weighted_probs = probs_1 * weight_1 + probs_2 * weight_2

# Hard label = column with the larger blended probability; score = column 1.
y_pred = weighted_probs.argmax(axis=1)
y_pred_proba = weighted_probs[:, 1]

# ---- In-sample metrics (computed on the fitting data, hence the "Train" labels) ----
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Per-class accuracy: diagonal of the confusion matrix over its row totals.
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train, class_1_accuracy_train = (
    conf_matrix_train.diagonal() / conf_matrix_train.sum(axis=1)
)

# Report every metric, one per line.
print(
    f"Train Accuracy: {train_accuracy:.4f}",
    f"Train Log Loss: {train_log_loss:.4f}",
    f"Train ROC AUC: {train_roc_auc:.4f}",
    f"Train Classification Report:\n{train_report}",
    f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}",
    f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}",
    sep="\n",
)

# ---- Score the open customers with the same blend ----
X_open_customers = open_customers[features]

probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)
weighted_probs_open = probs_1_open * weight_1 + probs_2_open * weight_2

y_open_pred = weighted_probs_open.argmax(axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Attach results to the un-encoded frame: label 1 is reported as 'Not Renewed',
# and the blended column-1 probability is stored as the churn probability.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 0, 'Renewed', 'Not Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Prediction counts per label.
print(
    f"Predicted Renewed: {(y_open_pred == 0).sum()}",
    f"Predicted Not Renewed: {(y_open_pred == 1).sum()}",
    sep="\n",
)

Train Accuracy: 0.7517
Train Log Loss: 0.5100
Train ROC AUC: 0.8316
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.78      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7950
Class 1 Train Accuracy: 0.7085
Predicted Renewed: 133873
Predicted Not Renewed: 225000


In [27]:
# Write open_customers_without_encoded (including the prediction columns the
# ensemble cells assign to it) to CSV, leaving out the row index.
open_customers_without_encoded.to_csv(
    path_or_buf='JFMAMJ_rancat30_70.csv',
    index=False,
)

In [28]:
import numpy as np
import xgboost as xgb  # not used in this cell; kept in case other cells rely on `xgb`
from catboost import CatBoostClassifier

# Define the models
from sklearn.ensemble import RandomForestClassifier


def _blend_probabilities(probs_a, probs_b, weight_a, weight_b):
    """Blend two ``predict_proba`` outputs into one weighted average.

    Parameters
    ----------
    probs_a, probs_b : array-like
        Class-probability matrices produced by two classifiers for the same
        rows. They must have identical shapes.
    weight_a, weight_b : float
        Non-negative blend weights that must sum to 1, so every row of the
        result is still a valid probability distribution.

    Returns
    -------
    numpy.ndarray
        ``weight_a * probs_a + weight_b * probs_b``.

    Raises
    ------
    ValueError
        If a weight is negative, the weights do not sum to 1, or the two
        probability matrices differ in shape (which would otherwise broadcast
        silently and produce meaningless scores).
    """
    if weight_a < 0 or weight_b < 0:
        raise ValueError(f"Blend weights must be non-negative, got {weight_a} and {weight_b}")
    if not np.isclose(weight_a + weight_b, 1.0):
        raise ValueError(f"Blend weights must sum to 1, got {weight_a} + {weight_b}")

    probs_a = np.asarray(probs_a)
    probs_b = np.asarray(probs_b)
    if probs_a.shape != probs_b.shape:
        raise ValueError(
            f"Probability matrices differ in shape: {probs_a.shape} vs {probs_b.shape}"
        )
    return (weight_a * probs_a) + (weight_b * probs_b)


model_1 = RandomForestClassifier(random_state=42, max_depth=10)

model_2 = CatBoostClassifier(
    depth=10, learning_rate=0.1, iterations=500,
    random_seed=42, verbose=0)

# Train both models on the full dataset (X, y come from earlier cells)
model_1.fit(X, y)
model_2.fit(X, y)

# Get probability predictions from both models
probs_1 = model_1.predict_proba(X)
probs_2 = model_2.predict_proba(X)

# Define model weights (adjustable)
weight_1 = 0.45  # Model 1 works better for class 0
weight_2 = 0.55  # Model 2 works better for class 1

# Compute weighted probability average
weighted_probs = _blend_probabilities(probs_1, probs_2, weight_1, weight_2)

# Apply argmax to get final class predictions.
# NOTE: argmax returns a column index, which equals the class label only when
# both models order their classes as [0, 1]; column 1 is used as the positive
# ("Not Renewed") class below.
y_pred = np.argmax(weighted_probs, axis=1)
y_pred_proba = weighted_probs[:, 1]

# Evaluate model performance.
# These are in-sample metrics: the models are scored on the rows they were fitted on.
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

# Compute confusion matrix and class-specific accuracy
# (diagonal entry divided by the row total, i.e. per-class recall)
conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train = conf_matrix_train[0, 0] / conf_matrix_train[0].sum()
class_1_accuracy_train = conf_matrix_train[1, 1] / conf_matrix_train[1].sum()

# Print model evaluation metrics
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Log Loss: {train_log_loss:.4f}")
print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train:.4f}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train:.4f}")

# ---- Apply Model on Open Customers ----
X_open_customers = open_customers[features]

# Get probability predictions for open customers
probs_1_open = model_1.predict_proba(X_open_customers)
probs_2_open = model_2.predict_proba(X_open_customers)

# Compute weighted average of probabilities (same weights as for the train blend)
weighted_probs_open = _blend_probabilities(probs_1_open, probs_2_open, weight_1, weight_2)

# Get final predictions
y_open_pred = np.argmax(weighted_probs_open, axis=1)
y_open_pred_proba = weighted_probs_open[:, 1]

# Store predictions in unencoded data.
# The arrays are assigned positionally, so open_customers_without_encoded is
# expected to hold the same rows in the same order as open_customers.
open_customers_without_encoded['Predicted Status'] = np.where(y_open_pred == 1, 'Not Renewed', 'Renewed')
open_customers_without_encoded['Churn Probability'] = y_open_pred_proba

# Print counts of predictions
print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

Train Accuracy: 0.7474
Train Log Loss: 0.5204
Train ROC AUC: 0.8262
Train Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76    947638
           1       0.77      0.71      0.74    947638

    accuracy                           0.75   1895276
   macro avg       0.75      0.75      0.75   1895276
weighted avg       0.75      0.75      0.75   1895276

Class 0 Train Accuracy: 0.7897
Class 1 Train Accuracy: 0.7051
Predicted Renewed: 120024
Predicted Not Renewed: 238849


In [29]:
# Write the open-customer frame (including the prediction columns added above)
# to CSV, leaving the DataFrame index out of the file.
open_customers_without_encoded.to_csv(
    path_or_buf='JFMAMJ_rancat45_55.csv',
    index=False,
)