In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, classification_report, log_loss
import matplotlib.pyplot as plt

data = pd.read_excel('Actual_Data_South.xlsx')

In [2]:
selected_columns = ['Policy No',  'Renewal Type', 'Product name', 'Product name  2', 'biztype', 'Policy End Date', 'Policy Start Date', 'Reg no', 'age', 
                    'MANUFACTURER/Make', 'model', 'variant', 'Fuel Type', 'RTO Location', 'Before GST Add-on GWP',  'Total OD Premium', 'Total TP Premium',
                    'gst', 'Total Premium Payable', 'NCB % Previous Year', 'Vehicle Segment', 'Applicable Discount with NCB', 'New Branch Name  2', 'decline',
                    'Tie Up', 'Zone 2', 'state2', 'Cleaned_Insured name', 'CustomerID', 'Policy Status', 'Policy Tenure', 'Customer Tenure', 'New Customers', 
                    'Churn Label', 'Renewal Rate Status', 'Claim in last year', 'Number of Claims']
data = data[selected_columns]

open_customers = data[data['Policy Status'] == 'Open'].copy()

# Remove rows where 'Status' contains 'Open'
data = data[data['Policy Status'].isin(['Renewed', 'Not Renewed'])]

data['Policy Status'] = data['Policy Status'].apply(lambda x: 1 if x == 'Not Renewed' else 0)

for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].fillna('missing')
    else:
        data[column] = data[column].fillna(0)

date_columns = ['Policy Start Date', 'Policy End Date']

for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')  

new_date_cols = {}
for col in date_columns:
    new_date_cols[f'{col}_YEAR'] = data[col].dt.year
    new_date_cols[f'{col}_MONTH'] = data[col].dt.month
    new_date_cols[f'{col}_DAY'] = data[col].dt.day

data = pd.concat([data, pd.DataFrame(new_date_cols)], axis=1)

data = data.drop(columns=date_columns)

features = [col for col in data.columns if col not in ['Policy Status']]

X = data[features]
y = data['Policy Status']

# Initialize RandomOverSampler
# ros = RandomOverSampler(random_state=42)

# X, y = ros.fit_resample(X, y)

In [3]:
#label encoding for the actual data
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        label_encoder = LabelEncoder()
        X[column] = label_encoder.fit_transform(X[column].astype(str))  
        label_encoders[column] = label_encoder  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoder.fit_transform(X[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoder.fit_transform(X[column].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoder.fit_transform(X[column].astype(str))
A value is tryin

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, log_loss, confusion_matrix

import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, classification_report, confusion_matrix, roc_curve
import matplotlib.pyplot as plt

model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

#Fit the model 
model.fit(X, y)

for col in date_columns:
    if col in open_customers.columns:
        open_customers[col] = pd.to_datetime(open_customers[col], errors='coerce')
    else:
        print(f"Warning: Column {col} not found in open_customers data")

open_customers_new_date_cols = {}
for col in date_columns:
    if col in open_customers.columns:
        open_customers_new_date_cols[f'{col}_YEAR'] = open_customers[col].dt.year
        open_customers_new_date_cols[f'{col}_MONTH'] = open_customers[col].dt.month
        open_customers_new_date_cols[f'{col}_DAY'] = open_customers[col].dt.day

if open_customers_new_date_cols:
    open_customers = pd.concat([open_customers, pd.DataFrame(open_customers_new_date_cols)], axis=1)

open_customers = open_customers.drop(columns=[col for col in date_columns if col in open_customers.columns])

for column in open_customers.columns:
    if open_customers[column].dtype == 'object':
        open_customers[column] = open_customers[column].fillna('missing')
    else:
        open_customers[column] = open_customers[column].fillna(0)

open_customers_encoded = open_customers.copy()

#Label Encoding for open customers
for column in open_customers_encoded.columns:
    if column in label_encoders:  
        encoder = label_encoders[column]

        mapping_dict = {label: i for i, label in enumerate(encoder.classes_)}
        next_unique_value = [max(mapping_dict.values()) + 1]  

        def encode_test_value(value):
            if value in mapping_dict:
                return mapping_dict[value]
            else:
                mapping_dict[value] = next_unique_value[0]
                next_unique_value[0] += 1
                return mapping_dict[value]
        
        open_customers_encoded[column] = open_customers_encoded[column].apply(encode_test_value)

#Predict the open customers
X_open_customers = open_customers_encoded[features]
y_open_pred = model.predict(X_open_customers)
y_open_pred_proba = model.predict_proba(X_open_customers)[:, 1]  

open_customers['Predicted Policy Status'] = ['Not Renewed' if pred == 1 else 'Renewed' for pred in y_open_pred]

print(f"Predicted Renewed: {(y_open_pred == 0).sum()}")
print(f"Predicted Not Renewed: {(y_open_pred == 1).sum()}")

y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)[:, 1]

#Evaluate the model on training data
train_accuracy = accuracy_score(y, y_pred)
train_log_loss = log_loss(y, y_pred_proba)
train_roc_auc = roc_auc_score(y, y_pred_proba)
train_report = classification_report(y, y_pred)

conf_matrix_train = confusion_matrix(y, y_pred)
class_0_accuracy_train = conf_matrix_train[0, 0] / conf_matrix_train[0].sum()
class_1_accuracy_train = conf_matrix_train[1, 1] / conf_matrix_train[1].sum()

#Print the metrics
print(f"Train Accuracy: {train_accuracy}")
print(f"Train Log Loss: {train_log_loss}")
print(f"Train ROC AUC: {train_roc_auc}")
print(f"Train Classification Report:\n{train_report}")
print(f"Class 0 Train Accuracy: {class_0_accuracy_train}")
print(f"Class 1 Train Accuracy: {class_1_accuracy_train}")

open_customers.to_csv('open_customers_predictions (South 23&24).csv', index=False)

Predicted Renewed: 19017
Predicted Not Renewed: 6689
Train Accuracy: 0.9799913618694733
Train Log Loss: 0.056922946901133205
Train ROC AUC: 0.9963612298435494
Train Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    166148
           1       0.99      0.97      0.98    273762

    accuracy                           0.98    439910
   macro avg       0.98      0.98      0.98    439910
weighted avg       0.98      0.98      0.98    439910

Class 0 Train Accuracy: 0.9908936610732599
Class 1 Train Accuracy: 0.9733746831189135


In [None]:
X_open_customers.columns.values