In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Database connection setup
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}

# Create database connection engine
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# Step 1: Load only CustomerID and CLV from PostgreSQL
query = 'SELECT "CustomerID", "CLV" FROM customer_clv_overallbase;'
clv_data = pd.read_sql(query, con=engine)

# Convert CustomerID to string (to avoid dtype mismatch)
clv_data["CustomerID"] = clv_data["CustomerID"].astype(str)

# Step 2: Load the CSV file containing CustomerID and other data
csv_file_path = "SOND_predictions_xgb 1.csv"
csv_data = pd.read_csv(csv_file_path)

# Convert CustomerID to string (to ensure compatibility)
csv_data["CustomerID"] = csv_data["CustomerID"].astype(str)

# Step 3: Merge only CLV data into the CSV file based on CustomerID
merged_data = csv_data.merge(clv_data, on="CustomerID", how="left")

# Step 4: Save the merged data back to a new CSV file
output_csv_path = "SOND_predictions_xgb 1_clv.csv"
merged_data.to_csv(output_csv_path, index=False)

print("Merged data saved to:", output_csv_path)

Merged data saved to: SOND_predictions_xgb 1_clv.csv


In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('SOND_predictions_xgb 1_clv.csv')

# Strip spaces from column names (if any)
df.columns = df.columns.str.strip()

# Convert relevant columns to numeric (forcing errors to NaN)
df['Total Premium Payable'] = pd.to_numeric(df['Total Premium Payable'], errors='coerce')
df['Applicable Discount with NCB'] = pd.to_numeric(df['Applicable Discount with NCB'], errors='coerce')
df['Churn Probability'] = pd.to_numeric(df['Churn Probability'], errors='coerce')

# Compute customer-wise average values
customer_avg = df.groupby('CustomerID', as_index=False).agg({
    'Total Premium Payable': 'mean',
    'Applicable Discount with NCB': 'mean',
    'Churn Probability': 'mean'
})

# Rename columns for clarity
customer_avg.rename(columns={
    'Total Premium Payable': 'Avg_Premium',
    'Applicable Discount with NCB': 'Avg_Discount',
    'Churn Probability': 'Avg_Churn_Probability'
}, inplace=True)

# Merge back to original dataframe
df = df.merge(customer_avg, on='CustomerID', how='left')

# Save the processed data to a new CSV file
df.to_csv('SOND_predictions_xgb_1_with_avgvalues.csv', index=False)

print("Processing complete! The cleaned and averaged data has been saved.")

Processing complete! The cleaned and averaged data has been saved.


In [3]:
import pandas as pd

# Load the dataset
data = pd.read_csv('SOND_predictions_xgb_1_with_avgvalues.csv')

# Define thresholds for high/mid/low based on the overall dataset
high_payment_threshold = data['Avg_Premium'].quantile(0.75)
mid_payment_threshold = data['Avg_Premium'].quantile(0.5)
low_payment_threshold = data['Avg_Premium'].quantile(0.25)

high_discount_threshold = data['Avg_Discount'].quantile(0.75)
mid_discount_threshold = data['Avg_Discount'].quantile(0.5)
low_discount_threshold = data['Avg_Discount'].quantile(0.25)

high_clv_threshold = data['CLV'].quantile(0.75)
mid_clv_threshold = data['CLV'].quantile(0.5)
low_clv_threshold = data['CLV'].quantile(0.25)

# Assuming 'Churn Probability' is a column in the dataset
high_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.75)
mid_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.5)
low_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.25)

#Assign CLV and Payment categories
data['clv_category'] = data['CLV'].apply(
    lambda x: 'High' if x > high_clv_threshold else ('Mid' if x > mid_clv_threshold else 'Low')
)
data['payment_category'] = data['Avg_Premium'].apply(
    lambda x: 'High' if x > high_payment_threshold else ('Mid' if x > mid_payment_threshold else 'Low')
)
# Assign discount categories
data['discount_category'] = data['Avg_Discount'].apply(
    lambda x: 'High' if x > high_discount_threshold else ('Mid' if x > mid_discount_threshold else 'Low')
)
#Determine churn status from a column indicating churn, assuming 'Not Renewed' indicates churn
data['churn_category'] = data['Avg_Churn_Probability'].apply(
     lambda x: 'High' if x > high_churn_probability_threshold else ('Mid' if x > mid_churn_probability_threshold else 'Low')
)

# Filter the dataset for Not Renewed policies
not_renewed_data = data[data['Predicted Status'] == 'Not Renewed']

# Segment Single Loan Customers using all possible combinations
def segment_policy(row):
    # High Value Policy combinations
    if row['churn_category'] == 'Low':
        if row['payment_category'] == 'High':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['High', 'Mid']:
                    return 'High Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'High Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] in ['High', 'Mid']:
                    return 'High Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'High Value Policy'
        elif row['payment_category'] == 'Mid':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['Mid', 'Low']:
                    return 'High Value Policy'
                elif row['clv_category'] == 'High':
                    return 'High Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] in ['Low', 'Mid']:
                    return 'High Value Policy'
                elif row['clv_category'] == 'High':
                    return 'High Value Policy'
        elif row['payment_category'] == 'Low':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] in ['Mid', 'Low']:
                if row['clv_category'] in ['Low', 'Mid']:
                    return 'High Value Policy'
                elif row['clv_category'] == 'High':
                    return 'High Value Policy'
    
    # Mid Value Policy combinations
    if row['churn_category'] == 'Mid':
        if row['payment_category'] == 'High':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['High', 'Mid']:
                    return 'Mid Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'Mid Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] in ['High', 'Mid']:
                    return 'Mid Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'Mid Value Policy'
        elif row['payment_category'] == 'Mid':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['Mid', 'Low']:
                    return 'Mid Value Policy'
                elif row['clv_category'] == 'High':
                    return 'Mid Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] in ['Low', 'Mid']:
                    return 'Mid Value Policy'
                elif row['clv_category'] == 'High':
                    return 'Mid Value Policy'
        elif row['payment_category'] == 'Low':
            if row['discount_category'] == 'High':
                return 'Mid Value Policy'
            elif row['discount_category'] in ['Mid', 'Low']:
                if row['clv_category'] in ['Low', 'Mid']:
                    return 'Mid Value Policy'
                elif row['clv_category'] == 'High':
                    return 'Mid Value Policy'

    # Low value Policy combinations
    elif row['churn_category'] == 'High':
        if row['payment_category'] == 'High':
            if row['discount_category'] == 'High':
                if row['clv_category'] == 'High':
                    return 'Low Value Policy'
                elif row['clv_category'] == 'Mid':
                    return 'Low Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'Low Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['High', 'Mid']:
                    return 'Low Value Policy'
                elif row['clv_category'] == 'Low':
                    return 'Low Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] == 'High':
                    return 'Low Value Policy'
                elif row['clv_category'] in ['Mid', 'Low']:
                    return 'Low Value Policy'
        elif row['payment_category'] == 'Mid':
            if row['discount_category'] == 'High':
                if row['clv_category'] == 'High':
                    return 'Low Value Policy'
                elif row['clv_category'] in ['Mid', 'Low']:
                    return 'Low Value Policy'
            elif row['discount_category'] == 'Mid':
                if row['clv_category'] in ['Mid', 'Low']:
                    return 'Low Value Policy'
                elif row['clv_category'] == 'High':
                    return 'Low Value Policy'
            elif row['discount_category'] == 'Low':
                if row['clv_category'] in ['Low', 'Mid']:
                    return 'Low Value Policy'
                elif row['clv_category'] == 'High':
                    return 'Low Value Policy'
        elif row['payment_category'] == 'Low':
            if row['discount_category'] in ['High', 'Mid', 'Low']:
                if row['clv_category'] in ['High', 'Mid', 'Low']:
                    return 'Low Value Policy'
    
    # Default fallback
    return 'Other'


# Apply the segmentation function to the Not Renewed dataset
not_renewed_data['policy_segment'] = not_renewed_data.apply(segment_policy, axis=1)

# If you want to merge this back to the original dataset
data = data.merge(not_renewed_data[['Policy No', 'policy_segment']], on='Policy No', how='left')

# Save the results to a CSV file
data.to_csv('SOND_predictions_xgb_1_with_avgvalues_seg.csv', index=False)

# Print all threshold values
print("Threshold Values:")
print(f"Total Premium Payable - High: {high_payment_threshold}, Mid: {mid_payment_threshold}, Low: {low_payment_threshold}")
print(f"Applicable Discount with NCB - High: {high_discount_threshold}, Mid: {mid_discount_threshold}, Low: {low_discount_threshold}")
print(f"CLV - High: {high_clv_threshold}, Mid: {mid_clv_threshold}, Low: {low_clv_threshold}")
print(f"Churn Probability - High: {high_churn_probability_threshold}, Mid: {mid_churn_probability_threshold}, Low: {low_churn_probability_threshold}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_renewed_data['policy_segment'] = not_renewed_data.apply(segment_policy, axis=1)


Threshold Values:
Total Premium Payable - High: 13028.0, Mid: 9595.0, Low: 7004.0
Applicable Discount with NCB - High: 77.5, Mid: 70.0, Low: 55.0
CLV - High: 0.0640043700288852, Mid: 0.03242842836214955, Low: 0.0158827375582463
Churn Probability - High: 0.6908574700000001, Mid: 0.5998879575, Low: 0.457963275


In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('SOND_predictions_xgb_1_with_avgvalues.csv')

# Define thresholds for high/mid/low categories
high_payment_threshold = data['Avg_Premium'].quantile(0.75)
mid_payment_threshold = data['Avg_Premium'].quantile(0.5)
low_payment_threshold = data['Avg_Premium'].quantile(0.25)

high_discount_threshold = data['Avg_Discount'].quantile(0.75)
mid_discount_threshold = data['Avg_Discount'].quantile(0.5)
low_discount_threshold = data['Avg_Discount'].quantile(0.25)

high_clv_threshold = data['CLV'].quantile(0.75)
mid_clv_threshold = data['CLV'].quantile(0.5)
low_clv_threshold = data['CLV'].quantile(0.25)

high_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.75)
mid_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.5)
low_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.25)

# Assign CLV, Payment, Discount, and Churn categories
data['clv_category'] = data['CLV'].apply(
    lambda x: 'High' if x > high_clv_threshold else ('Mid' if x > mid_clv_threshold else 'Low')
)
data['payment_category'] = data['Avg_Premium'].apply(
    lambda x: 'High' if x > high_payment_threshold else ('Mid' if x > mid_payment_threshold else 'Low')
)
data['discount_category'] = data['Avg_Discount'].apply(
    lambda x: 'High' if x > high_discount_threshold else ('Mid' if x > mid_discount_threshold else 'Low')
)
data['churn_category'] = data['Avg_Churn_Probability'].apply(
    lambda x: 'High' if x > high_churn_probability_threshold else ('Mid' if x > mid_churn_probability_threshold else 'Low')
)

# Count the number of policies per customer
data['Policy Count'] = data.groupby('CustomerID')['Policy No'].transform('count')

# Ensure 'Not All Renewed' column exists properly
customer_policy_status = data.groupby('CustomerID')['Predicted Status'].apply(
    lambda x: 'All Renewed' if all(x == 'Renewed') else 'Not All Renewed'
).reset_index(name='Renewal Status')  # Renaming column explicitly

# Merge back with the main dataset
data = data.merge(customer_policy_status, on='CustomerID', how='left')

# Fill any missing values in 'Renewal Status'
data['Renewal Status'] = data['Renewal Status'].fillna('All Renewed')

# Function to segment customers based on the new criteria
def segment_policy(row):
    if row['Renewal Status'] == 'Not All Renewed':  # Process only those who have at least one Not Renewed policy
        if row['churn_category'] == 'Mid' and row['discount_category'] in ['Mid', 'Low'] and row['payment_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Elite Retainers'
        elif row['churn_category'] in ['Low', 'Mid'] and row['discount_category'] in ['High', 'Mid', 'Low'] and row['payment_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Potential Customers'
        elif row['churn_category'] == 'High' and row['discount_category'] in ['High', 'Mid', 'Low'] and row['payment_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Low Value Customers'
    return None

# Apply the segmentation function
data['policy_segment'] = data.apply(segment_policy, axis=1)

# Save the results to a CSV file
data.to_csv('SOND_predictions_xgb_1_with_avgvalues_seg.csv', index=False)

# Print threshold values
print("Threshold Values:")
print(f"Total Premium Payable - High: {high_payment_threshold}, Mid: {mid_payment_threshold}, Low: {low_payment_threshold}")
print(f"Applicable Discount with NCB - High: {high_discount_threshold}, Mid: {mid_discount_threshold}, Low: {low_discount_threshold}")
print(f"CLV - High: {high_clv_threshold}, Mid: {mid_clv_threshold}, Low: {low_clv_threshold}")
print(f"Churn Probability - High: {high_churn_probability_threshold}, Mid: {mid_churn_probability_threshold}, Low: {low_churn_probability_threshold}")

Threshold Values:
Total Premium Payable - High: 13028.0, Mid: 9595.0, Low: 7004.0
Applicable Discount with NCB - High: 77.5, Mid: 70.0, Low: 55.0
CLV - High: 0.0640043700288852, Mid: 0.03242842836214955, Low: 0.0158827375582463
Churn Probability - High: 0.6908574700000001, Mid: 0.5998879575, Low: 0.457963275


In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('SOND_predictions_xgb_1_with_avgvalues.csv')

high_discount_threshold = data['Avg_Discount'].quantile(0.75)
mid_discount_threshold = data['Avg_Discount'].quantile(0.5)
low_discount_threshold = data['Avg_Discount'].quantile(0.25)

high_clv_threshold = data['CLV'].quantile(0.75)
mid_clv_threshold = data['CLV'].quantile(0.5)
low_clv_threshold = data['CLV'].quantile(0.25)

high_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.75)
mid_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.5)
low_churn_probability_threshold = data['Avg_Churn_Probability'].quantile(0.25)

# Assign CLV, Payment, Discount, and Churn categories
data['clv_category'] = data['CLV'].apply(
    lambda x: 'High' if x > high_clv_threshold else ('Mid' if x > mid_clv_threshold else 'Low')
)
data['discount_category'] = data['Avg_Discount'].apply(
    lambda x: 'High' if x > high_discount_threshold else ('Mid' if x > mid_discount_threshold else 'Low')
)
data['churn_category'] = data['Avg_Churn_Probability'].apply(
    lambda x: 'High' if x > high_churn_probability_threshold else ('Mid' if x > mid_churn_probability_threshold else 'Low')
)

# Count the number of policies per customer
data['Policy Count'] = data.groupby('CustomerID')['Policy No'].transform('count')

# Ensure 'Not All Renewed' column exists properly
customer_policy_status = data.groupby('CustomerID')['Predicted Status'].apply(
    lambda x: 'All Renewed' if all(x == 'Renewed') else 'Not All Renewed'
).reset_index(name='Renewal Status')  # Renaming column explicitly

# Merge back with the main dataset
data = data.merge(customer_policy_status, on='CustomerID', how='left')

# Fill any missing values in 'Renewal Status'
data['Renewal Status'] = data['Renewal Status'].fillna('All Renewed')

# Function to segment customers based on the new criteria
def segment_policy(row):
    if row['Renewal Status'] == 'Not All Renewed':  # Process only those who have at least one Not Renewed policy
        if row['churn_category'] == 'Mid' and row['discount_category'] in ['Mid', 'Low'] and row['clv_category'] in ['High', 'Mid']:
            return 'Elite Retainers'
        elif row['churn_category'] in ['Low', 'Mid'] and row['discount_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Potential Customers'
        elif row['churn_category'] == 'High' and row['discount_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Low Value Customers'
    return None

# Apply the segmentation function
data['policy_segment'] = data.apply(segment_policy, axis=1)

# Save the results to a CSV file
data.to_csv('SOND_predictions_xgb_1_with_avgvalues_seg.csv', index=False)

# Print threshold values
print("Threshold Values:")
print(f"Applicable Discount with NCB - High: {high_discount_threshold}, Mid: {mid_discount_threshold}, Low: {low_discount_threshold}")
print(f"CLV - High: {high_clv_threshold}, Mid: {mid_clv_threshold}, Low: {low_clv_threshold}")
print(f"Churn Probability - High: {high_churn_probability_threshold}, Mid: {mid_churn_probability_threshold}, Low: {low_churn_probability_threshold}")

Threshold Values:
Applicable Discount with NCB - High: 77.5, Mid: 70.0, Low: 55.0
CLV - High: 0.0640043700288852, Mid: 0.03242842836214955, Low: 0.0158827375582463
Churn Probability - High: 0.6908574700000001, Mid: 0.5998879575, Low: 0.457963275


In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('SOND_predictions_xgb_1_with_avgvalues.csv')

high_discount_threshold = data['Avg_Discount'].quantile(0.75)
mid_discount_threshold = data['Avg_Discount'].quantile(0.5)
low_discount_threshold = data['Avg_Discount'].quantile(0.25)

high_clv_threshold = data['CLV'].quantile(0.75)
mid_clv_threshold = data['CLV'].quantile(0.5)
low_clv_threshold = data['CLV'].quantile(0.25)

high_churn_probability_threshold = 
mid_churn_probability_threshold = 
low_churn_probability_threshold = 

# Assign CLV, Payment, Discount, and Churn categories
data['clv_category'] = data['CLV'].apply(
    lambda x: 'High' if x > high_clv_threshold else ('Mid' if x > mid_clv_threshold else 'Low')
)
data['discount_category'] = data['Avg_Discount'].apply(
    lambda x: 'High' if x > high_discount_threshold else ('Mid' if x > mid_discount_threshold else 'Low')
)
data['churn_category'] = data['Avg_Churn_Probability'].apply(
    lambda x: 'High' if x > high_churn_probability_threshold else ('Mid' if x > mid_churn_probability_threshold else 'Low')
)

# Count the number of policies per customer
data['Policy Count'] = data.groupby('CustomerID')['Policy No'].transform('count')

# Ensure 'Not All Renewed' column exists properly
customer_policy_status = data.groupby('CustomerID')['Predicted Status'].apply(
    lambda x: 'All Renewed' if all(x == 'Renewed') else 'Not All Renewed'
).reset_index(name='Renewal Status')  # Renaming column explicitly

# Merge back with the main dataset
data = data.merge(customer_policy_status, on='CustomerID', how='left')

# Fill any missing values in 'Renewal Status'
data['Renewal Status'] = data['Renewal Status'].fillna('All Renewed')

# Function to segment customers based on the new criteria
def segment_policy(row):
    if row['Renewal Status'] == 'Not All Renewed':  # Process only those who have at least one Not Renewed policy
        if row['churn_category'] == 'Mid' and row['discount_category'] in ['Mid', 'Low'] and row['clv_category'] in ['High', 'Mid']:
            return 'Elite Retainers'
        elif row['churn_category'] in ['Low', 'Mid'] and row['discount_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Potential Customers'
        elif row['churn_category'] == 'High' and row['discount_category'] in ['High', 'Mid', 'Low'] and row['clv_category'] in ['High', 'Mid', 'Low']:
            return 'Low Value Customers'
    return None

# Apply the segmentation function
data['policy_segment'] = data.apply(segment_policy, axis=1)

# Save the results to a CSV file
data.to_csv('SOND_predictions_xgb_1_with_avgvalues_seg.csv', index=False)

# Print threshold values
print("Threshold Values:")
print(f"Applicable Discount with NCB - High: {high_discount_threshold}, Mid: {mid_discount_threshold}, Low: {low_discount_threshold}")
print(f"CLV - High: {high_clv_threshold}, Mid: {mid_clv_threshold}, Low: {low_clv_threshold}")
print(f"Churn Probability - High: {high_churn_probability_threshold}, Mid: {mid_churn_probability_threshold}, Low: {low_churn_probability_threshold}")