In [1]:
import pandas as pd

# Load the input CSV into a DataFrame; the path is relative to the working directory.
input_csv = 'XGB_predictions_JFM(Final)_with_reasons.csv'
df = pd.read_csv(input_csv)

# Lookup table: bucket name -> the exact reason labels that belong to it.
# Labels are compared verbatim, so spelling and capitalisation matter.
reason_buckets = {
    "Policy Type Related": [
        "Renewal Business Type",
        "Policy having TY Onwards Renewal Type",
        "PrivateCarPolicy Product",
        "Tie Up with MARUTI",
        "Tie Up with MIBL OEM",
        "Tie Up with TATA PV",
    ],
    "Vehicle Age Related": [
        "Young Vehicle Age",
    ],
    "Premium Related": [
        "Low Vehicle IDV",
        "High Own-Damage Premium",
        "Low Discount Offered",
        "Low No Claim Bonus Percentage",
        "High Total Premium Payable",
        "High Add-On Premium",
        "High Third-Party Premium",
        "Higher Renewal Premium Impact",
    ],
    "Claim Related": [
        "Claims Happened",
    ],
    "Organic Churn": [
        "Organic Churn",
    ],
}

# Function to classify reason buckets
def classify_reason(row):
    """Summarize a comma-separated reasons value as a phrase naming its buckets.

    Each reason is looked up in the module-level ``reason_buckets`` mapping
    (bucket name -> list of reason labels). The distinct matching bucket
    names are sorted alphabetically and rendered as ``"A issues"``,
    ``"A and B issues"`` or ``"A, B and C issues"``.

    Args:
        row: A single cell value: a string of reasons separated by commas,
            or a missing value (``None``/NaN). Any other non-string value is
            converted with ``str`` before it is parsed.

    Returns:
        The formatted phrase, or ``None`` when the value is missing, blank,
        or contains no reason listed in ``reason_buckets``.
    """
    # Missing values carry no reasons.
    if pd.isnull(row):
        return None

    # Split on the bare comma and trim every piece so that "A,B", "A, B" and
    # "A ,  B" all parse alike; none of the reason labels contains a comma.
    reasons = {piece.strip() for piece in str(row).split(",")}
    reasons.discard("")  # blank input or stray/trailing commas

    # A bucket applies as soon as any one of its labels is present.
    categories = sorted(
        category
        for category, keywords in reason_buckets.items()
        if not reasons.isdisjoint(keywords)
    )

    if not categories:
        return None
    if len(categories) == 1:
        return categories[0] + " issues"

    # Two or more: comma-separate all but the last, which is joined by "and".
    leading = ", ".join(categories[:-1])
    return leading + " and " + categories[-1] + " issues"

# Classify each row's reasons WITHOUT overwriting the source column.
# A blanket astype(str) assigned back to the column turns missing values into
# the literal text "nan", which would then be written to the output file.
# Instead, stringify only the non-missing cells in a temporary Series and
# leave the rest as NaN so that classify_reason's null check handles them.
reasons = df['Not Renewed Reasons']
reasons_as_text = reasons.astype(str).where(reasons.notna())
df['Reason Buckets'] = reasons_as_text.apply(classify_reason)

# Write the input columns plus the new 'Reason Buckets' column; the row index is omitted.
df.to_csv('XGB_predictions_JFM(Final)_with_reasons_buckets.csv', index=False)