In [5]:
import pandas as pd

# Read the source workbook into `data`; every later cell works off this frame.
# The path is relative, so it is resolved against the notebook's working directory.
BASE_DATA_PATH = 'cleaned_merged_base_data.xlsx'
data = pd.read_excel(BASE_DATA_PATH)

In [None]:
# Derive the grouping year from the calendar year of 'Policy Start Date'.
# NOTE(review): only 'Policy Start Date' is read here; if a separate
# "next year" start-date column was meant, confirm the column choice.
policy_start = pd.to_datetime(data['Policy Start Date'])
data['Start Year'] = policy_start.dt.year


def _policy_status_label(booked_flag):
    """Translate a raw 'booked' value into a policy-status label.

    0 maps to 'Not Renewed', 1 maps to 'Renewed', and every other value
    maps to 'Open'.
    """
    if booked_flag == 0:
        return 'Not Renewed'
    if booked_flag == 1:
        return 'Renewed'
    return 'Open'


# Label every row of `data` in place.
data['Policy Status'] = data['booked'].map(_policy_status_label)

# Eyeball the first few rows to confirm the mapping.
print(data[['booked', 'Policy Status']].head())

# Columns that are coerced to numeric before aggregation.
# Values that cannot be parsed become NaN (errors='coerce').
numeric_columns = [
    'Total Premium Payable ',
    'Total OD Premium',
    'Total TP Premium',
    'NCB % Previous Year',
    'Nil Depreciation',
    'Passenger Assist',
    'Consumable Cover',
    'Engine Safe Cover',
    'Claim in last year',
    'Road Side Assistance',
    'Key Loss',
    'gst',
    'Applicable Discount with NCB',
    'Before GST Add-on GWP',
]
# Column-wise conversion: each listed column is passed to pd.to_numeric on its own.
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Build a per-vehicle key: trimmed 'Reg no ' + '_' + trimmed 'variant'.
# It is stored under 'Number of Vehicles' because the per-customer aggregation
# reduces this column with 'nunique', turning the keys into a vehicle count.
registration = data['Reg no '].str.strip()
variant_name = data['variant'].str.strip()
data['Number of Vehicles'] = registration + '_' + variant_name

def _join_unique(values):
    """Comma-join the distinct non-null values of one group as strings."""
    return ', '.join(values.dropna().unique().astype(str))


def _join_all(values):
    """Comma-join every non-null value of one group as strings, duplicates included."""
    return ', '.join(values.dropna().astype(str))


def _count_yes(values):
    """Count the entries of one group that equal 'Yes'."""
    return values[values == 'Yes'].count()


# One entry per output column, in output order. Descriptive text columns are
# collapsed to their distinct values; money columns are summed; discount / NCB
# percentages are averaged.
customer_year_aggregations = {
    'Cleaned_Insured name': _join_unique,
    'Reg no ': _join_unique,
    'MANUFACTURER/Make': _join_unique,
    'model': _join_unique,
    'variant': _join_unique,
    'Fuel Type': _join_unique,
    'RTO Location ': _join_unique,
    'Product name ': _join_unique,
    'Product name  2': _join_unique,
    'biztype': _join_unique,
    'Renewal Type': _join_unique,
    # 'age' keeps duplicates, i.e. one entry per non-null row.
    'age': _join_all,
    'Vehicle Segment': _join_unique,
    'Number of Vehicles': 'nunique',
    'Policy Start Date': 'min',
    'Policy End Date': 'max',
    # 'count' tallies non-null policy numbers only.
    'Initial Policy No': 'count',
    'Total Premium Payable ': 'sum',
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Before GST Add-on GWP': 'sum',
    'gst': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'Nil Depreciation': 'sum',
    'Passenger Assist': 'sum',
    'Consumable Cover': 'sum',
    'Engine Safe Cover': 'sum',
    'Road Side Assistance': 'sum',
    'Key Loss': 'sum',
    'Number of Claims': 'sum',
    'decline': _count_yes,
    'New Branch Name  2': _join_unique,
    'Zone 2': _join_unique,
    'state2': _join_unique,
    # Duplicates are kept on purpose: the churn step later splits this string
    # back into one status per policy row.
    'Policy Status': _join_all,
}

# Collapse the policy rows to one row per (CustomerID, Start Year).
customer_data = (
    data
    .groupby(['CustomerID', 'Start Year'])
    .agg(customer_year_aggregations)
    .reset_index()
)

def _status_metrics(policies, label):
    """Aggregate premium metrics per (CustomerID, Start Year) for one policy status.

    Parameters
    ----------
    policies : pandas.DataFrame
        Policy rows already filtered to a single 'Policy Status' value.
    label : str
        Status name placed in parentheses in every output column name,
        e.g. 'Renewed' gives 'Total OD Premium (Renewed)'.

    Returns
    -------
    pandas.DataFrame
        One row per (CustomerID, Start Year) with the two keys as ordinary
        columns, followed by the summed premium / GST / add-on columns and
        the averaged discount / NCB columns.
    """
    per_group = policies.groupby(['CustomerID', 'Start Year']).agg({
        'Total OD Premium': 'sum',
        'Total TP Premium': 'sum',
        'Total Premium Payable ': 'sum',
        'Applicable Discount with NCB': 'mean',
        'NCB % Previous Year': 'mean',
        'gst': 'sum',
        'Before GST Add-on GWP': 'sum',
    })
    labelled = per_group.rename(columns={
        'Total OD Premium': f'Total OD Premium ({label})',
        'Total TP Premium': f'Total TP Premium ({label})',
        'Total Premium Payable ': f'Total Premium ({label})',
        'Applicable Discount with NCB': f'Average Discount ({label})',
        'NCB % Previous Year': f'Average NCB % Previous ({label})',
        'gst': f'Total GST ({label})',
        'Before GST Add-on GWP': f'Total Add on Premium ({label})',
    })
    return labelled.reset_index()


# Split the policy rows by status. 'Open' rows are not included in either subset.
renewed_data = data[data['Policy Status'] == 'Renewed']
not_renewed_data = data[data['Policy Status'] == 'Not Renewed']

# The same aggregation is applied to both subsets; only the column-name label differs.
renewed_metrics = _status_metrics(renewed_data, 'Renewed')
not_renewed_metrics = _status_metrics(not_renewed_data, 'Not Renewed')

# Attach the per-status metrics to the customer-year table. Left joins keep
# every customer-year row even when it has no policies of a given status.
merge_keys = ['CustomerID', 'Start Year']
for status_metrics in (renewed_metrics, not_renewed_metrics):
    customer_data = customer_data.merge(status_metrics, on=merge_keys, how='left')

# Per-status columns that are NaN where a customer-year had no matching policies.
aggregated_columns = [
    'Total OD Premium (Renewed)',
    'Total TP Premium (Renewed)',
    'Total Premium (Renewed)',
    'Average Discount (Renewed)',
    'Average NCB % Previous (Renewed)',
    'Total GST (Renewed)',
    'Total Add on Premium (Renewed)',
    'Total OD Premium (Not Renewed)',
    'Total TP Premium (Not Renewed)',
    'Total Premium (Not Renewed)',
    'Average Discount (Not Renewed)',
    'Average NCB % Previous (Not Renewed)',
    'Total GST (Not Renewed)',
    'Total Add on Premium (Not Renewed)',
]
# Replace those gaps with 0.
# NOTE(review): this also zero-fills the 'Average ...' columns, so "no policies
# of this status" becomes indistinguishable from a true 0 average — confirm intended.
zero_filled = customer_data[aggregated_columns].fillna(0)
customer_data[aggregated_columns] = zero_filled

# 'Initial Policy No' was aggregated with 'count', so after this rename the
# column holds the number of non-null policy numbers per customer-year.
customer_data = customer_data.rename(columns={'Initial Policy No': 'Number of Policies'})

# Count Not Renewed / Open / Renewed policies per (CustomerID, Start Year)
# straight from the raw 'booked' flag, in a single vectorised pass.
# NOTE(review): 'Open' counts only rows whose 'booked' value is the string '-',
# whereas the 'Policy Status' label treats every value other than 0/1 as
# 'Open' — confirm the two definitions are meant to differ.
group_keys = ['CustomerID', 'Start Year']
booked_flags = pd.DataFrame({
    'CustomerID': data['CustomerID'],
    'Start Year': data['Start Year'],
    'Not Renewed': data['booked'] == 0,
    'Open': data['booked'] == '-',
    'Renewed': data['booked'] == 1,
})
# Summing the boolean flags gives the number of matching rows in each group.
status_counts = booked_flags.groupby(group_keys, as_index=False).sum()

# Join on the group keys rather than assigning by position, so the counts
# cannot be attached to the wrong customer-year if row order ever differs.
# validate='one_to_one' makes pandas raise if either side has duplicate keys.
customer_data = customer_data.merge(
    status_counts, on=group_keys, how='left', validate='one_to_one'
)

# Share of counted policies that were renewed. A zero policy count is turned
# into NaN first, so the rate is NaN rather than inf for such rows.
policy_counts = customer_data['Number of Policies']
customer_data['Renewal Rate'] = customer_data['Renewed'] / policy_counts.where(policy_counts != 0)

def _split_status_labels(joined_statuses):
    """Split a comma-separated status string into a list of whitespace-trimmed labels."""
    return [label.strip() for label in joined_statuses.split(',')]


# Turn each row's comma-joined 'Policy Status' string back into a list of labels.
customer_data['status_list'] = customer_data['Policy Status'].apply(_split_status_labels)

# Combine the status lists within each (CustomerID, Start Year) group;
# summing Python lists concatenates them.
status_lists_by_group = customer_data.groupby(['CustomerID', 'Start Year'])['status_list']
grouped_status = status_lists_by_group.sum().reset_index()

# Churn rule: a group counts as churned only if none of its policies has a
# status other than 'Not Renewed'.
def determine_churn(status_list):
    """Return 'Yes' when every status in `status_list` is 'Not Renewed', else 'No'.

    An empty `status_list` yields 'Yes', because no status contradicts the rule.
    """
    for policy_status in status_list:
        if policy_status != 'Not Renewed':
            return 'No'
    return 'Yes'

# Label each (CustomerID, Start Year) group from its combined status list.
grouped_status['Churn Label'] = grouped_status['status_list'].map(determine_churn)

# Carry the label back onto customer_data through the shared group keys.
# No `how` is passed, so pandas' default join type applies.
churn_keys = ['CustomerID', 'Start Year']
churn_labels = grouped_status[churn_keys + ['Churn Label']]
customer_data = customer_data.merge(churn_labels, on=churn_keys)

# Give the all-policies aggregates an "(Overall)" suffix so they read
# consistently next to the "(Renewed)" / "(Not Renewed)" columns.
overall_column_names = {
    'Applicable Discount with NCB': 'Average Discount (Overall)',
    'NCB % Previous Year': 'Average NCB % Previous Year (Overall)',
    'Total Premium Payable ': 'Total Premium Payable (Overall)',
    'Before GST Add-on GWP': 'Total Add on Premium (Overall)',
    'gst': 'Total GST (Overall)',
    'Total OD Premium': 'Total OD Premium (Overall)',
    'Total TP Premium': 'Total TP Premium (Overall)',
    'decline': 'Number of Declines',
}
customer_data = customer_data.rename(columns=overall_column_names)

# Write the customer-year table to CSV without the row index.
# NOTE(review): the helper column 'status_list' (Python lists) is still present
# at this point and is written too — confirm whether it belongs in the export.
customer_data.to_csv('customer_aggregated_yearly_data.csv', index=False)

  booked Policy Status
0      1       Renewed
1      1       Renewed
2      1       Renewed
3      0   Not Renewed
4      1       Renewed
