In [None]:
import pandas as pd

# Load the cleaned, merged base workbook into `data`, the DataFrame that the
# following cell filters, enriches and aggregates.
data = pd.read_excel('cleaned_merged_base_data.xlsx')

In [None]:
# Keep only rows that have a 'Next year Policy Start Date'. The explicit
# .copy() gives `data` its own storage, so the column assignments below act on
# a real DataFrame instead of a filtered slice (assigning into such a slice is
# what makes pandas emit SettingWithCopyWarning).
data = data[data['Next year Policy Start Date'].notna()].copy()

# Calendar year of the next-year policy start date, used as a grouping key.
data['Start Year'] = pd.to_datetime(data['Next year Policy Start Date']).dt.year


def _policy_status(booked):
    """Translate one 'Next year Booked' value into a policy status label."""
    if booked == 0:
        return 'Not Renewed'
    if booked == '-':
        return 'Open'
    # Every other value (a missing value included) is labelled 'Renewed'.
    return 'Renewed'


# Map 'Policy Status' directly in the 'data' DataFrame
data['Policy Status'] = data['Next year Booked'].map(_policy_status)

# Verify the mapping
print(data[['Next year Booked', 'Policy Status']].head())

# Coerce the premium / cover columns to numbers so that sum and mean work;
# values that cannot be parsed become NaN (errors='coerce').
numeric_columns = [
    'Next year Total Premium Payable', 'Next year Total OD Premium', 'Next year Total TP Premium', 'Next year NCB % Previous Year',
    'Next year Nil Depreciation', 'Next year Passenger Assist', 'Next year Consumable Cover', 'Next year Engine Safe Cover',
    'Next year Road Side Assistance', 'Next year Key Loss', 'Next year GST', 'Next year Discount', 'Next year Add on Premium'
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Per-vehicle identifier: stripped registration number and stripped variant
# joined by '_'. The column is later aggregated with 'nunique', which is why
# it already carries the name 'Number of Vehicles'.
data['Number of Vehicles'] = data['Reg no '].str.strip() + '_' + data['Variant'].str.strip()

# Missing or empty policy numbers are replaced by the placeholder 'Empty'.
data['New Policy No'] = data['New Policy No'].fillna('Empty').replace('', 'Empty')

def _join_unique(values):
    """Comma-join the distinct non-null values of one group, as strings."""
    return ', '.join(values.dropna().unique().astype(str))


def _join_all(values):
    """Comma-join every non-null value of one group (duplicates are kept)."""
    return ', '.join(values.dropna().astype(str))


# Column -> aggregation. The key order here is the column order of the result.
customer_aggregations = {
    'Cleaned_Insured name': _join_unique,
    'Reg no ': _join_unique,
    'MANUFACTURER/Make': _join_unique,
    'Model': _join_unique,
    'Variant': _join_unique,
    'Fuel Type': _join_unique,
    'RTO Location ': _join_unique,
    'Product name ': _join_unique,
    'Product name  2': _join_unique,
    'Biztype': _join_unique,
    'Next year Age': _join_all,
    'Vehicle Segment': _join_unique,
    'Number of Vehicles': 'nunique',
    'Next year Policy Start Date': 'min',
    'Next year Policy End Date': 'max',
    'New Policy No': 'nunique',
    'Next year Total Premium Payable': 'sum',
    'Next year Total OD Premium': 'sum',
    'Next year Total TP Premium': 'sum',
    'Next year Add on Premium': 'sum',
    'Next year GST': 'sum',
    'Next year Discount': 'mean',
    'Next year NCB % Previous Year': 'mean',
    'Next year Nil Depreciation': 'sum',
    'Next year Passenger Assist': 'sum',
    'Next year Consumable Cover': 'sum',
    'Next year Engine Safe Cover': 'sum',
    'Next year Road Side Assistance': 'sum',
    'Next year Key Loss': 'sum',
    'Next year claim': 'sum',
    'New Branch Name  2': _join_unique,
    'Zone 2': _join_unique,
    'State2': _join_unique,
    'Policy Status': _join_all,
}

# One row per (CustomerID, Start Year) with the aggregations above applied.
customer_data = (
    data.groupby(['CustomerID', 'Start Year'])
    .agg(customer_aggregations)
    .reset_index()
)

# Split the rows by policy status; each subset is summarised separately.
renewed_data = data.loc[data['Policy Status'].eq('Renewed')]
not_renewed_data = data.loc[data['Policy Status'].eq('Not Renewed')]

# Both subsets are grouped and aggregated the same way; only the output
# column names differ.
status_group_keys = ['CustomerID', 'Start Year']
status_aggregations = {
    'Next year Total OD Premium': 'sum',
    'Next year Total TP Premium': 'sum',
    'Next year Total Premium Payable': 'sum',
    'Next year Discount': 'mean',
    'Next year NCB % Previous Year': 'mean',
    'Next year GST': 'sum',
    'Next year Add on Premium': 'sum',
}

renewed_column_names = {
    'Next year Total OD Premium': 'Total OD Premium (Renewed)',
    'Next year Total TP Premium': 'Total TP Premium (Renewed)',
    'Next year Total Premium Payable': 'Total Premium (Renewed)',
    'Next year Discount': 'Average Discount (Renewed)',
    'Next year NCB % Previous Year': 'Average NCB % Previous (Renewed)',
    'Next year GST': 'Total GST (Renewed)',
    'Next year Add on Premium': 'Total Add on Premium (Renewed)',
}
not_renewed_column_names = {
    'Next year Total OD Premium': 'Total OD Premium (Not Renewed)',
    'Next year Total TP Premium': 'Total TP Premium (Not Renewed)',
    'Next year Total Premium Payable': 'Total Premium (Not Renewed)',
    'Next year Discount': 'Average Discount (Not Renewed)',
    'Next year NCB % Previous Year': 'Average NCB % Previous (Not Renewed)',
    'Next year GST': 'Total GST (Not Renewed)',
    'Next year Add on Premium': 'Total Add on Premium (Not Renewed)',
}

renewed_metrics = (
    renewed_data.groupby(status_group_keys)
    .agg(status_aggregations)
    .rename(columns=renewed_column_names)
    .reset_index()
)

not_renewed_metrics = (
    not_renewed_data.groupby(status_group_keys)
    .agg(status_aggregations)
    .rename(columns=not_renewed_column_names)
    .reset_index()
)

# Attach the status-specific metrics to the per-customer rows. Left joins keep
# every customer/year row even when it has no rows of a given status.
merge_keys = ['CustomerID', 'Start Year']
for status_metrics in (renewed_metrics, not_renewed_metrics):
    customer_data = customer_data.merge(status_metrics, on=merge_keys, how='left')

# Replace missing values in the status-specific columns with an empty string.
# NOTE(review): an earlier comment here said the fill value was 0, but the
# code uses ''; confirm which is intended (filling '' can turn these numeric
# columns into object columns).
aggregated_columns = [
    'Total OD Premium (Renewed)', 'Total TP Premium (Renewed)', 'Total Premium (Renewed)',
    'Average Discount (Renewed)', 'Average NCB % Previous (Renewed)', 'Total GST (Renewed)', 'Total Add on Premium (Renewed)',
    'Total OD Premium (Not Renewed)', 'Total TP Premium (Not Renewed)', 'Total Premium (Not Renewed)',
    'Average Discount (Not Renewed)', 'Average NCB % Previous (Not Renewed)', 'Total GST (Not Renewed)', 'Total Add on Premium (Not Renewed)'
]
filled_metrics = customer_data[aggregated_columns].fillna('')
customer_data[aggregated_columns] = filled_metrics

# The 'nunique' of 'New Policy No' is the per-group policy count.
customer_data = customer_data.rename(columns={'New Policy No': 'Number of Policies'})

# Count 'Not Renewed' (Booked == 0) and 'Open' (Booked == '-') rows per
# (CustomerID, Start Year). The counts are merged on the group keys rather
# than assigned positionally, so they stay attached to the right customer
# even if the row order of customer_data ever differs from the groupby order.
status_group_keys = ['CustomerID', 'Start Year']
booked_by_group = data.groupby(status_group_keys)['Next year Booked']
status_counts = pd.DataFrame({
    'Not Renewed': booked_by_group.apply(lambda booked: (booked == 0).sum()),
    'Open': booked_by_group.apply(lambda booked: (booked == '-').sum()),
}).reset_index()
customer_data = customer_data.merge(status_counts, on=status_group_keys, how='left')

# Set Renewed as 0 since there are no renewed cases in 'Next Year Booked'.
# NOTE(review): 'Policy Status' labels every value other than 0 and '-' as
# 'Renewed'; confirm the data really holds no such values.
customer_data['Renewed'] = 0
customer_data['Renewal Rate'] = customer_data['Renewed'] / customer_data['Number of Policies']

def _split_statuses(joined_statuses):
    """Turn a comma-joined status string into a list of trimmed labels."""
    return [label.strip() for label in joined_statuses.split(',')]


# One list of individual status labels per customer/year row.
customer_data['status_list'] = customer_data['Policy Status'].apply(_split_statuses)

# Concatenate the status lists per (CustomerID, Start Year); summing lists
# joins them end to end.
grouped_status = (
    customer_data.groupby(['CustomerID', 'Start Year'])['status_list']
    .sum()
    .reset_index()
)

# Define the churn function
def determine_churn(status_list):
    """Return 'Yes' when every status in ``status_list`` is 'Not Renewed'.

    Any other status in the list yields 'No'. An empty ``status_list`` also
    yields 'No': a bare ``all()`` over no items is vacuously true and would
    otherwise label a group without any recorded status as churned.
    """
    statuses = list(status_list)
    all_not_renewed = bool(statuses) and all(
        status == 'Not Renewed' for status in statuses
    )
    return 'Yes' if all_not_renewed else 'No'

# Label every (CustomerID, Start Year) group with determine_churn.
churn_labels = grouped_status['status_list'].apply(determine_churn)
grouped_status['Churn Label'] = churn_labels

# Bring the label back onto the per-customer rows (inner join on the keys).
churn_columns = ['CustomerID', 'Start Year', 'Churn Label']
customer_data = pd.merge(
    customer_data,
    grouped_status[churn_columns],
    on=['CustomerID', 'Start Year'],
)

# Final report names for the overall (all statuses) columns.
overall_column_names = {
    'Next year Discount': 'Average Discount (Overall)',
    'Next year NCB % Previous Year': 'Average NCB % Previous Year (Overall)',
    'Next year Policy Start Date': 'Policy Start Date',
    'Next year Policy End Date': 'Policy End Date',
    'Next year Total Premium Payable': 'Total Premium Payable (Overall)',
    'Next year Nil Depreciation': 'Nil Depreciation',
    'Next year Passenger Assist': 'Passenger Assist',
    'Next year Consumable Cover': 'Consumable Cover',
    'Next year Engine Safe Cover': 'Engine Safe Cover',
    'Next year Road Side Assistance': 'Road Side Assistance',
    'Next year Key Loss': 'Key Loss',
    'Next year Age': 'Age',
    'Next year Add on Premium': 'Total Add on Premium (Overall)',
    'Next year GST': 'Total GST (Overall)',
    'Next year Total OD Premium': 'Total OD Premium (Overall)',
    'Next year Total TP Premium': 'Total TP Premium (Overall)',
    'Next year claim': 'Number of Claims',
}
customer_data = customer_data.rename(columns=overall_column_names)

# Write the aggregated table to CSV without the row index.
customer_data.to_csv('customer_aggregated_yearly_data (RH).csv', index=False)

   Next year Booked Policy Status
3                 0   Not Renewed
6                 0   Not Renewed
7                 0   Not Renewed
8                 0   Not Renewed
10                0   Not Renewed


In [None]:
import pandas as pd

# Reload the cleaned, merged base workbook into `data`; the following cell
# aggregates its current-year columns.
data = pd.read_excel('cleaned_merged_base_data.xlsx')

In [None]:
# Calendar year of 'Policy Start Date', used as a grouping key.
policy_start = pd.to_datetime(data['Policy Start Date'])
data['Start Year'] = policy_start.dt.year


def _policy_status(booked):
    """Translate one 'BOOKED' value into a policy status label."""
    if booked == 0:
        return 'Not Renewed'
    if booked == '-':
        return 'Open'
    # Every other value (a missing value included) is labelled 'Renewed'.
    return 'Renewed'


data['Policy Status'] = data['BOOKED'].map(_policy_status)

# Show the first few rows to check the mapping.
print(data[['BOOKED', 'Policy Status']].head())

# Coerce the premium / cover columns to numbers so that sum and mean work;
# values that cannot be parsed become NaN (errors='coerce').
numeric_columns = [
    'Total Premium Payable ', 'Total OD Premium', 'Total TP Premium', 'NCB % Previous Year',
    'Nil Depreciation', 'Passenger Assist', 'Consumable Cover', 'Engine Safe Cover',
    'Road Side Assistance', 'Key Loss', 'GST', 'Applicable Discount with NCB', 'Before GST Add-on GWP'
]
for column in numeric_columns:
    converted = pd.to_numeric(data[column], errors='coerce')
    data[column] = converted

# Per-vehicle identifier: stripped registration number and stripped variant
# joined by '_'. The column is later aggregated with 'nunique'.
registration = data['Reg no '].str.strip()
variant = data['Variant'].str.strip()
data['Number of Vehicles'] = registration + '_' + variant

def _join_unique(values):
    """Comma-join the distinct non-null values of one group, as strings."""
    return ', '.join(values.dropna().unique().astype(str))


def _join_all(values):
    """Comma-join every non-null value of one group (duplicates are kept)."""
    return ', '.join(values.dropna().astype(str))


def _count_yes(values):
    """Count the entries of one group that equal 'Yes'."""
    return values[values == 'Yes'].count()


# Column -> aggregation. The key order here is the column order of the result.
customer_aggregations = {
    'Cleaned_Insured name': _join_unique,
    'Reg no ': _join_unique,
    'MANUFACTURER/Make': _join_unique,
    'Model': _join_unique,
    'Variant': _join_unique,
    'Fuel Type': _join_unique,
    'RTO Location ': _join_unique,
    'Product name ': _join_unique,
    'Product name  2': _join_unique,
    'Biztype': _join_unique,
    'Age': _join_all,
    'Vehicle Segment': _join_unique,
    'Number of Vehicles': 'nunique',
    'Policy Start Date': 'min',
    'Policy End Date': 'max',
    'Initial Policy No': 'nunique',
    'Total Premium Payable ': 'sum',
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Before GST Add-on GWP': 'sum',
    'GST': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'Nil Depreciation': 'sum',
    'Passenger Assist': 'sum',
    'Consumable Cover': 'sum',
    'Engine Safe Cover': 'sum',
    'Road Side Assistance': 'sum',
    'Key Loss': 'sum',
    'Claim in last year': _count_yes,
    'New Branch Name  2': _join_unique,
    'Zone 2': _join_unique,
    'State2': _join_unique,
    'Policy Status': _join_all,
}

# One row per (CustomerID, Start Year) with the aggregations above applied.
customer_data = (
    data.groupby(['CustomerID', 'Start Year'])
    .agg(customer_aggregations)
    .reset_index()
)

# Split the rows by policy status; each subset is summarised separately.
renewed_data = data.loc[data['Policy Status'].eq('Renewed')]
not_renewed_data = data.loc[data['Policy Status'].eq('Not Renewed')]

# Both subsets are grouped and aggregated the same way; only the output
# column names differ.
status_group_keys = ['CustomerID', 'Start Year']
status_aggregations = {
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Total Premium Payable ': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'GST': 'sum',
    'Before GST Add-on GWP': 'sum',
}

renewed_column_names = {
    'Total OD Premium': 'Total OD Premium (Renewed)',
    'Total TP Premium': 'Total TP Premium (Renewed)',
    'Total Premium Payable ': 'Total Premium (Renewed)',
    'Applicable Discount with NCB': 'Average Discount (Renewed)',
    'NCB % Previous Year': 'Average NCB % Previous (Renewed)',
    'GST': 'Total GST (Renewed)',
    'Before GST Add-on GWP': 'Total Add on Premium (Renewed)',
}
not_renewed_column_names = {
    'Total OD Premium': 'Total OD Premium (Not Renewed)',
    'Total TP Premium': 'Total TP Premium (Not Renewed)',
    'Total Premium Payable ': 'Total Premium (Not Renewed)',
    'Applicable Discount with NCB': 'Average Discount (Not Renewed)',
    'NCB % Previous Year': 'Average NCB % Previous (Not Renewed)',
    'GST': 'Total GST (Not Renewed)',
    'Before GST Add-on GWP': 'Total Add on Premium (Not Renewed)',
}

renewed_metrics = (
    renewed_data.groupby(status_group_keys)
    .agg(status_aggregations)
    .rename(columns=renewed_column_names)
    .reset_index()
)

not_renewed_metrics = (
    not_renewed_data.groupby(status_group_keys)
    .agg(status_aggregations)
    .rename(columns=not_renewed_column_names)
    .reset_index()
)

# Attach the status-specific metrics to the per-customer rows. Left joins keep
# every customer/year row even when it has no rows of a given status.
merge_keys = ['CustomerID', 'Start Year']
for status_metrics in (renewed_metrics, not_renewed_metrics):
    customer_data = customer_data.merge(status_metrics, on=merge_keys, how='left')

# Replace missing values in the status-specific columns with an empty string.
# NOTE(review): an earlier comment here said the fill value was 0, but the
# code uses ''; confirm which is intended (filling '' can turn these numeric
# columns into object columns).
aggregated_columns = [
    'Total OD Premium (Renewed)', 'Total TP Premium (Renewed)', 'Total Premium (Renewed)',
    'Average Discount (Renewed)', 'Average NCB % Previous (Renewed)', 'Total GST (Renewed)', 'Total Add on Premium (Renewed)',
    'Total OD Premium (Not Renewed)', 'Total TP Premium (Not Renewed)', 'Total Premium (Not Renewed)',
    'Average Discount (Not Renewed)', 'Average NCB % Previous (Not Renewed)', 'Total GST (Not Renewed)', 'Total Add on Premium (Not Renewed)'
]
filled_metrics = customer_data[aggregated_columns].fillna('')
customer_data[aggregated_columns] = filled_metrics

# The 'nunique' of 'Initial Policy No' is the per-group policy count.
customer_data = customer_data.rename(columns={'Initial Policy No': 'Number of Policies'})

# Count 'Not Renewed' (BOOKED == 0), 'Open' (BOOKED == '-') and 'Renewed'
# (BOOKED == 1) rows per (CustomerID, Start Year). The groupby is built once
# and the counts are merged on the group keys rather than assigned
# positionally, so they stay attached to the right customer even if the row
# order of customer_data ever differs from the groupby order.
status_group_keys = ['CustomerID', 'Start Year']
booked_by_group = data.groupby(status_group_keys)['BOOKED']
status_counts = pd.DataFrame({
    'Not Renewed': booked_by_group.apply(lambda booked: (booked == 0).sum()),
    'Open': booked_by_group.apply(lambda booked: (booked == '-').sum()),
    'Renewed': booked_by_group.apply(lambda booked: (booked == 1).sum()),
}).reset_index()
customer_data = customer_data.merge(status_counts, on=status_group_keys, how='left')

# Share of the group's distinct policies that were renewed.
customer_data['Renewal Rate'] = customer_data['Renewed'] / customer_data['Number of Policies']

def _split_statuses(joined_statuses):
    """Turn a comma-joined status string into a list of trimmed labels."""
    return [label.strip() for label in joined_statuses.split(',')]


# One list of individual status labels per customer/year row.
customer_data['status_list'] = customer_data['Policy Status'].apply(_split_statuses)

# Concatenate the status lists per (CustomerID, Start Year); summing lists
# joins them end to end.
grouped_status = (
    customer_data.groupby(['CustomerID', 'Start Year'])['status_list']
    .sum()
    .reset_index()
)

# Define the churn function
def determine_churn(status_list):
    """Return 'Yes' when every status in ``status_list`` is 'Not Renewed'.

    Any other status in the list yields 'No'. An empty ``status_list`` also
    yields 'No': a bare ``all()`` over no items is vacuously true and would
    otherwise label a group without any recorded status as churned.
    """
    statuses = list(status_list)
    all_not_renewed = bool(statuses) and all(
        status == 'Not Renewed' for status in statuses
    )
    return 'Yes' if all_not_renewed else 'No'

# Label every (CustomerID, Start Year) group with determine_churn.
churn_labels = grouped_status['status_list'].apply(determine_churn)
grouped_status['Churn Label'] = churn_labels

# Bring the label back onto the per-customer rows (inner join on the keys).
churn_columns = ['CustomerID', 'Start Year', 'Churn Label']
customer_data = pd.merge(
    customer_data,
    grouped_status[churn_columns],
    on=['CustomerID', 'Start Year'],
)

# Final report names for the overall (all statuses) columns.
overall_column_names = {
    'Applicable Discount with NCB': 'Average Discount (Overall)',
    'NCB % Previous Year': 'Average NCB % Previous Year (Overall)',
    'Total Premium Payable ': 'Total Premium Payable (Overall)',
    'Before GST Add-on GWP': 'Total Add on Premium (Overall)',
    'GST': 'Total GST (Overall)',
    'Total OD Premium': 'Total OD Premium (Overall)',
    'Total TP Premium': 'Total TP Premium (Overall)',
    'Claim in last year': 'Number of Claims',
}
customer_data = customer_data.rename(columns=overall_column_names)

# Write the aggregated table to CSV without the row index.
customer_data.to_csv('customer_aggregated_yearly_data (base data).csv', index=False)

   BOOKED Policy Status
0       1       Renewed
1       1       Renewed
2       1       Renewed
3       1       Renewed
4       1       Renewed
