In [1]:
import pandas as pd

data = pd.read_excel('cleaned_merged_base_data.xlsx')

In [None]:
import pandas as pd

# Convert 'Policy Start Date' and 'Policy End Date' to datetime
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
data['Policy End Date'] = pd.to_datetime(data['Policy End Date'])

# Convert relevant columns to numeric type to handle aggregation correctly
numeric_columns = [
    'Total Premium Payable ', 'Total OD Premium', 'Total TP Premium', 'NCB % Previous Year', 
    'Nil Depreciation', 'Passenger Assist', 'Consumable Cover', 'Engine Safe Cover', 'Claim in last year',
    'Road Side Assistance', 'Key Loss', 'gst', 'Applicable Discount with NCB', 'Before GST Add-on GWP'
]

# Ensure all numeric columns are clean
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce').fillna(0)

# Custom aggregation logic for overlapping policies and field aggregation
aggregated_policies = []
for customer_id, group in data.groupby('CustomerID'):
    group = group.reset_index(drop=True)

    # Initialize variables for aggregation
    start_date, end_date = None, None
    total_premium, total_od, total_tp = 0, 0, 0
    fields_to_aggregate = {
        'Cleaned_Insured name': set(),
        'Reg no ': set(),
        'MANUFACTURER/Make': set(),
        'model': set(),
        'variant': set(),
        'Fuel Type': set(),
        'RTO Location ': set(),
        'Product name ': set(),
        'Product name  2': set(),
        'biztype': set(),
        'Renewal Type': set(),
        'age': set(),
        'Vehicle Segment': set(),
        'New Branch Name  2': set(),
        'Zone 2': set(),
        'state2': set()
    }
    numeric_aggregations = {key: 0.0 for key in numeric_columns}

    for _, row in group.iterrows():
        try:
            if start_date is None:  # Initialize the first period
                start_date = row['Policy Start Date']
                end_date = row['Policy End Date']
                total_premium = row['Total Premium Payable ']
                total_od = row['Total OD Premium']
                total_tp = row['Total TP Premium']

                # Collect initial field values
                for field in fields_to_aggregate:
                    if pd.notna(row[field]):
                        fields_to_aggregate[field].add(str(row[field]))
                for key in numeric_aggregations:
                    numeric_aggregations[key] += row[key]

            else:
                # Check for overlap
                if row['Policy Start Date'] <= end_date:
                    # Extend the end date and aggregate numeric values
                    end_date = max(end_date, row['Policy End Date'])
                    total_premium += row['Total Premium Payable ']
                    total_od += row['Total OD Premium']
                    total_tp += row['Total TP Premium']
                else:
                    # Save the current period
                    aggregated_policies.append([
                        customer_id, start_date, end_date, total_premium, total_od, total_tp,
                        *[', '.join(fields_to_aggregate[field]) for field in fields_to_aggregate],
                        *[numeric_aggregations[key] for key in numeric_columns]
                    ])

                    # Reset for next period
                    start_date = row['Policy Start Date']
                    end_date = row['Policy End Date']
                    total_premium = row['Total Premium Payable ']
                    total_od = row['Total OD Premium']
                    total_tp = row['Total TP Premium']

                    # Reset fields
                    for field in fields_to_aggregate:
                        fields_to_aggregate[field] = {str(row[field])} if pd.notna(row[field]) else set()
                    for key in numeric_aggregations:
                        numeric_aggregations[key] = row[key]

        except Exception as e:
            print(f"Error processing row: {row.to_dict()}, Error: {e}")

    # Append the final period
    aggregated_policies.append([
        customer_id, start_date, end_date, total_premium, total_od, total_tp,
        *[', '.join(fields_to_aggregate[field]) for field in fields_to_aggregate],
        *[numeric_aggregations[key] for key in numeric_columns]
    ])

# Convert the aggregated results into a DataFrame
columns = [
    'CustomerID', 'Policy Start Date', 'Policy End Date', 'Total Premium Payable ',
    'Total OD Premium', 'Total TP Premium', *fields_to_aggregate.keys(), *numeric_columns
]
aggregated_policies_df = pd.DataFrame(aggregated_policies, columns=columns)

# Save the final aggregated data to a CSV file
aggregated_policies_df.to_csv('customer_aggregated_policies_check.csv', index=False)

# Output the final DataFrame
print(aggregated_policies_df)

In [1]:
import pandas as pd

data = pd.read_excel('cleaned_merged_base_data.xlsx')

In [2]:
# Convert 'Policy Start Date' and 'Policy End Date' to datetime
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
data['Policy End Date'] = pd.to_datetime(data['Policy End Date'])

# Convert relevant columns to numeric type
numeric_columns = [
    'Total Premium Payable ', 'Total OD Premium', 'Total TP Premium', 'NCB % Previous Year', 
    'Nil Depreciation', 'Passenger Assist', 'Consumable Cover', 'Engine Safe Cover', 
    'Claim in last year', 'Road Side Assistance', 'Key Loss', 'gst', 
    'Applicable Discount with NCB', 'Before GST Add-on GWP'
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce').fillna(0)

# Step 1: Identify overlapping policies and aggregate dates
aggregated_policies = []
for customer_id, group in data.groupby('CustomerID'):
    group = group.sort_values(by='Policy Start Date').reset_index(drop=True)

    # Initialize variables
    start_date, end_date = None, None
    policy_keys = []

    for _, row in group.iterrows():
        if start_date is None:
            # Initialize the first policy
            start_date = row['Policy Start Date']
            end_date = row['Policy End Date']
            policy_keys = [row.name]  # Keep track of the row indices
        else:
            if row['Policy Start Date'] <= end_date:
                # Extend the end date if overlapping
                end_date = max(end_date, row['Policy End Date'])
                policy_keys.append(row.name)
            else:
                # Non-overlapping policy: save the current aggregated period
                aggregated_policies.append({
                    'CustomerID': customer_id,
                    'Policy Start Date': start_date,
                    'Policy End Date': end_date,
                    'Policy Keys': policy_keys
                })
                # Reset for the next policy
                start_date = row['Policy Start Date']
                end_date = row['Policy End Date']
                policy_keys = [row.name]

    # Append the last aggregated period
    aggregated_policies.append({
        'CustomerID': customer_id,
        'Policy Start Date': start_date,
        'Policy End Date': end_date,
        'Policy Keys': policy_keys
    })

# Step 2: Aggregate fields based on the date ranges
aggregated_results = []
for policy in aggregated_policies:
    customer_id = policy['CustomerID']
    start_date = policy['Policy Start Date']
    end_date = policy['Policy End Date']
    policy_rows = data.loc[policy['Policy Keys']]

    aggregated_result = {
        'CustomerID': customer_id,
        'Policy Start Date': start_date,
        'Policy End Date': end_date,
        'Total Premium Payable ': policy_rows['Total Premium Payable '].sum(),
        'Total OD Premium': policy_rows['Total OD Premium'].sum(),
        'Total TP Premium': policy_rows['Total TP Premium'].sum()
    }

    # Add aggregated fields
    other_aggregations = policy_rows.agg({
        'Cleaned_Insured name': lambda x: ', '.join(x.dropna().unique()),
        'Reg no ': lambda x: ', '.join(x.dropna().unique()),
        'MANUFACTURER/Make': lambda x: ', '.join(x.dropna().unique()),
        'model': lambda x: ', '.join(x.dropna().unique()),
        'variant': lambda x: ', '.join(x.dropna().unique()),
        'Fuel Type': lambda x: ', '.join(x.dropna().unique()),
        'RTO Location ': lambda x: ', '.join(x.dropna().unique()),
        'Product name ': lambda x: ', '.join(x.dropna().unique()),
        'Product name  2': lambda x: ', '.join(x.dropna().unique()),
        'biztype': lambda x: ', '.join(x.dropna().unique()),
        'Renewal Type': lambda x: ', '.join(x.dropna().unique()),
        'age': lambda x: ', '.join(x.dropna().astype(str)),
        'Vehicle Segment': lambda x: ', '.join(x.dropna().unique()),
        'New Branch Name  2': lambda x: ', '.join(x.dropna().unique()),
        'Zone 2': lambda x: ', '.join(x.dropna().unique()),
        'state2': lambda x: ', '.join(x.dropna().unique()),
        'Before GST Add-on GWP': 'sum',
        'gst': 'sum',
        'Applicable Discount with NCB': 'mean',
        'NCB % Previous Year': 'mean',
        'Nil Depreciation': 'sum',
        'Passenger Assist': 'sum',
        'Consumable Cover': 'sum',
        'Engine Safe Cover': 'sum',
        'Road Side Assistance': 'sum',
        'Key Loss': 'sum',
        'Number of Claims': 'sum'
    }).to_dict()

    aggregated_result.update(other_aggregations)
    aggregated_results.append(aggregated_result)

# Convert aggregated results to a DataFrame
final_df = pd.DataFrame(aggregated_results)

# Save the final aggregated data to a CSV file
final_df.to_csv('customer_aggregated_policies_check.csv', index=False)

# Output the final DataFrame
print(final_df)

       CustomerID Policy Start Date Policy End Date  Total Premium Payable   \
0          C00001        2023-07-14      2024-07-13                     0.0   
1          C00002        2023-05-04      2024-05-03                     0.0   
2          C00003        2022-12-23      2023-12-22                     0.0   
3          C00004        2022-09-17      2023-09-16                     0.0   
4          C00005        2023-06-27      2024-06-26                     0.0   
...           ...               ...             ...                     ...   
781002     C99996        2023-12-30      2024-12-29                 15805.0   
781003     C99997        2022-10-01      2023-09-30                     0.0   
781004     C99998        2022-11-24      2023-11-23                     0.0   
781005     C99998        2023-11-24      2024-11-23                 15805.0   
781006     C99999        2023-06-02      2024-06-01                     0.0   

        Total OD Premium  Total TP Premium Cleaned_

In [3]:
import pandas as pd

data = pd.read_excel('cleaned_merged_base_data.xlsx')

In [6]:
# Convert 'Policy Start Date' and 'Policy End Date' to datetime
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
data['Policy End Date'] = pd.to_datetime(data['Policy End Date'])

# Extract Start Year from Policy Start Date
data['Start Year'] = data['Policy Start Date'].dt.year

# Map 'Policy Status' directly in the 'data' DataFrame
data['Policy Status'] = data['booked'].map(lambda x: 'Not Renewed' if x == 0 else 'Renewed' if x == 1 else 'Open')

# Convert specific columns to strings for concatenation
string_columns = [
    'Cleaned_Insured name', 'Reg no ', 'MANUFACTURER/Make', 'model', 'variant',
    'Fuel Type', 'RTO Location ', 'Product name ', 'Product name  2',
    'biztype', 'Renewal Type', 'age', 'Vehicle Segment', 'New Branch Name  2',
    'Zone 2', 'state2', 'Policy Status'
]
for col in string_columns:
    data[col] = data[col].astype(str)

# Step 1: Year-Wise Aggregation
yearly_aggregated = data.groupby(['CustomerID', 'Start Year']).agg({
    'Policy Start Date': 'min',
    'Policy End Date': 'max',
    'Total Premium Payable ': 'sum',
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Cleaned_Insured name': lambda x: ', '.join(x.dropna().unique()),
    'Reg no ': lambda x: ', '.join(x.dropna().unique()),
    'MANUFACTURER/Make': lambda x: ', '.join(x.dropna().unique()),
    'model': lambda x: ', '.join(x.dropna().unique()),
    'variant': lambda x: ', '.join(x.dropna().unique()),
    'Fuel Type': lambda x: ', '.join(x.dropna().unique()),
    'RTO Location ': lambda x: ', '.join(x.dropna().unique()),
    'Product name ': lambda x: ', '.join(x.dropna().unique()),
    'Product name  2': lambda x: ', '.join(x.dropna().unique()),
    'biztype': lambda x: ', '.join(x.dropna().unique()),
    'Renewal Type': lambda x: ', '.join(x.dropna().unique()),
    'age': lambda x: ', '.join(x.dropna().astype(str)),  # Ensure numeric ages are converted
    'Vehicle Segment': lambda x: ', '.join(x.dropna().unique()),
    'New Branch Name  2': lambda x: ', '.join(x.dropna().unique()),
    'Zone 2': lambda x: ', '.join(x.dropna().unique()),
    'state2': lambda x: ', '.join(x.dropna().unique()),
    'Before GST Add-on GWP': 'sum',
    'gst': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'Nil Depreciation': 'sum',
    'Passenger Assist': 'sum',
    'Consumable Cover': 'sum',
    'Engine Safe Cover': 'sum',
    'Road Side Assistance': 'sum',
    'Key Loss': 'sum',
    'Number of Claims': 'sum',
    'decline': lambda x: x[x == 'Yes'].count(),
    'Policy Status': lambda x: ', '.join(x.dropna().astype(str))  # Ensure status is string
}).reset_index()

# Step 2: Identify Overlapping Policies Across Years
aggregated_policies = []
for customer_id, group in yearly_aggregated.groupby('CustomerID'):
    group = group.sort_values(by='Start Year').reset_index(drop=True)
    
    # Initialize variables
    start_date, end_date = None, None
    combined_keys = []

    for _, row in group.iterrows():
        if start_date is None:
            # Initialize the first period
            start_date = row['Policy Start Date']
            end_date = row['Policy End Date']
            combined_keys = [row.name]
        else:
            if end_date == row['Policy Start Date']:
                # Extend the end date for overlapping periods
                end_date = max(end_date, row['Policy End Date'])
                combined_keys.append(row.name)
            else:
                # Save the aggregated period
                aggregated_policies.append({
                    'CustomerID': customer_id,
                    'Policy Start Date': start_date,
                    'Policy End Date': end_date,
                    'Policy Keys': combined_keys
                })
                # Reset for the next period
                start_date = row['Policy Start Date']
                end_date = row['Policy End Date']
                combined_keys = [row.name]

    # Append the last period
    aggregated_policies.append({
        'CustomerID': customer_id,
        'Policy Start Date': start_date,
        'Policy End Date': end_date,
        'Policy Keys': combined_keys
    })

# Step 3: Re-aggregate Fields Based on Combined Keys
final_aggregations = []
for policy in aggregated_policies:
    customer_id = policy['CustomerID']
    policy_rows = yearly_aggregated.iloc[policy['Policy Keys']]

    aggregated_result = {
        'CustomerID': customer_id,
        'Policy Start Date': policy['Policy Start Date'],
        'Policy End Date': policy['Policy End Date'] 
    }

    # Combine categorical fields
    other_aggregations = policy_rows.agg({
        'Total Premium Payable ': 'sum',
        'Total OD Premium': 'sum',
        'Total TP Premium': 'sum',
        'Cleaned_Insured name': lambda x: ', '.join(x.dropna().unique()),
        'Reg no ': lambda x: ', '.join(x.dropna().unique()),
        'MANUFACTURER/Make': lambda x: ', '.join(x.dropna().unique()),
        # Add all other fields as necessary
        'model': lambda x: ', '.join(x.dropna().unique()),
        'variant': lambda x: ', '.join(x.dropna().unique()),
        'Fuel Type': lambda x: ', '.join(x.dropna().unique()),
        'RTO Location ': lambda x: ', '.join(x.dropna().unique()),
        'Product name ': lambda x: ', '.join(x.dropna().unique()),
        'Product name  2': lambda x: ', '.join(x.dropna().unique()),
        'biztype': lambda x: ', '.join(x.dropna().unique()),
        'Renewal Type': lambda x: ', '.join(x.dropna().unique()),
        'age': lambda x: ', '.join(x.dropna().astype(str)),
        'Vehicle Segment': lambda x: ', '.join(x.dropna().unique()),
        'New Branch Name  2': lambda x: ', '.join(x.dropna().unique()),
        'Zone 2': lambda x: ', '.join(x.dropna().unique()),
        'state2': lambda x: ', '.join(x.dropna().unique()),
        'Before GST Add-on GWP': 'sum',
        'gst': 'sum',
        'Applicable Discount with NCB': 'mean',
        'NCB % Previous Year': 'mean',
        'Nil Depreciation': 'sum',
        'Passenger Assist': 'sum',
        'Consumable Cover': 'sum',
        'Engine Safe Cover': 'sum',
        'Road Side Assistance': 'sum',
        'Key Loss': 'sum',
        'Number of Claims': 'sum',
        'decline': lambda x: x[x == 'Yes'].count(),
        'Policy Status': lambda x: ', '.join(x.dropna().astype(str))
    }).to_dict()

    aggregated_result.update(other_aggregations)
    final_aggregations.append(aggregated_result)

# Convert to final DataFrame
final_df = pd.DataFrame(final_aggregations)

# Calculate metrics for Renewed and Not Renewed policies
renewed_data = data[data['Policy Status'] == 'Renewed']
not_renewed_data = data[data['Policy Status'] == 'Not Renewed']

renewed_metrics = renewed_data.groupby(['CustomerID', 'Start Year']).agg({
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Total Premium Payable ': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'gst': 'sum',
    'Before GST Add-on GWP': 'sum'
}).rename(columns={
    'Total OD Premium': 'Total OD Premium (Renewed)',
    'Total TP Premium': 'Total TP Premium (Renewed)',
    'Total Premium Payable ': 'Total Premium (Renewed)',
    'Applicable Discount with NCB': 'Average Discount (Renewed)',
    'NCB % Previous Year': 'Average NCB % Previous (Renewed)',
    'gst': 'Total GST (Renewed)',
    'Before GST Add-on GWP': 'Total Add on Premium (Renewed)'
}).reset_index()

not_renewed_metrics = not_renewed_data.groupby(['CustomerID', 'Start Year']).agg({
    'Total OD Premium': 'sum',
    'Total TP Premium': 'sum',
    'Total Premium Payable ': 'sum',
    'Applicable Discount with NCB': 'mean',
    'NCB % Previous Year': 'mean',
    'gst': 'sum',
    'Before GST Add-on GWP': 'sum'
}).rename(columns={
    'Total OD Premium': 'Total OD Premium (Not Renewed)',
    'Total TP Premium': 'Total TP Premium (Not Renewed)',
    'Total Premium Payable ': 'Total Premium (Not Renewed)',
    'Applicable Discount with NCB': 'Average Discount (Not Renewed)',
    'NCB % Previous Year': 'Average NCB % Previous (Not Renewed)',
    'gst': 'Total GST (Not Renewed)',
    'Before GST Add-on GWP': 'Total Add on Premium (Not Renewed)'
}).reset_index()

# Merge Renewed and Not Renewed metrics with the final DataFrame
final_df = pd.merge(final_df, renewed_metrics, on=['CustomerID', 'Start Year'], how='left')
final_df = pd.merge(final_df, not_renewed_metrics, on=['CustomerID', 'Start Year'], how='left')

# Save the results
final_df.to_csv('customer_aggregated_final check.csv', index=False)

# Output the final DataFrame
print(final_df)

KeyError: 'Start Year'