In [1]:
import pandas as pd

# Load the yearly customer/policy data from the Excel workbook into df.
df = pd.read_excel(io='customer_aggregated_yearly_data.xlsx')

In [None]:
import pandas as pd
from datetime import timedelta

# Parse the policy start/end columns as day-month-year dates; values that
# cannot be parsed are coerced to NaT instead of raising.
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%d-%m-%Y', errors='coerce')

# Rows whose dates failed to parse are not dropped; they stay in df with NaT.

# Order rows by customer and then by policy start date so that each customer's
# policies appear chronologically.
sort_keys = ['CustomerID', 'Policy Start Date']
df = df.sort_values(by=sort_keys)

def _covered_days(group):
    """Return the total number of days covered by one customer's policies.

    Policies are walked in start-date order while tracking the latest end date
    covered so far. A policy that starts on or before the day after that date
    is treated as a continuation and contributes only the days by which it
    extends the covered period (nothing if it ends earlier). Any other policy
    contributes its full start-to-end span.

    Policies whose start or end date is missing (NaT) are skipped, so a
    customer without any usable policy gets 0 days.

    Args:
        group: DataFrame with the rows of a single customer, holding
            'Policy Start Date' and 'Policy End Date' datetime columns.

    Returns:
        The covered duration in whole days, as an int.
    """
    ordered = group.sort_values(by='Policy Start Date')
    total_days = 0
    covered_end = None

    for start, end in zip(ordered['Policy Start Date'], ordered['Policy End Date']):
        if pd.isna(start) or pd.isna(end):
            # A NaT date would turn the running total into NaN, which cannot
            # be rounded to a month count, so such policies are left out.
            continue

        continues_coverage = (
            covered_end is not None
            and start <= covered_end + timedelta(days=1)
        )
        if continues_coverage:
            # Compare against the latest end seen so far, not just the previous
            # row: a policy nested inside an earlier, longer one must not
            # subtract days or shorten the covered period.
            if end > covered_end:
                total_days += (end - covered_end).days
                covered_end = end
        else:
            total_days += (end - start).days
            covered_end = end

    return total_days


# Build one tenure record per customer.
customer_tenure = []
for customer_id, group in df.groupby('CustomerID'):
    # 30.44 approximates the average length of a month in days (365.25 / 12).
    tenure_months = round(_covered_days(group) / 30.44)
    customer_tenure.append({'CustomerID': customer_id, 'TenureMonths': tenure_months})

# Create a DataFrame for tenure and merge it back onto every policy row.
# The columns are named explicitly so the frame still has a 'CustomerID' key
# when no customers were found.
tenure_df = pd.DataFrame(customer_tenure, columns=['CustomerID', 'TenureMonths'])
df = df.merge(tenure_df, on='CustomerID', how='left')

# Extract the calendar year of each policy's start and end date.
# NOTE(review): these columns (and FirstPolicyYear) presumably become float
# (e.g. 2020.0) when any date is NaT; confirm whether consumers of the output
# expect integer years.
df['Start_Year'] = df['Policy Start Date'].dt.year
df['End_Year'] = df['Policy End Date'].dt.year

# Earliest policy start year per customer, repeated on each of their rows.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start_Year'].transform('min')

# Tag the rows that fall in a customer's first policy year with
# "<first year>_<customer id>"; every other row gets an empty string.
# The year goes through the nullable Int64 dtype so it is rendered as "2020"
# even when missing dates have made the year columns float ("2020.0").
# Rows with a missing year never compare equal and therefore stay blank.
is_first_year = df['Start_Year'] == df['FirstPolicyYear']
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
customer_text = df['CustomerID'].astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + customer_text).where(is_first_year, '')

# Write the enriched data to CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf="Updated_customer_aggregated_yearly_data.csv", index=False)