In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Map BOOKED values to Policy Status
# Work on trimmed strings and fold float-looking flags ('0.0', '1.0') into
# '0' / '1' before the lookup; anything not in the map becomes NaN.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Calculate policy tenure in months
# (calendar-month difference between end and start; the day of month is ignored)
df['Policy Tenure Month'] = ((df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year) * 12 +
                             (df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month))

# Calculate policy tenure in years (rounded)
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# Calculate cumulative tenure in months for each customer
# The running total is accumulated in chronological order.  The source query
# has no ORDER BY, so summing in raw row order would make the result depend on
# whatever order the database happened to return.  The sort is stable and the
# result is aligned back to df by index, so df's own row order is untouched.
chronological = df.sort_values(['CustomerID', 'Policy Start Date'], kind='mergesort')
df['Cumulative Tenure (Months)'] = (
    chronological.groupby('CustomerID')['Policy Tenure Month'].cumsum()
)

# Calculate tenure in decimal and rounded customer tenure
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12
df['Customer Tenure'] = df['Tenure Decimal'].round(0)

# Extract the year from start and end dates
df['Start Year'] = df['Policy Start Date'].dt.year
df['End Year'] = df['Policy End Date'].dt.year

# Identify year-wise new customers
# A row counts as "new customer" when its start year equals the earliest start
# year recorded for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'].eq(df['FirstPolicyYear'])  # NaN never compares equal

# Generate the year-wise new customer column: "<first year>_<CustomerID>",
# blank on rows from later years.  The year is cast to a nullable integer
# first: if any start date failed to parse, the year columns are float and
# would otherwise be rendered as e.g. "2020.0".
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + df['CustomerID']).where(is_first_year, '')

# Add New Customers column
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Calculate Renewal Rate Status
# Every row starts as 'Null'.  Within each (CustomerID, Policy No) group the
# rows are ordered by start date, and each row after the first is labelled by
# how its 'Total Premium Payable' compares with the row just before it.
df['Renewal Rate Status'] = 'Null'  # Default
for _, policy_rows in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_rows.sort_values(by=['Policy Start Date'])
    premiums = ordered['Total Premium Payable'].tolist()
    row_labels = ordered.index.tolist()

    # Pair every row from the second onwards with its predecessor.
    for label, earlier, later in zip(row_labels[1:], premiums, premiums[1:]):
        if later > earlier:
            outcome = 'Increase'
        elif later < earlier:
            outcome = 'Decrease'
        else:
            # Neither greater nor smaller: equal premiums (or e.g. NaN).
            outcome = 'No Change'
        df.loc[label, 'Renewal Rate Status'] = outcome

# Function to calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct value in ``group``, else 'No'.

    ``group`` must provide ``.unique()`` (for example a pandas Series of
    policy statuses).  An empty group yields 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Apply churn status year-wise for each customer
# Every row in a (CustomerID, End Year) group receives that group's label.
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Map BOOKED values to Policy Status
# Work on trimmed strings and fold float-looking flags ('0.0', '1.0') into
# '0' / '1' before the lookup; anything not in the map becomes NaN.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Calculate Policy Tenure (Months) for each individual policy
# (calendar-month difference between end and start; the day of month is ignored)
df['Policy Tenure Month'] = ((df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year) * 12 +
                             (df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month))

# Step 3: Calculate year-wise tenure for each customer
df['Start Year'] = df['Policy Start Date'].dt.year

# One row per (CustomerID, Start Year): earliest start and latest end among the
# policies that began in that year.  groupby sorts its keys, so each customer's
# years come out in ascending order.
yearly_tenure = df.groupby(['CustomerID', 'Start Year']).agg(
    MinStartDate=('Policy Start Date', 'min'),
    MaxEndDate=('Policy End Date', 'max')
).reset_index()

# Calculate tenure in months for each year
yearly_tenure['Yearly Tenure (Months)'] = ((yearly_tenure['MaxEndDate'].dt.year - yearly_tenure['MinStartDate'].dt.year) * 12 +
                                           (yearly_tenure['MaxEndDate'].dt.month - yearly_tenure['MinStartDate'].dt.month))

# Step 4: Calculate cumulative tenure for each customer
# The running total is taken on the one-row-per-year table, so each year is
# counted once and the total carries over from one year to the next.  Summing
# on the merged rows grouped by (CustomerID, Start Year) instead would re-add
# the same yearly value for every policy in a year and restart every year.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Step 5: Map yearly and cumulative tenure back to the original dataframe
df = df.merge(
    yearly_tenure[['CustomerID', 'Start Year', 'Yearly Tenure (Months)', 'Cumulative Tenure (Months)']],
    on=['CustomerID', 'Start Year'], how='left')

# Step 6: Calculate tenure in years (decimal and rounded)
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12
df['Customer Tenure'] = df['Tenure Decimal'].round(0)

# Extract the year from start and end dates for other calculations
df['End Year'] = df['Policy End Date'].dt.year

# Identify year-wise new customers
# A row counts as "new customer" when its start year equals the earliest start
# year recorded for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'].eq(df['FirstPolicyYear'])  # NaN never compares equal

# Generate the year-wise new customer column: "<first year>_<CustomerID>",
# blank on rows from later years.  The year is cast to a nullable integer
# first: if any start date failed to parse, the year columns are float and
# would otherwise be rendered as e.g. "2020.0".
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + df['CustomerID']).where(is_first_year, '')

# Add New Customers column
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 7: Calculate Renewal Rate Status
# Every row starts as 'Null'.  Within each (CustomerID, Policy No) group the
# rows are ordered by start date, and each row after the first is labelled by
# how its 'Total Premium Payable' compares with the row just before it.
df['Renewal Rate Status'] = 'Null'  # Default
for _, policy_rows in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_rows.sort_values(by=['Policy Start Date'])
    premiums = ordered['Total Premium Payable'].tolist()
    row_labels = ordered.index.tolist()

    # Pair every row from the second onwards with its predecessor.
    for label, earlier, later in zip(row_labels[1:], premiums, premiums[1:]):
        if later > earlier:
            outcome = 'Increase'
        elif later < earlier:
            outcome = 'Decrease'
        else:
            # Neither greater nor smaller: equal premiums (or e.g. NaN).
            outcome = 'No Change'
        df.loc[label, 'Renewal Rate Status'] = outcome

# Function to calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct value in ``group``, else 'No'.

    ``group`` must provide ``.unique()`` (for example a pandas Series of
    policy statuses).  An empty group yields 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Apply churn status year-wise for each customer
# Every row in a (CustomerID, End Year) group receives that group's label.
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Map BOOKED values to Policy Status
# Work on trimmed strings and fold float-looking flags ('0.0', '1.0') into
# '0' / '1' before the lookup; anything not in the map becomes NaN.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Calculate Policy Tenure (Months) for each individual policy
# (calendar-month difference between end and start; the day of month is ignored)
df['Policy Tenure Month'] = ((df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year) * 12 +
                             (df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month))

# Step 3: Add Start Year for grouping
df['Start Year'] = df['Policy Start Date'].dt.year

# Step 4: Deduplicate yearly tenure by taking the maximum tenure for each year
df['Yearly Tenure (Months)'] = df.groupby(['CustomerID', 'Start Year'])['Policy Tenure Month'].transform('max')

# Step 5: Calculate cumulative tenure for each customer based on Yearly Tenure
# Collapse to one row per (CustomerID, Start Year) so every year is counted
# exactly once, take the running total per customer (groupby returns the years
# in ascending order), then merge it back on the keys.  Merging by key keeps
# every value on the row it belongs to whatever order the rows were loaded in;
# a list built in group order and assigned positionally would not.
yearly_totals = (
    df.groupby(['CustomerID', 'Start Year'])['Yearly Tenure (Months)']
    .first()
    .reset_index()
)
yearly_totals['Cumulative Tenure (Months)'] = (
    yearly_totals.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)
df = df.merge(
    yearly_totals[['CustomerID', 'Start Year', 'Cumulative Tenure (Months)']],
    on=['CustomerID', 'Start Year'],
    how='left',  # rows without a start year keep a missing cumulative tenure
)

# Step 6: Calculate Tenure Decimal and Customer Tenure
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12
df['Customer Tenure'] = df['Tenure Decimal'].round(2)

# Step 7: Identify year-wise new customers
# A row counts as "new customer" when its start year equals the earliest start
# year recorded for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'].eq(df['FirstPolicyYear'])  # NaN never compares equal

# Generate the year-wise new customer column: "<first year>_<CustomerID>",
# blank on rows from later years.  The year is cast to a nullable integer
# first: if any start date failed to parse, the year columns are float and
# would otherwise be rendered as e.g. "2020.0".
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + df['CustomerID']).where(is_first_year, '')

# Add New Customers column
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 8: Calculate Renewal Rate Status
# Every row starts as 'Null'.  Within each (CustomerID, Policy No) group the
# rows are ordered by start date, and each row after the first is labelled by
# how its 'Total Premium Payable' compares with the row just before it.
df['Renewal Rate Status'] = 'Null'  # Default
for _, policy_rows in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_rows.sort_values(by=['Policy Start Date'])
    premiums = ordered['Total Premium Payable'].tolist()
    row_labels = ordered.index.tolist()

    # Pair every row from the second onwards with its predecessor.
    for label, earlier, later in zip(row_labels[1:], premiums, premiums[1:]):
        if later > earlier:
            outcome = 'Increase'
        elif later < earlier:
            outcome = 'Decrease'
        else:
            # Neither greater nor smaller: equal premiums (or e.g. NaN).
            outcome = 'No Change'
        df.loc[label, 'Renewal Rate Status'] = outcome

# Step 9: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct value in ``group``, else 'No'.

    ``group`` must provide ``.unique()`` (for example a pandas Series of
    policy statuses).  An empty group yields 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Every row in a (CustomerID, Start Year) group receives that group's churn label.
status_by_customer_year = df.groupby(['CustomerID', 'Start Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Map BOOKED values to Policy Status
# Work on trimmed strings and fold float-looking flags ('0.0', '1.0') into
# '0' / '1' before the lookup; anything not in the map becomes NaN.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Calculate Policy Tenure (Months) for each individual policy
# (calendar-month difference between end and start; the day of month is ignored)
df['Policy Tenure Month'] = ((df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year) * 12 +
                             (df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month))

# Extract start year
df['Start Year'] = df['Policy Start Date'].dt.year

# Group by CustomerID and Start Year to get the earliest start date and the
# latest end date among the policies that began in each year.  groupby sorts
# its keys, so each customer's years come out in ascending order.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)

# Calculate Yearly Tenure in Months
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['Policy End Date'].dt.year - yearly_tenure['Policy Start Date'].dt.year) * 12 +
    (yearly_tenure['Policy End Date'].dt.month - yearly_tenure['Policy Start Date'].dt.month)
)

# Running total per customer, taken on the one-row-per-year table so that a
# year with several policies is counted once rather than once per policy row.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Merge yearly and cumulative tenure back to the original dataframe.
# how='left' keeps rows whose start date could not be parsed (they have no
# yearly row and get missing tenure) instead of silently discarding them.
df = df.merge(yearly_tenure, on=['CustomerID', 'Start Year'], how='left', suffixes=('', '_Yearly'))

# Calculate Tenure Decimal and Customer Tenure
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12  # Convert months to years
df['Customer Tenure'] = df['Tenure Decimal'].round(2)  # Round tenure to 2 decimal places

# Step 7: Identify year-wise new customers
# A row counts as "new customer" when its start year equals the earliest start
# year recorded for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'].eq(df['FirstPolicyYear'])  # NaN never compares equal

# Generate the year-wise new customer column: "<first year>_<CustomerID>",
# blank on rows from later years.  The year is cast to a nullable integer
# first: if any start date failed to parse, the year columns are float and
# would otherwise be rendered as e.g. "2020.0".
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + df['CustomerID']).where(is_first_year, '')

# Add New Customers column
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 8: Calculate Renewal Rate Status
# Every row starts as 'Null'.  Within each (CustomerID, Policy No) group the
# rows are ordered by start date, and each row after the first is labelled by
# how its 'Total Premium Payable' compares with the row just before it.
df['Renewal Rate Status'] = 'Null'  # Default
for _, policy_rows in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_rows.sort_values(by=['Policy Start Date'])
    premiums = ordered['Total Premium Payable'].tolist()
    row_labels = ordered.index.tolist()

    # Pair every row from the second onwards with its predecessor.
    for label, earlier, later in zip(row_labels[1:], premiums, premiums[1:]):
        if later > earlier:
            outcome = 'Increase'
        elif later < earlier:
            outcome = 'Decrease'
        else:
            # Neither greater nor smaller: equal premiums (or e.g. NaN).
            outcome = 'No Change'
        df.loc[label, 'Renewal Rate Status'] = outcome

# Step 9: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct value in ``group``, else 'No'.

    ``group`` must provide ``.unique()`` (for example a pandas Series of
    policy statuses).  An empty group yields 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Every row in a (CustomerID, Start Year) group receives that group's churn label.
status_by_customer_year = df.groupby(['CustomerID', 'Start Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)


# Convert dates to datetime using the day/month/two-digit-year format
# (values that do not match become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%d/%m/%y', errors='coerce')

# Map BOOKED values to Policy Status
# Work on trimmed strings and fold float-looking flags ('0.0', '1.0') into
# '0' / '1' before the lookup; anything not in the map becomes NaN.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Calculate Policy Tenure (Months) for each individual policy
# (calendar-month difference between end and start; the day of month is ignored)
df['Policy Tenure Month'] = ((df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year) * 12 +
                             (df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month))

# Extract Start Year for grouping
df['Start Year'] = df['Policy Start Date'].dt.year

# Extract the year from start and end dates for other calculations
df['End Year'] = df['Policy End Date'].dt.year

# Step 3: Group by CustomerID and Start Year to get the earliest start date and
# the latest end date among the policies that began in each year.  groupby
# sorts its keys, so each customer's years come out in ascending order.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)

# Step 4: Calculate Yearly Tenure in Months
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['Policy End Date'].dt.year - yearly_tenure['Policy Start Date'].dt.year) * 12 +
    (yearly_tenure['Policy End Date'].dt.month - yearly_tenure['Policy Start Date'].dt.month)
)

# Step 5: Running total per customer, taken on the one-row-per-year table so
# that a year with several policies is counted once rather than once per row.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Step 6: Merge yearly and cumulative tenure back to the original dataframe.
# how='left' keeps rows whose start date could not be parsed (they have no
# yearly row and get missing tenure) instead of silently discarding them.
df = df.merge(yearly_tenure, on=['CustomerID', 'Start Year'], how='left', suffixes=('', '_Yearly'))

# Step 7: Calculate Tenure Decimal and Customer Tenure
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12  # Convert months to years
df['Customer Tenure'] = df['Tenure Decimal'].round(2)  # Round tenure to 2 decimal places

# Step 8: Add New Customers column
# A row counts as "new customer" when its start year equals the earliest start
# year recorded for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
is_first_year = df['Start Year'].eq(df['FirstPolicyYear'])  # NaN never compares equal

# New_Customer_ID is "<first year>_<CustomerID>", blank on rows from later
# years.  The year is cast to a nullable integer first: if any start date
# failed to parse, the year columns are float and would otherwise be rendered
# as e.g. "2020.0".
first_year_text = df['FirstPolicyYear'].astype('Int64').astype(str)
df['New_Customer_ID'] = (first_year_text + '_' + df['CustomerID']).where(is_first_year, '')
df['New Customers'] = is_first_year.map({True: 'Yes', False: 'No'})

# Step 9: Calculate Renewal Rate Status
# Every row starts as 'Null'.  Within each (CustomerID, Policy No) group the
# rows are ordered by start date, and each row after the first is labelled by
# how its 'Total Premium Payable' compares with the row just before it.
df['Renewal Rate Status'] = 'Null'  # Default
for _, policy_rows in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_rows.sort_values(by=['Policy Start Date'])
    premiums = ordered['Total Premium Payable'].tolist()
    row_labels = ordered.index.tolist()

    # Pair every row from the second onwards with its predecessor.
    for label, earlier, later in zip(row_labels[1:], premiums, premiums[1:]):
        if later > earlier:
            outcome = 'Increase'
        elif later < earlier:
            outcome = 'Decrease'
        else:
            # Neither greater nor smaller: equal premiums (or e.g. NaN).
            outcome = 'No Change'
        df.loc[label, 'Renewal Rate Status'] = outcome

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct value in ``group``, else 'No'.

    ``group`` must provide ``.unique()`` (for example a pandas Series of
    policy statuses).  An empty group yields 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# Every row in a (CustomerID, End Year) group receives that group's churn label.
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF(final)'  # Target table name
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Step 2: Extract Start Year for grouping
df['Start Year'] = df['Policy Start Date'].dt.year

# Step 3: Group by CustomerID and Start Year to get the earliest start date and
# the latest end date among the policies that began in each year.  groupby
# sorts its keys, so each customer's years come out in ascending order.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)

# Step 4: Calculate Yearly Tenure (Months)
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['Policy End Date'].dt.year - yearly_tenure['Policy Start Date'].dt.year) * 12 +
    (yearly_tenure['Policy End Date'].dt.month - yearly_tenure['Policy Start Date'].dt.month)
)

# Step 5: Running total per customer, taken on the one-row-per-year table so
# that a year with several policies is counted once rather than once per row.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Step 6: Merge yearly and cumulative tenure back to the original dataframe.
# how='left' keeps rows whose start date could not be parsed (they have no
# yearly row and get missing tenure) instead of silently discarding them.
df = df.merge(yearly_tenure, on=['CustomerID', 'Start Year'], how='left', suffixes=('', '_Yearly'))

# Step 7: Calculate Tenure Decimal and Customer Tenure
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12  # Convert months to years
df['Customer Tenure'] = df.groupby('CustomerID')['Tenure Decimal'].transform('max')  # Assign the max cumulative tenure per CustomerID

df['Customer Tenure'] = df['Customer Tenure'].round(0)

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF(final)'
df.to_sql(processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Step 2: Extract Start Year for grouping
df['Start Year'] = df['Policy Start Date'].dt.year

# Step 3: Group by CustomerID and Start Year to get the earliest start date and
# the latest end date among the policies that began in each year.  groupby
# sorts its keys, so each customer's years come out in ascending order.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)

# Step 4: Calculate Yearly Tenure (Months)
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['Policy End Date'].dt.year - yearly_tenure['Policy Start Date'].dt.year) * 12 +
    (yearly_tenure['Policy End Date'].dt.month - yearly_tenure['Policy Start Date'].dt.month)
)

# Step 5: Running total per customer, year by year.  It is taken per
# CustomerID on the one-row-per-year table: a cumsum applied to the merged
# frame as a whole would keep adding across different customers and count a
# year once for every policy row in it.
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Step 6: Merge yearly and cumulative tenure back to the original dataframe.
# how='left' keeps rows whose start date could not be parsed (they have no
# yearly row and get missing tenure) instead of silently discarding them.
df = df.merge(yearly_tenure, on=['CustomerID', 'Start Year'], how='left', suffixes=('', '_Yearly'))

# Step 7: Calculate Tenure Decimal and Customer Tenure
# Both values are already identical on every row of a (CustomerID, Start Year)
# pair -- Tenure Decimal comes from the per-year table and Customer Tenure is a
# per-customer maximum -- so no per-year de-duplication pass is needed.
df['Tenure Decimal'] = df['Cumulative Tenure (Months)'] / 12  # Convert months to years
df['Customer Tenure'] = df.groupby('CustomerID')['Tenure Decimal'].transform('max')  # Assign the max cumulative tenure per CustomerID

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF(final)'
df.to_sql(processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# Settings come from the standard libpq environment variables (PGHOST,
# PGDATABASE, PGUSER, PGPASSWORD, PGPORT) when they are set; the literals are
# only fallbacks so the notebook keeps running unchanged.
# NOTE(review): a real password is committed here as the fallback -- rotate it
# and remove the default once PGPASSWORD is provisioned.
import os
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode the credentials so characters such as '@', ':' or '/' in a
# user name or password cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``str(name)`` lower-cased, with every character outside a-z, A-Z, 0-9 removed."""
    text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', text)
    return alphanumeric_only.lower()

# Clean and generate CustomerID
# The customer key is the normalised insured name joined to the branch name;
# distinct keys are numbered consecutively starting at 1000001.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
branch_names = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_names
customer_numbers = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_numbers.astype(str)

# Convert dates to datetime (values that cannot be parsed become NaT)
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Step 2: Extract Start Year for grouping
df['Start Year'] = df['Policy Start Date'].dt.year

# Step 3: One row per (CustomerID, Start Year) holding the earliest start date
# and the latest end date of the policies that began in that year.
customer_year_groups = df.groupby(['CustomerID', 'Start Year'])
yearly_tenure = customer_year_groups.agg(
    {'Policy Start Date': 'min', 'Policy End Date': 'max'}
).reset_index()

# Yearly Tenure (Months): calendar-month span from first start to last end
first_start = yearly_tenure['Policy Start Date']
last_end = yearly_tenure['Policy End Date']
year_gap = last_end.dt.year - first_start.dt.year
month_gap = last_end.dt.month - first_start.dt.month
yearly_tenure['Yearly Tenure (Months)'] = year_gap * 12 + month_gap

# Step 4: Running total of the yearly tenure within each customer
months_by_customer = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
yearly_tenure['Cumulative Tenure (Months)'] = months_by_customer.cumsum()

# Express the running total in years, plus a two-decimal rounded copy
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(2)

# Keep only the join keys and the tenure columns for mapping back
tenure_columns = ['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']
tenure_mapping = yearly_tenure[tenure_columns]

# Step 5: Attach the per-year tenure figures to every policy row
df = df.merge(tenure_mapping, how='left', on=['CustomerID', 'Start Year'])

# Save the processed data into PostgreSQL (an existing table is replaced)
processed_table_name = 'overall_policy_level_data_EF(final)'
df.to_sql(name=processed_table_name, con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# NOTE(review): credentials are hard-coded in this notebook; consider loading
# them from the environment or a secrets store before sharing the file.
from urllib.parse import quote

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the URL; purely alphanumeric values are left unchanged.
connection_string = (
    f"postgresql://{quote(db_config['user'], safe='')}:{quote(db_config['password'], safe='')}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load the cleaned policy-level table from PostgreSQL into a DataFrame
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``name`` as lower-case text containing only ASCII letters and digits.

    The argument is converted with ``str()`` first, so non-string values
    (for example ``None`` or numbers) are accepted and cleaned as text.
    """
    as_text = str(name)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', as_text)
    return alphanumeric_only.lower()

# Derive a per-customer key from the normalised insured name plus the branch
df['Cleaned_Insured name'] = df['Insured name'].map(clean_name)
name_part = df['Cleaned_Insured name'].astype(str)
branch_part = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = name_part + '_' + branch_part
# Number the distinct keys starting at 1000001 and store the ID as text
df['CustomerID'] = df.groupby('CustomerID_Base').ngroup().add(1000001).astype(str)

# Parse both policy dates; values that cannot be parsed become NaT
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Normalise 'booked' to trimmed text ('0.0' -> '0', '1.0' -> '1') and map it
# to a policy status; any other value ends up as a missing status
df['booked'] = df['booked'].astype(str).str.strip().replace({'0.0': '0', '1.0': '1'})

policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Per-policy tenure in whole calendar months (day of month is ignored)
policy_start = df['Policy Start Date']
policy_end = df['Policy End Date']
df['Policy Tenure Month'] = (
    (policy_end.dt.year - policy_start.dt.year) * 12
    + (policy_end.dt.month - policy_start.dt.month)
)

# The same tenure expressed in years, rounded to a whole number
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# Calendar years of the start and end dates, used as grouping keys later
df['Start Year'] = policy_start.dt.year
df['End Year'] = policy_end.dt.year

# Step 3: One row per (CustomerID, Start Year) holding the earliest start
# date and the latest end date among that year's policies
yearly_tenure = df.groupby(['CustomerID', 'Start Year'], as_index=False).agg(
    {'Policy Start Date': 'min', 'Policy End Date': 'max'}
)

# Yearly tenure in whole calendar months (day of month is ignored)
span_start = yearly_tenure['Policy Start Date']
span_end = yearly_tenure['Policy End Date']
year_gap = span_end.dt.year - span_start.dt.year
month_gap = span_end.dt.month - span_start.dt.month
yearly_tenure['Yearly Tenure (Months)'] = year_gap * 12 + month_gap

# Step 4: Running total of months per customer, in the row order produced by the groupby
monthly_by_customer = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
yearly_tenure['Cumulative Tenure (Months)'] = monthly_by_customer.cumsum()

# Express the running total in years; Customer Tenure is rounded to whole years
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Columns that are attached back onto the policy-level rows
tenure_columns = ['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']
tenure_mapping = yearly_tenure[tenure_columns]

# Step 5: Left-join so every policy row is kept, with or without a tenure match
df = pd.merge(df, tenure_mapping, how='left', on=['CustomerID', 'Start Year'])

# Step 8: Add New Customers column
# A customer's first policy year is the smallest Start Year seen for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
# Rows that start in the customer's first year get "<first year>_<CustomerID>";
# every other row (including rows without a start year) gets ''.
# Built column-wise instead of with DataFrame.apply(axis=1): it avoids creating
# one Series per row and also works when the frame has no rows.
df['New_Customer_ID'] = [
    f"{first_year}_{customer}" if start_year == first_year else ''
    for start_year, first_year, customer
    in zip(df['Start Year'], df['FirstPolicyYear'], df['CustomerID'])
]
# 'Yes' exactly where a New_Customer_ID was produced
df['New Customers'] = df['New_Customer_ID'].ne('').map({True: 'Yes', False: 'No'})

# Step 9: Calculate Renewal Rate Status
# Every row starts as 'Null'; within each (CustomerID, Policy No) group the
# rows after the first are labelled by comparing their premium with the
# premium of the row just before them in start-date order.
df['Renewal Rate Status'] = 'Null'
for _, policy_history in df.groupby(['CustomerID', 'Policy No']):
    ordered = policy_history.sort_values(by=['Policy Start Date'])

    for position in range(1, len(ordered)):
        later_row = ordered.iloc[position]
        earlier_row = ordered.iloc[position - 1]
        later_premium = later_row['Total Premium Payable']
        earlier_premium = earlier_row['Total Premium Payable']

        if later_premium > earlier_premium:
            status = 'Increase'
        elif later_premium < earlier_premium:
            status = 'Decrease'
        else:
            status = 'No Change'
        # .name is the row's index label in df
        df.loc[later_row.name, 'Renewal Rate Status'] = status

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when every value in ``group`` is 'Not Renewed', else 'No'.

    ``group`` is a pandas Series of policy statuses. A group with no values,
    or with more than one distinct value, yields 'No'.
    """
    distinct = group.unique()
    if len(distinct) != 1:
        return 'No'
    return 'Yes' if distinct[0] == 'Not Renewed' else 'No'

# One churn label per (CustomerID, End Year), broadcast to every row of that group
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

# Persist the result, replacing any existing table of the same name
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, index=False, if_exists='replace')

In [1]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
# NOTE(review): credentials are hard-coded in this notebook; consider loading
# them from the environment or a secrets store before sharing the file.
from urllib.parse import quote

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the URL; purely alphanumeric values are left unchanged.
connection_string = (
    f"postgresql://{quote(db_config['user'], safe='')}:{quote(db_config['password'], safe='')}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load the cleaned policy-level table from PostgreSQL into a DataFrame
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Strip everything except ASCII letters and digits from ``str(name)`` and lower-case it."""
    kept = re.sub(r'[^a-zA-Z0-9]', '', str(name))
    lowered = kept.lower()
    return lowered

# Customer key = cleaned insured name + '_' + branch name
df['Cleaned_Insured name'] = df['Insured name'].apply(clean_name)
customer_key = df['Cleaned_Insured name'].astype(str) + '_' + df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = customer_key
# Consecutive group numbers offset to start at 1000001, stored as text
group_number = df.groupby('CustomerID_Base').ngroup()
df['CustomerID'] = (group_number + 1000001).astype(str)

# Policy dates as datetimes; unparseable values are coerced to NaT
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# 'booked' -> trimmed text with '0.0'/'1.0' collapsed to '0'/'1'
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

# Translate the booked flag into a status; values outside the map become missing
policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Per-policy tenure in whole calendar months (day of month is ignored)
years_apart = df['Policy End Date'].dt.year - df['Policy Start Date'].dt.year
months_apart = df['Policy End Date'].dt.month - df['Policy Start Date'].dt.month
df['Policy Tenure Month'] = years_apart * 12 + months_apart

# Tenure in years, rounded to a whole number
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# Start year, used as a grouping key for the tenure calculation
df['Start Year'] = df['Policy Start Date'].dt.year

# End year, used as a grouping key for the churn label
df['End Year'] = df['Policy End Date'].dt.year

# Step 3: Collapse policies to one row per (CustomerID, Start Year):
# earliest start date and latest end date within that year
customer_year_groups = df.groupby(['CustomerID', 'Start Year'])
yearly_tenure = customer_year_groups.agg(
    {'Policy Start Date': 'min', 'Policy End Date': 'max'}
).reset_index()

# Months covered by each customer-year span (calendar-month difference)
first_start = yearly_tenure['Policy Start Date']
last_end = yearly_tenure['Policy End Date']
yearly_tenure['Yearly Tenure (Months)'] = (
    (last_end.dt.year - first_start.dt.year) * 12
    + (last_end.dt.month - first_start.dt.month)
)

# Step 4: Accumulate the yearly months per customer
yearly_tenure['Cumulative Tenure (Months)'] = (
    yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
)

# Cumulative tenure in years: exact value and whole-year rounding
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Keep only the keys and the tenure figures for the join back
tenure_mapping = yearly_tenure[
    ['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']
]

# Step 5: Attach the tenure figures to every policy row (left join keeps all rows)
df = df.merge(tenure_mapping, how='left', on=['CustomerID', 'Start Year'])

# Step 8: Add New Customers column
# A customer's first policy year is the smallest Start Year seen for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
# Rows that start in the customer's first year get "<first year>_<CustomerID>";
# every other row (including rows without a start year) gets ''.
# Built column-wise instead of with DataFrame.apply(axis=1): it avoids creating
# one Series per row and also works when the frame has no rows.
df['New_Customer_ID'] = [
    f"{first_year}_{customer}" if start_year == first_year else ''
    for start_year, first_year, customer
    in zip(df['Start Year'], df['FirstPolicyYear'], df['CustomerID'])
]
# 'Yes' exactly where a New_Customer_ID was produced
df['New Customers'] = df['New_Customer_ID'].ne('').map({True: 'Yes', False: 'No'})

# Renewal Rate Status per (CustomerID, Policy No) group.
# Rows default to 'Null'; each row after the first (in start-date order) is
# labelled by comparing its premium with the premium of the preceding row.
df['Renewal Rate Status'] = 'Null'
for _, same_policy_rows in df.groupby(['CustomerID', 'Policy No']):
    chronological = same_policy_rows.sort_values(by='Policy Start Date', ascending=True)

    for row_number in range(1, len(chronological)):
        this_policy = chronological.iloc[row_number]
        prior_policy = chronological.iloc[row_number - 1]
        this_premium = this_policy['Total Premium Payable']
        prior_premium = prior_policy['Total Premium Payable']

        if this_premium > prior_premium:
            outcome = 'Increase'
        elif this_premium < prior_premium:
            outcome = 'Decrease'
        else:
            outcome = 'No Change'

        # Write the label onto the later row, addressed by its index label in df
        df.loc[this_policy.name, 'Renewal Rate Status'] = outcome

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Classify a Series of policy statuses as churned ('Yes') or not ('No').

    'Yes' is returned only when the Series has exactly one distinct value and
    that value is 'Not Renewed'.
    """
    seen = group.unique()
    only_not_renewed = len(seen) == 1 and seen[0] == 'Not Renewed'
    return 'Yes' if only_not_renewed else 'No'

# One churn label per (CustomerID, End Year), broadcast to every row of that group
statuses_per_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = statuses_per_customer_year.transform(calculate_churn_status)

# Save the processed data into PostgreSQL, replacing any existing table
processed_table_name = 'overall_policy_level_data_EF'  # Target table name
df.to_sql(name=processed_table_name, con=engine, index=False, if_exists='replace')

KeyboardInterrupt: 

In [3]:
# Join two words without a separator and print the result
a = "Enter"
b = "number"
c = "".join((a, b))
print(c)

Enternumber


In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine, text

# Database connection setup
# NOTE(review): host and password are placeholders here; supply real values
# from the environment or a secrets store rather than editing them in place.
from urllib.parse import quote

db_config = {
    'host': '####',
    'database': 'postgres',
    'user': 'postgres',
    'password': '#####',
    'port': '5432'
}
# Percent-encode user and password so characters such as '@', ':', '/' or '#'
# cannot corrupt the URL; purely alphanumeric values are left unchanged.
connection_string = (
    f"postgresql://{quote(db_config['user'], safe='')}:{quote(db_config['password'], safe='')}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load the cleaned policy-level table from PostgreSQL into a DataFrame
query = "SELECT * FROM overall_cleaned_policy_level_data;"
df = pd.read_sql(query, con=engine)

# Keep an untouched copy of the loaded rows; derived columns are added to df only
original_df = df.copy()

# Function to clean names
def clean_name(name):
    """Normalise a name for matching.

    ``name`` is converted with ``str()``, every character that is not an
    ASCII letter or digit is removed, and the remainder is lower-cased.
    """
    raw_text = str(name)
    return re.sub(r'[^a-zA-Z0-9]', '', raw_text).lower()

# Generate CustomerID on the working frame df; original_df is left untouched
df['Cleaned_Insured name'] = df['Insured name'].apply(clean_name)
df['CustomerID_Base'] = (df['Cleaned_Insured name'].astype(str) + '_' +
                         df['New Branch Name  2'].astype(str))
# Consecutive group numbers offset to start at 1000001, stored as text
df['CustomerID'] = (df.groupby('CustomerID_Base').ngroup() + 1000001).astype(str)

# Convert dates to datetime for mapping logic only (unparseable values become NaT)
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Extract Start and End Years
df['Start Year'] = df['Policy Start Date'].dt.year
df['End Year'] = df['Policy End Date'].dt.year

# The churn step at the end of this cell reads 'Policy Status', but this cell
# never creates it. Derive it from 'booked' the same way the earlier versions
# of this cell do, without overwriting 'booked', unless the loaded table
# already provides the column.
if 'Policy Status' not in df.columns:
    booked_text = df['booked'].astype(str).str.strip().replace({'0.0': '0', '1.0': '1'})
    df['Policy Status'] = booked_text.map({'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'})

# Step 1: Calculate Customer Tenure
# One row per (CustomerID, Start Year): earliest start date and latest end date
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)

# Calculate Yearly Tenure (Months) as a calendar-month difference
yearly_tenure['Yearly Tenure (Months)'] = (
    (yearly_tenure['Policy End Date'].dt.year - yearly_tenure['Policy Start Date'].dt.year) * 12 +
    (yearly_tenure['Policy End Date'].dt.month - yearly_tenure['Policy Start Date'].dt.month)
)

# Calculate Cumulative Tenure per customer, then express it in (rounded) years
yearly_tenure['Cumulative Tenure (Months)'] = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Map Customer Tenure to Original Data
# original_df was copied before the derived columns existed, so merging it
# directly on 'CustomerID'/'Start Year' raises KeyError. Carry the derived key
# and status columns over first (both frames share the same index), then merge.
tenure_mapping = yearly_tenure[['CustomerID', 'Start Year', 'Customer Tenure']]
derived_columns = ['CustomerID', 'Start Year', 'End Year', 'Policy Status']
df = (
    original_df
    .drop(columns=derived_columns, errors='ignore')
    .join(df[derived_columns])
    .merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')
)

# Step 2: Add New Customers column
# A customer's first policy year is the smallest Start Year seen for that CustomerID.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
# Rows that start in the customer's first year get "<first year>_<CustomerID>";
# every other row (including rows without a start year) gets ''.
# Built column-wise instead of with DataFrame.apply(axis=1): it avoids creating
# one Series per row and also works when the frame has no rows.
df['New_Customer_ID'] = [
    f"{first_year}_{customer}" if start_year == first_year else ''
    for start_year, first_year, customer
    in zip(df['Start Year'], df['FirstPolicyYear'], df['CustomerID'])
]
# 'Yes' exactly where a New_Customer_ID was produced
df['New Customers'] = df['New_Customer_ID'].ne('').map({True: 'Yes', False: 'No'})

# Group by CustomerID and Policy No to calculate Renewal Rate Status.
# Rows default to 'Null'; each row after the first (in start-date order) is
# labelled by comparing its premium with the premium of the preceding row.
df['Renewal Rate Status'] = 'Null'
for customer_id, group in df.groupby(['CustomerID', 'Policy No']):
    # Ensure the group is sorted by Policy Start Date
    group = group.sort_values(by='Policy Start Date', ascending=True)

    # Iterate through the sorted group to compare premiums
    for i in range(1, len(group)):  # Start from the second policy
        # The later row is the "current" policy and the row before it the
        # "previous" one. The two were swapped here, which inverted
        # Increase/Decrease and wrote the label onto the earlier row.
        current_policy = group.iloc[i]
        previous_policy = group.iloc[i - 1]

        # Compare Total Premium Payable values
        if current_policy['Total Premium Payable'] > previous_policy['Total Premium Payable']:
            df.loc[current_policy.name, 'Renewal Rate Status'] = 'Increase'
        elif current_policy['Total Premium Payable'] < previous_policy['Total Premium Payable']:
            df.loc[current_policy.name, 'Renewal Rate Status'] = 'Decrease'
        else:
            df.loc[current_policy.name, 'Renewal Rate Status'] = 'No Change'

# Step 3: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' if 'Not Renewed' is the single distinct status in ``group``, otherwise 'No'.

    ``group`` is expected to be a pandas Series; an empty Series gives 'No'.
    """
    statuses = group.unique()
    if len(statuses) == 1 and statuses[0] == 'Not Renewed':
        verdict = 'Yes'
    else:
        verdict = 'No'
    return verdict

# One churn label per (CustomerID, End Year), broadcast to every row of that group
df['Churn Label'] = df.groupby(['CustomerID', 'End Year'])['Policy Status'].transform(lambda x: calculate_churn_status(x))

# Save the final DataFrame into PostgreSQL
processed_table_name = 'overall_policy_level_data_EF'

# Save only the final data.
# Drop and reload inside ONE transaction on ONE connection: engine.begin()
# commits on success and rolls back on error, so the table is never left
# dropped without its replacement. The identifier is double-quoted because an
# unquoted name is folded to lower case by PostgreSQL and would not match the
# mixed-case table that to_sql creates.
with engine.begin() as connection:
    # Drop the table if it exists
    drop_query = f'DROP TABLE IF EXISTS "{processed_table_name}";'
    connection.execute(text(drop_query))  # Execute the drop statement
    print(f"Table {processed_table_name} dropped successfully.")

    # Load the new data into the table using the same connection/transaction
    df.to_sql(processed_table_name, con=connection, if_exists='replace', index=False)
    print(f"Data loaded into {processed_table_name} successfully.")
