In [1]:
import getpass
import os
import re
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# Settings are read from the PG* environment variables so that no secret is
# stored in the notebook. Host, database, user and port fall back to the local
# defaults; the password has no default and is prompted for when PGPASSWORD is
# not set.
# NOTE(review): a password was previously committed in this cell - rotate it.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
# The table name contains parentheses and a space, hence the double quotes.
query = 'SELECT * FROM public."overall_cleaned_policy_level_data(with prem)";'
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Return ``name`` reduced to its ASCII letters and digits, lower-cased.

    The value is converted with ``str()`` first, so non-string inputs are
    normalised through their text form (for example ``None`` becomes
    ``'none'``). Every character outside ``a-z``, ``A-Z`` and ``0-9`` is
    removed before lower-casing.
    """
    text_value = str(name)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', text_value)
    return alnum_only.lower()

# Clean and generate CustomerID
# The grouping key is "<cleaned insured name>_<branch>"; each distinct key is
# numbered with groupby().ngroup() and offset by 1000001, and the resulting ID
# is stored as text.
cleaned_names = df['Insured name'].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names

branch_text = df['New Branch Name  2'].astype(str)
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + branch_text

customer_group_no = df.groupby('CustomerID_Base').ngroup()
df['CustomerID'] = (customer_group_no + 1000001).astype(str)

# Convert dates to datetime; values that cannot be parsed become NaT
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Map BOOKED values to Policy Status
# 'booked' is first normalised to stripped text, and float-like strings
# ('0.0', '1.0') are folded to '0' / '1' so they match the map keys.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

# Any 'booked' value that is not a key of this map yields a missing status.
policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# Step 2: Per-policy tenure and calendar-year columns
# Tenure in months is the difference between the (year, month) of the end date
# and the (year, month) of the start date; the day of month is not considered.
policy_start_year = df['Policy Start Date'].dt.year
policy_start_month = df['Policy Start Date'].dt.month
policy_end_year = df['Policy End Date'].dt.year
policy_end_month = df['Policy End Date'].dt.month

df['Policy Tenure Month'] = (
    (policy_end_year - policy_start_year) * 12
    + (policy_end_month - policy_start_month)
)

# Tenure in whole years (months / 12, rounded to 0 decimals)
df['Policy Tenure'] = (df['Policy Tenure Month'] / 12).round(0)

# 'Start Year' is the grouping key for the tenure roll-up below;
# 'End Year' is the grouping key for the churn label.
df['Start Year'] = policy_start_year
df['End Year'] = policy_end_year

# Step 3: For every (CustomerID, Start Year) pair take the earliest policy
# start date and the latest policy end date.
policy_span = df.groupby(['CustomerID', 'Start Year']).agg(
    {'Policy Start Date': 'min', 'Policy End Date': 'max'}
)
yearly_tenure = policy_span.reset_index()

# Months covered by that span, using the same year/month difference as the
# per-policy tenure.
span_start = yearly_tenure['Policy Start Date']
span_end = yearly_tenure['Policy End Date']
yearly_tenure['Yearly Tenure (Months)'] = (
    (span_end.dt.year - span_start.dt.year) * 12
    + (span_end.dt.month - span_start.dt.month)
)

# Step 4: Running total of yearly tenure within each customer, in row order
# of yearly_tenure.
running_months = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)'].cumsum()
yearly_tenure['Cumulative Tenure (Months)'] = running_months

# Express the running total in years, both fractional and rounded.
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Keep only the join keys and the derived tenure columns.
tenure_columns = [
    'CustomerID',
    'Start Year',
    'Cumulative Tenure (Months)',
    'Tenure Decimal',
    'Customer Tenure',
]
tenure_mapping = yearly_tenure[tenure_columns]

# Step 5: Attach the tenure columns to every policy row (left join keeps all
# rows of df).
df = df.merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')

# Step 8: Add New Customers column
# A policy row belongs to a "new customer" when its Start Year equals the
# customer's earliest Start Year. Such rows get the ID
# "<FirstPolicyYear>_<CustomerID>"; all other rows get an empty string.
# This is computed column-wise rather than with a row-wise df.apply(axis=1):
# it avoids a Python call per row and also works when df has no rows.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')

# Rows with a missing Start Year compare as False and are therefore "not new".
is_first_policy_year = df['Start Year'] == df['FirstPolicyYear']

df['New_Customer_ID'] = (
    df['FirstPolicyYear'].astype(str) + '_' + df['CustomerID'].astype(str)
).where(is_first_policy_year, '')
df['New Customers'] = is_first_policy_year.map({True: 'Yes', False: 'No'})

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when every status in ``group`` is 'Not Renewed', else 'No'.

    ``group`` must provide ``.unique()`` (e.g. a pandas Series of policy
    statuses). The answer is 'Yes' only if there is exactly one distinct value
    and that value is 'Not Renewed'; an empty group, or one containing any
    other distinct value (including a missing one), gives 'No'.
    """
    distinct_statuses = group.unique()
    only_not_renewed = (
        len(distinct_statuses) == 1 and distinct_statuses[0] == 'Not Renewed'
    )
    return 'Yes' if only_not_renewed else 'No'

# Label each (CustomerID, End Year) group: every row in the group receives the
# value calculate_churn_status returns for that group's policy statuses.
status_by_customer_year = df.groupby(['CustomerID', 'End Year'])['Policy Status']
df['Churn Label'] = status_by_customer_year.transform(calculate_churn_status)

from sqlalchemy import text

# Save the processed data into PostgreSQL
processed_table_name = 'overall_policy_level_data_EF'  # Target table name

# The table name is mixed-case. Unquoted, PostgreSQL folds it to lower case, so
# a bare DROP would target a different table from the one to_sql creates.
# Quote it with the dialect's identifier preparer so both refer to the same table.
quoted_table_name = engine.dialect.identifier_preparer.quote(processed_table_name)

# engine.begin() commits the DROP when the block exits. The connection is
# closed before the load starts, so the load never waits on an open
# transaction held by this cell.
with engine.begin() as connection:
    # Drop the table if it exists
    drop_query = f"DROP TABLE IF EXISTS {quoted_table_name};"
    connection.execute(text(drop_query))  # Execute the drop statement
print(f"Table {processed_table_name} dropped successfully.")

# Load the new data into the table (if_exists='replace' also covers the case
# where the table was recreated in the meantime).
df.to_sql(processed_table_name, con=engine, if_exists='replace', index=False)
print(f"Data loaded into {processed_table_name} successfully.")

Table overall_policy_level_data_EF dropped successfully.
Data loaded into overall_policy_level_data_EF successfully.
