In [None]:
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    # The password is read from the PGPASSWORD environment variable, or prompted for,
    # so the secret is not stored in (or committed with) the notebook.
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': '5432'
}
# URL-escape the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM test_data_12_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Parse the policy dates; values that cannot be parsed become NaT (errors='coerce')
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 3: Deduplicate and prioritize
# keep=False selects every row of a repeated (policy, start, end) key, not only the repeats.
duplicates = df[df.duplicated(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep=False)]

def prioritize_rows(group):
    """Pick the preferred row from a group of duplicate policy rows.

    Rows are ranked by the number of null fields (fewest first); ties are
    broken by the 'booked' value (highest first). The returned row is a
    Series that also carries the helper field 'null_count'.
    """
    missing_per_row = group.isnull().sum(axis=1)
    ranked = group.assign(null_count=missing_per_row).sort_values(
        by=['null_count', 'booked'],
        ascending=[True, False],
    )
    return ranked.iloc[0]

# Pick one row per duplicated key. dropna=False keeps the groups whose start/end date is
# NaT: duplicated() above treats missing values as equal and flags such rows, but groupby()
# discards NaN/NaT keys by default, which would silently skip prioritization for them.
cleaned_duplicates = (
    duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date'], dropna=False)
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# The prioritized rows are appended after the originals, so keep='last' makes the
# prioritized row the survivor for every duplicated key.
df_cleaned = pd.concat([df, cleaned_duplicates]).drop_duplicates(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep='last')

# Re-parse after the concat so the end date is a datetime column for the comparison below.
df_cleaned['Policy End Date'] = pd.to_datetime(df_cleaned['Policy End Date'], errors='coerce')

# Step 4: Handle NULL values in BOOKED
# A missing 'booked' becomes 0 when the policy end date is in the past, otherwise '-'.
today = pd.Timestamp(datetime.now().date())  # Ensure today is a pandas.Timestamp
df_cleaned['booked'] = df_cleaned.apply(
    lambda row: (
        0 if pd.isnull(row['booked']) and pd.notnull(row['Policy End Date']) and row['Policy End Date'] < today 
        else '-' if pd.isnull(row['booked']) else row['booked']
    ),
    axis=1
)

# Step 5: Correct BOOKED values based on Type
correction_count = 0  # number of groups corrected by correct_booked()

def correct_booked(group):
    """Set 'booked' to 1 on the Type 'A' rows of a group when it also has Type 'B'.

    The correction is applied only when the group contains both a Type 'A'
    and a Type 'B' row and the first Type 'A' row has booked == 0; in that
    case every Type 'A' row of the group gets booked = 1 and the module-level
    ``correction_count`` is incremented once.

    Parameters:
        group: DataFrame with at least the columns 'Type' and 'booked'.

    Returns:
        The group itself when nothing changes, otherwise a corrected copy.
        The frame passed in is never modified, because mutating the object
        handed over by ``groupby(...).apply`` is not supported by pandas.
    """
    global correction_count
    is_type_a = group['Type'] == 'A'
    has_type_b = (group['Type'] == 'B').any()

    # Both Type A and Type B must exist in the group
    if not is_type_a.any() or not has_type_b:
        return group

    # Only the first Type A row decides whether the group is corrected
    if group.loc[is_type_a, 'booked'].iloc[0] == 0:
        correction_count += 1
        group = group.copy()
        group.loc[is_type_a, 'booked'] = 1
    return group

# Run the correction once per policy number, on the Type A / Type B rows only.
# NOTE(review): df_cleaned is replaced by the corrected A/B subset, so rows with any
# other (or missing) 'Type' are not written to the cleaned table - confirm this is intended.
is_type_a_or_b = df_cleaned['Type'].isin(['A', 'B'])
type_a_b = df_cleaned[is_type_a_or_b]
type_a_b_grouped = type_a_b.groupby('Policy No')

corrected_groups = type_a_b_grouped.apply(correct_booked)
df_cleaned = corrected_groups.reset_index(drop=True)

# Report how many policy groups were corrected
print(f"Number of corrections made: {correction_count}")

# Write the cleaned dataset back to the database, replacing any previous version of the table
df_cleaned.to_sql('cleaned_test_data_12_12', con=engine, if_exists='replace', index=False)

In [None]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    # The password is read from the PGPASSWORD environment variable, or prompted for,
    # so the secret is not stored in (or committed with) the notebook.
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': '5432'
}
# URL-escape the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM public.cleaned_test_data_12_12;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Normalize a name to lower-case ASCII letters and digits only.

    The value is first converted with str(), so non-string inputs are
    normalized from their string form (e.g. None becomes 'none').
    """
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', str(name))
    return alphanumeric_only.lower()

def _months_between(start_dates, end_dates):
    """Month difference between two datetime Series, using year and month components only."""
    year_gap = end_dates.dt.year - start_dates.dt.year
    month_gap = end_dates.dt.month - start_dates.dt.month
    return year_gap * 12 + month_gap


def _new_customer_id(row):
    """Return '<first year>_<customer id>' for rows in the customer's first policy year, else ''."""
    if row['Start Year'] == row['FirstPolicyYear']:
        return f"{row['FirstPolicyYear']}_{row['CustomerID']}"
    return ''


# --- Customer identity -------------------------------------------------------
# A customer is the combination of the normalized insured name and the branch name;
# each distinct combination gets a sequential id starting at 1000001.
cleaned_names = df['Insured name '].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + df['New Branch Name  2'].astype(str)
customer_number = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_number.astype(str)

# --- Dates -------------------------------------------------------------------
# Unparseable values become NaT (errors='coerce').
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# --- BOOKED -> Policy Status ---------------------------------------------------
# Work on trimmed strings and collapse float-like text ('0.0', '1.0') to '0' / '1'.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

# Any value that is not a key of this map yields a missing Policy Status.
policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# --- Per-policy tenure ---------------------------------------------------------
df['Policy Tenure Month'] = _months_between(df['Policy Start Date'], df['Policy End Date'])
df['Policy Tenure'] = df['Policy Tenure Month'].div(12).round(0)  # years, rounded

# Calendar years used for grouping below
df['Start Year'] = df['Policy Start Date'].dt.year
df['End Year'] = df['Policy End Date'].dt.year

# --- Per-customer, per-start-year tenure -----------------------------------------
# Earliest start and latest end among the policies a customer started in each year.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)
yearly_tenure['Yearly Tenure (Months)'] = _months_between(
    yearly_tenure['Policy Start Date'], yearly_tenure['Policy End Date']
)

# Running total per customer; the grouped frame is ordered by CustomerID and Start Year
# (groupby sorts its keys by default), so the sum accumulates year by year.
months_by_customer = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
yearly_tenure['Cumulative Tenure (Months)'] = months_by_customer.cumsum()

# Cumulative tenure in years: exact and rounded
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Attach the tenure figures to every policy row of the matching customer / start year
tenure_mapping = yearly_tenure[['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']]
df = df.merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')

# --- New customers -------------------------------------------------------------
# A policy row counts as "new customer" when it starts in the customer's first policy year.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
df['New_Customer_ID'] = df.apply(_new_customer_id, axis=1)
df['New Customers'] = df['New_Customer_ID'].apply(lambda new_id: 'Yes' if new_id else 'No')

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct status in the group, else 'No'.

    `group` is a Series of policy statuses; missing values count as a
    distinct status of their own.
    """
    distinct_statuses = group.unique()
    if len(distinct_statuses) != 1:
        return 'No'
    return 'Yes' if distinct_statuses[0] == 'Not Renewed' else 'No'

# Churn label per customer and policy end year, broadcast to every row of that group
df['Churn Label'] = df.groupby(['CustomerID', 'End Year'])['Policy Status'].transform(calculate_churn_status)

from sqlalchemy import text

# Save the processed data into PostgreSQL
processed_table_name = 'overall_policy_level_data_EF'  # Target table name

# The identifier must be double-quoted: PostgreSQL folds unquoted identifiers to lower
# case, so an unquoted DROP would target "overall_policy_level_data_ef" instead of the
# mixed-case table that to_sql() writes below.
quoted_table_name = '"' + processed_table_name.replace('"', '""') + '"'
drop_query = f"DROP TABLE IF EXISTS {quoted_table_name};"

# engine.begin() commits the DROP when the block exits, so no open transaction is left
# holding a lock on the table while to_sql() works on its own connection.
with engine.begin() as connection:
    connection.execute(text(drop_query))  # Execute the drop statement
print(f"Table {processed_table_name} dropped successfully.")

# Load the new data into the table
df.to_sql(processed_table_name, con=engine, if_exists='replace', index=False)
print(f"Data loaded into {processed_table_name} successfully.")

In [1]:
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    # The password is read from the PGPASSWORD environment variable, or prompted for,
    # so the secret is not stored in (or committed with) the notebook.
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': '5432'
}
# URL-escape the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)

engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM test_data_12_12;"
df = pd.read_sql(query, con=engine)

# Step 2: Parse datetime columns; values that cannot be parsed become NaT (errors='coerce')
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 3: Deduplicate and prioritize
# keep=False selects every row of a repeated (policy, start, end) key, not only the repeats.
duplicates = df[df.duplicated(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep=False)]

def prioritize_rows(group):
    """Pick the preferred row from a group of duplicate policy rows.

    Rows are ranked by the number of null fields (fewest first); ties are
    broken by the 'booked' value (highest first). The returned row is a
    Series that also carries the helper field 'null_count'.
    """
    missing_per_row = group.isnull().sum(axis=1)
    ranked = group.assign(null_count=missing_per_row).sort_values(
        by=['null_count', 'booked'],
        ascending=[True, False],
    )
    return ranked.iloc[0]

# Pick one row per duplicated key. dropna=False keeps the groups whose start/end date is
# NaT: duplicated() above treats missing values as equal and flags such rows, but groupby()
# discards NaN/NaT keys by default, which would silently skip prioritization for them.
cleaned_duplicates = (
    duplicates.groupby(['Policy No', 'Policy Start Date', 'Policy End Date'], dropna=False)
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# The prioritized rows are appended after the originals, so keep='last' makes the
# prioritized row the survivor for every duplicated key.
df_cleaned = pd.concat([df, cleaned_duplicates]).drop_duplicates(subset=['Policy No', 'Policy Start Date', 'Policy End Date'], keep='last')

# Step 4: Handle NULL values in BOOKED
today = pd.Timestamp(datetime.now().date())

# Only missing 'booked' values are filled: 0 when the policy end date is in the past,
# '-' otherwise (including when the end date itself is missing).
df_cleaned['booked'] = df_cleaned['booked'].fillna(
    df_cleaned['Policy End Date'].apply(
        lambda x: 0 if pd.notnull(x) and x < today else '-'
    )
)

# Step 5: Correct BOOKED values based on Type
correction_count = 0  # number of groups corrected by correct_booked()

def correct_booked(group):
    """Set 'booked' to 1 on the Type 'A' rows of a group when it also has Type 'B'.

    The correction is applied only when the group contains both a Type 'A'
    and a Type 'B' row and the first Type 'A' row has booked == 0; in that
    case every Type 'A' row of the group gets booked = 1 and the module-level
    ``correction_count`` is incremented once.

    Parameters:
        group: DataFrame with at least the columns 'Type' and 'booked'.

    Returns:
        The group itself when nothing changes, otherwise a corrected copy.
        The frame passed in is never modified, because mutating the object
        handed over by ``groupby(...).apply`` is not supported by pandas.
    """
    global correction_count
    is_type_a = group['Type'] == 'A'
    has_type_b = (group['Type'] == 'B').any()

    # Both Type A and Type B must exist in the group
    if not is_type_a.any() or not has_type_b:
        return group

    # Only the first Type A row decides whether the group is corrected
    if group.loc[is_type_a, 'booked'].iloc[0] == 0:
        correction_count += 1
        group = group.copy()
        group.loc[is_type_a, 'booked'] = 1
    return group

# Run the correction once per policy number, on the Type A / Type B rows only.
# NOTE(review): df_cleaned is replaced by the corrected A/B subset, so rows with any
# other (or missing) 'Type' are not written to the cleaned table - confirm this is intended.
is_type_a_or_b = df_cleaned['Type'].isin(['A', 'B'])
type_a_b = df_cleaned[is_type_a_or_b]
type_a_b_grouped = type_a_b.groupby('Policy No')

corrected_groups = type_a_b_grouped.apply(correct_booked)
df_cleaned = corrected_groups.reset_index(drop=True)

# Report how many policy groups were corrected
print(f"Number of corrections made: {correction_count}")

# Write the cleaned dataset back to the database, replacing any previous version of the table
df_cleaned.to_sql('cleaned_test_data_12_12', con=engine, if_exists='replace', index=False)

  .apply(prioritize_rows)
  df_cleaned = type_a_b_grouped.apply(correct_booked).reset_index(drop=True)


Number of corrections made: 49


332

In [2]:
import pandas as pd
import re
from sqlalchemy import create_engine

# Database connection setup
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    # The password is read from the PGPASSWORD environment variable, or prompted for,
    # so the secret is not stored in (or committed with) the notebook.
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': '5432'
}
# URL-escape the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data from PostgreSQL
query = "SELECT * FROM public.cleaned_test_data_12_12;"
df = pd.read_sql(query, con=engine)

# Function to clean names
def clean_name(name):
    """Normalize a name to lower-case ASCII letters and digits only.

    The value is first converted with str(), so non-string inputs are
    normalized from their string form (e.g. None becomes 'none').
    """
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', str(name))
    return alphanumeric_only.lower()

def _months_between(start_dates, end_dates):
    """Month difference between two datetime Series, using year and month components only."""
    year_gap = end_dates.dt.year - start_dates.dt.year
    month_gap = end_dates.dt.month - start_dates.dt.month
    return year_gap * 12 + month_gap


def _new_customer_id(row):
    """Return '<first year>_<customer id>' for rows in the customer's first policy year, else ''."""
    if row['Start Year'] == row['FirstPolicyYear']:
        return f"{row['FirstPolicyYear']}_{row['CustomerID']}"
    return ''


# --- Customer identity -------------------------------------------------------
# A customer is the combination of the normalized insured name and the branch name;
# each distinct combination gets a sequential id starting at 1000001.
cleaned_names = df['Insured name '].apply(clean_name)
df['Cleaned_Insured name'] = cleaned_names
df['CustomerID_Base'] = cleaned_names.astype(str) + '_' + df['New Branch Name  2'].astype(str)
customer_number = df.groupby('CustomerID_Base').ngroup() + 1000001
df['CustomerID'] = customer_number.astype(str)

# --- Dates -------------------------------------------------------------------
# Unparseable values become NaT (errors='coerce').
for date_column in ('Policy Start Date', 'Policy End Date'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# --- BOOKED -> Policy Status ---------------------------------------------------
# Work on trimmed strings and collapse float-like text ('0.0', '1.0') to '0' / '1'.
booked_text = df['booked'].astype(str).str.strip()
df['booked'] = booked_text.replace({'0.0': '0', '1.0': '1'})

# Any value that is not a key of this map yields a missing Policy Status.
policy_status_map = {'0': 'Not Renewed', '1': 'Renewed', '-': 'Open'}
df['Policy Status'] = df['booked'].map(policy_status_map)

# --- Per-policy tenure ---------------------------------------------------------
df['Policy Tenure Month'] = _months_between(df['Policy Start Date'], df['Policy End Date'])
df['Policy Tenure'] = df['Policy Tenure Month'].div(12).round(0)  # years, rounded

# Calendar years used for grouping below
df['Start Year'] = df['Policy Start Date'].dt.year
df['End Year'] = df['Policy End Date'].dt.year

# --- Per-customer, per-start-year tenure -----------------------------------------
# Earliest start and latest end among the policies a customer started in each year.
yearly_tenure = (
    df.groupby(['CustomerID', 'Start Year'])
    .agg({'Policy Start Date': 'min', 'Policy End Date': 'max'})
    .reset_index()
)
yearly_tenure['Yearly Tenure (Months)'] = _months_between(
    yearly_tenure['Policy Start Date'], yearly_tenure['Policy End Date']
)

# Running total per customer; the grouped frame is ordered by CustomerID and Start Year
# (groupby sorts its keys by default), so the sum accumulates year by year.
months_by_customer = yearly_tenure.groupby('CustomerID')['Yearly Tenure (Months)']
yearly_tenure['Cumulative Tenure (Months)'] = months_by_customer.cumsum()

# Cumulative tenure in years: exact and rounded
yearly_tenure['Tenure Decimal'] = yearly_tenure['Cumulative Tenure (Months)'] / 12
yearly_tenure['Customer Tenure'] = yearly_tenure['Tenure Decimal'].round(0)

# Attach the tenure figures to every policy row of the matching customer / start year
tenure_mapping = yearly_tenure[['CustomerID', 'Start Year', 'Cumulative Tenure (Months)', 'Tenure Decimal', 'Customer Tenure']]
df = df.merge(tenure_mapping, on=['CustomerID', 'Start Year'], how='left')

# --- New customers -------------------------------------------------------------
# A policy row counts as "new customer" when it starts in the customer's first policy year.
df['FirstPolicyYear'] = df.groupby('CustomerID')['Start Year'].transform('min')
df['New_Customer_ID'] = df.apply(_new_customer_id, axis=1)
df['New Customers'] = df['New_Customer_ID'].apply(lambda new_id: 'Yes' if new_id else 'No')

# Step 10: Calculate year-wise churn status
def calculate_churn_status(group):
    """Return 'Yes' when 'Not Renewed' is the only distinct status in the group, else 'No'.

    `group` is a Series of policy statuses; missing values count as a
    distinct status of their own.
    """
    distinct_statuses = group.unique()
    if len(distinct_statuses) != 1:
        return 'No'
    return 'Yes' if distinct_statuses[0] == 'Not Renewed' else 'No'

# Churn label per customer and policy end year, broadcast to every row of that group
df['Churn Label'] = df.groupby(['CustomerID', 'End Year'])['Policy Status'].transform(calculate_churn_status)

from sqlalchemy import text

# Save the processed data into PostgreSQL
processed_table_name = 'overall_policy_level_data_EF'  # Target table name

# The identifier must be double-quoted: PostgreSQL folds unquoted identifiers to lower
# case, so an unquoted DROP would target "overall_policy_level_data_ef" instead of the
# mixed-case table that to_sql() writes below.
quoted_table_name = '"' + processed_table_name.replace('"', '""') + '"'
drop_query = f"DROP TABLE IF EXISTS {quoted_table_name};"

# engine.begin() commits the DROP when the block exits, so no open transaction is left
# holding a lock on the table while to_sql() works on its own connection.
with engine.begin() as connection:
    connection.execute(text(drop_query))  # Execute the drop statement
print(f"Table {processed_table_name} dropped successfully.")

# Load the new data into the table
df.to_sql(processed_table_name, con=engine, if_exists='replace', index=False)
print(f"Data loaded into {processed_table_name} successfully.")

Table overall_policy_level_data_EF dropped successfully.
Data loaded into overall_policy_level_data_EF successfully.
