In [None]:
import os
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Database connection setup.
# Every setting can be overridden with the matching libpq-style environment
# variable; the literals below are the defaults used when it is not set.
# NOTE(review): the password default is a credential stored in source code;
# it should be rotated and supplied through the environment only.
db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'kaviyam123'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot break the URL apart when it is parsed.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM backup_overall_merged_data_31_12;"
df = pd.read_sql(query, con=engine)

# Convert dates to datetime; values that cannot be parsed become NaT
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 2: Remove invalid rows in 'Total Premium Payable'
# (the column name is used with a trailing space throughout this script)
premium_col = 'Total Premium Payable '
df[premium_col] = pd.to_numeric(df[premium_col].astype(str).str.strip(), errors='coerce')
# Keep rows whose premium is numeric and non-zero. The explicit copy makes the
# filtered frame independent, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df[df[premium_col].notnull() & (df[premium_col] != 0)].copy()

# Initialize 'booked' column
if 'booked' not in df.columns:
    df['booked'] = None
# Convert 'booked' to string for consistency (missing values become '')
df['booked'] = df['booked'].fillna('').astype(str)

# Midnight of the current day; used as the reference date for open policies
today = pd.Timestamp.now().normalize()

def update_booked(group, as_of=None):
    """Fill in the blank ``booked`` flags for one group of policies.

    The rows are ordered by 'Policy Start Date'. Only rows whose ``booked``
    value is ``''`` or ``'None'`` are changed:

    * a row that has a successor gets ``'1'`` when the successor starts on or
      after the day following the row's 'Policy End Date';
    * the final row gets ``'-'`` when its 'Policy End Date' is on or after
      ``as_of``, and ``'0'`` otherwise.

    A blank row whose successor starts earlier than that (overlap), or whose
    dates compare as NaT, keeps its blank value.

    Args:
        group: DataFrame with 'Policy Start Date', 'Policy End Date' and
            'booked' columns. It is not modified.
        as_of: Reference timestamp for the final row. When omitted, the
            module-level ``today`` is used.

    Returns:
        A copy of ``group`` sorted by 'Policy Start Date' with ``booked``
        updated. An empty group is returned as-is.
    """
    if as_of is None:
        as_of = today

    # sort_values returns a new frame, so the writes below never touch the input.
    group = group.sort_values(by='Policy Start Date')
    if group.empty:
        return group

    unset = group['booked'].isin(['', 'None'])

    # Start date of the following row; NaT on the final row, which therefore
    # never satisfies the comparison and is handled separately below.
    next_start = group['Policy Start Date'].shift(-1)
    has_follow_up = next_start >= group['Policy End Date'] + timedelta(days=1)

    # A positional boolean mask (rather than row labels) keeps the update
    # correct even when several rows share the same index label.
    group.loc[(unset & has_follow_up).to_numpy(), 'booked'] = '1'

    if unset.iloc[-1]:
        still_running = group['Policy End Date'].iloc[-1] >= as_of
        group.iloc[-1, group.columns.get_loc('booked')] = '-' if still_running else '0'

    return group

# Apply the function to each group of policies.
# The groups are iterated explicitly instead of going through groupby.apply:
# every sub-frame then always contains the 'Policy No' column, independent of
# how the installed pandas version treats grouping columns inside apply.
booked_groups = [update_booked(policy_rows) for _, policy_rows in df.groupby('Policy No')]
df = (
    pd.concat(booked_groups, ignore_index=True)
    if booked_groups
    else df.iloc[0:0].reset_index(drop=True)  # nothing to group: keep an empty frame
)

# Step 4: Handle duplicates and prioritize
def prioritize_rows(group):
    """Pick the preferred row from a group of duplicate policy rows.

    Rows are ranked by, in order: fewest null cells, highest ``booked`` value
    (descending string order), earliest 'Policy Start Date'.

    Args:
        group: DataFrame containing at least the 'booked' and
            'Policy Start Date' columns. It is not modified.

    Returns:
        The best-ranked row as a Series carrying the row's original index
        label as its name. The temporary ranking column is removed so it does
        not end up in the caller's result.
    """
    ranked = group.assign(null_count=group.isnull().sum(axis=1)).sort_values(
        by=['null_count', 'booked', 'Policy Start Date'],
        ascending=[True, False, True],
    )
    # 'null_count' only exists to drive the ranking above.
    return ranked.iloc[0].drop(labels='null_count')

# Rows that share policy number, start date and end date are duplicates.
dedup_keys = ['Policy No', 'Policy Start Date', 'Policy End Date']
duplicates = df[df.duplicated(subset=dedup_keys, keep=False)]

# Pick one winner per duplicate set.
# - dropna=False: duplicated() treats missing keys (NaT/NaN) as equal, so such
#   rows are removed from df_cleaned below; they must therefore be grouped here
#   as well, otherwise they would vanish from the result altogether.
# - The winners are selected from `duplicates` by row label instead of being
#   rebuilt from the returned Series, which keeps the original column dtypes
#   and leaves out any helper column added during ranking. This relies on df
#   having unique row labels, which the index reset in the previous step gives.
winner_labels = [
    prioritize_rows(same_policy).name
    for _, same_policy in duplicates.groupby(dedup_keys, dropna=False)
]
cleaned_duplicates = duplicates.loc[winner_labels].reset_index(drop=True)

# Rows that were never duplicated, plus the single winner of each duplicate set.
df_cleaned = df.drop_duplicates(subset=dedup_keys, keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

def correct_booked(group):
    """Promote ``booked == '0'`` rows that turn out to have a follow-up policy.

    The rows are ordered by 'Policy Start Date'. A row marked ``'0'`` is
    changed to ``'1'`` when the following row starts on or after the day after
    the row's 'Policy End Date'. The final row and rows with any other
    ``booked`` value are left unchanged.

    Args:
        group: DataFrame with 'Policy Start Date', 'Policy End Date' and
            'booked' columns. It is not modified.

    Returns:
        A copy of ``group`` sorted by 'Policy Start Date' with the corrected
        ``booked`` values.
    """
    # sort_values returns a new frame, so the write below never touches the input.
    group = group.sort_values(by='Policy Start Date')

    # Start date of the following row; NaT on the final row, so the final row
    # can never satisfy the comparison.
    next_start = group['Policy Start Date'].shift(-1)
    has_follow_up = next_start >= group['Policy End Date'] + timedelta(days=1)
    marked_zero = group['booked'] == '0'

    # A positional boolean mask (rather than row labels) keeps the update
    # correct even when several rows share the same index label.
    group.loc[(marked_zero & has_follow_up).to_numpy(), 'booked'] = '1'
    return group

# Apply the correction to each group of policies.
# The groups are iterated explicitly instead of going through groupby.apply:
# every sub-frame then always contains the 'Policy No' column, independent of
# how the installed pandas version treats grouping columns inside apply.
corrected_groups = [
    correct_booked(policy_rows) for _, policy_rows in df_cleaned.groupby('Policy No')
]
df_cleaned = (
    pd.concat(corrected_groups, ignore_index=True)
    if corrected_groups
    else df_cleaned.iloc[0:0].reset_index(drop=True)  # nothing to group: keep an empty frame
)

In [None]:
# Write the cleaned frame to the database. An existing table with the same
# name is replaced, the DataFrame index is not stored, and rows are sent in
# batches of `chunksize` (adjust based on performance).
to_sql_options = dict(
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=100000,
)
df_cleaned.to_sql('cleaned_overall_merged_base_pr_datacheck', **to_sql_options)