In [None]:
from datetime import datetime, timedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM backup_overall_merged_data_31_12;"
df = pd.read_sql(query, con=engine)

# Convert date columns to datetime format (unparseable values become NaT)
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 2: Remove invalid rows in 'Total Premium Payable'
# The column key is written with a trailing space throughout this cell.
df['Total Premium Payable '] = pd.to_numeric(df['Total Premium Payable '].astype(str).str.strip(), errors='coerce')
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[df['Total Premium Payable '].notnull() & (df['Total Premium Payable '] != 0)].copy()

# Step 3: Remove policies with tenure less than or equal to 10 months
# Tenure is approximated as elapsed days / 30; rows with a NaT date get NaN
# and are removed by the comparison.
df['Policy Tenure'] = (df['Policy End Date'] - df['Policy Start Date']).dt.days / 30
df = df[df['Policy Tenure'] > 10]

# Step 4: Handle duplicates and prioritize
def prioritize_rows(group):
    """Pick the single row to keep from a group of duplicate policy rows.

    Rows are ranked by, in order: fewest null fields, highest 'booked' value,
    earliest 'Policy Start Date'. The winning row is returned as a Series.

    The ranking is done on a separate helper frame, so `group` is not modified
    and the temporary 'null_count' value does not appear in the returned row
    (previously it leaked into the de-duplicated output as an extra column).
    """
    # Positional 0..n-1 index so the winner can be looked up with iloc even if
    # the group's own index labels are not unique.
    ranking = (
        group[['booked', 'Policy Start Date']]
        .reset_index(drop=True)
        .assign(null_count=group.isnull().sum(axis=1).to_numpy())
        .sort_values(by=['null_count', 'booked', 'Policy Start Date'], ascending=[True, False, True])
    )
    return group.iloc[ranking.index[0]]

# Columns that together identify one policy period.
key_columns = ['Policy No', 'Policy Start Date', 'Policy End Date']

# Every row that shares its key with at least one other row.
duplicates = df[df.duplicated(subset=key_columns, keep=False)]
# dropna=False: duplicated()/drop_duplicates() treat missing keys as equal, but
# groupby drops groups with a missing key by default. Without this, duplicate
# sets with a null key would be removed below and never added back.
cleaned_duplicates = (
    duplicates.groupby(key_columns, dropna=False)
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# Remove duplicates and append cleaned duplicates
df_cleaned = df.drop_duplicates(subset=key_columns, keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

# Final DataFrame ready for analysis or export
df_cleaned.to_sql('cleaned_overall_merged_data_wod', con=engine, if_exists='replace', index=False)

In [None]:
from datetime import datetime, timedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM backup_overall_merged_data_31_12;"
df = pd.read_sql(query, con=engine)

# Convert date columns to datetime format (unparseable values become NaT)
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 2: Remove invalid rows in 'Total Premium Payable'
# The column key is written with a trailing space throughout this cell.
df['Total Premium Payable '] = pd.to_numeric(df['Total Premium Payable '].astype(str).str.strip(), errors='coerce')
# .copy() makes the filtered frame independent of the original, so adding the
# 'Policy Tenure' column afterwards does not raise SettingWithCopyWarning.
df = df[df['Total Premium Payable '].notnull() & (df['Total Premium Payable '] != 0)].copy()

# Step 3: Calculate precise policy tenure in months and filter policies > 10 months
def calculate_tenure(start_date, end_date):
    """Return the policy tenure in months from calendar fields.

    The result is the year/month difference between the two dates, plus one
    when the end date's day-of-month is on or after the start date's
    day-of-month.
    """
    year_gap = end_date.year - start_date.year
    month_gap = end_date.month - start_date.month
    # A bool, which counts as 0 or 1 in the sum below.
    end_day_reached = end_date.day >= start_date.day
    return year_gap * 12 + month_gap + end_day_reached

# Compute the tenure row by row, then keep only policies longer than 10 months.
df['Policy Tenure'] = df.apply(
    lambda policy: calculate_tenure(policy['Policy Start Date'], policy['Policy End Date']),
    axis='columns',
)
df = df.loc[df['Policy Tenure'].gt(10)]

# Step 4: Handle duplicates and prioritize
def prioritize_rows(group):
    """Pick the single row to keep from a group of duplicate policy rows.

    Rows are ranked by, in order: fewest null fields, highest 'booked' value,
    earliest 'Policy Start Date'. The winning row is returned as a Series.

    The ranking is done on a separate helper frame, so `group` is not modified
    and the temporary 'null_count' value does not appear in the returned row
    (previously it leaked into the de-duplicated output as an extra column).
    """
    # Positional 0..n-1 index so the winner can be looked up with iloc even if
    # the group's own index labels are not unique.
    ranking = (
        group[['booked', 'Policy Start Date']]
        .reset_index(drop=True)
        .assign(null_count=group.isnull().sum(axis=1).to_numpy())
        .sort_values(by=['null_count', 'booked', 'Policy Start Date'], ascending=[True, False, True])
    )
    return group.iloc[ranking.index[0]]

# Columns that together identify one policy period.
key_columns = ['Policy No', 'Policy Start Date', 'Policy End Date']

# Every row that shares its key with at least one other row.
duplicates = df[df.duplicated(subset=key_columns, keep=False)]
# dropna=False: duplicated()/drop_duplicates() treat missing keys as equal, but
# groupby drops groups with a missing key by default. Without this, duplicate
# sets with a null key would be removed below and never added back.
cleaned_duplicates = (
    duplicates.groupby(key_columns, dropna=False)
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# Remove duplicates and append cleaned duplicates
df_cleaned = df.drop_duplicates(subset=key_columns, keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

# Final DataFrame ready for analysis or export
df_cleaned.to_sql('cleaned_overall_merged_data_wod', con=engine, if_exists='replace', index=False)

In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
query = "SELECT * FROM backup_overall_merged_data_31_12;"
df = pd.read_sql(query, con=engine)

# Convert date columns to datetime format (unparseable values become NaT)
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy End Date'] = pd.to_datetime(df['Policy End Date'], errors='coerce')

# Step 2: Remove invalid rows in 'Total Premium Payable'
# The column key is written with a trailing space throughout this cell.
df['Total Premium Payable '] = pd.to_numeric(df['Total Premium Payable '].astype(str).str.strip(), errors='coerce')
# .copy() makes the filtered frame independent of the original, so adding the
# 'Policy Tenure' column afterwards does not raise SettingWithCopyWarning.
df = df[df['Total Premium Payable '].notnull() & (df['Total Premium Payable '] != 0)].copy()

# Step 3: Calculate precise policy tenure in months and filter policies > 10 months
def calculate_tenure_exact(start_date, end_date):
    """Return the policy tenure in months using dateutil's relativedelta.

    The result is the whole years and months between the two dates, plus one
    when the leftover day component is non-negative.

    Returns NaN when either date is missing. The date columns are parsed with
    errors='coerce', so NaT can reach this function, and relativedelta is not
    defined for NaT. A NaN tenure fails the later `> 10` filter, so such rows
    are dropped rather than aborting the whole apply.
    """
    if pd.isna(start_date) or pd.isna(end_date):
        return float('nan')
    diff = relativedelta(end_date, start_date)
    # NOTE(review): for end_date on or after start_date the day component looks
    # like it is never negative, so this term would always add 1 - confirm
    # that is intended rather than `diff.days > 0`.
    return diff.years * 12 + diff.months + (diff.days >= 0)

# Compute the tenure row by row, then keep only policies longer than 10 months.
df['Policy Tenure'] = df.apply(
    lambda policy: calculate_tenure_exact(policy['Policy Start Date'], policy['Policy End Date']),
    axis='columns',
)
df = df.loc[df['Policy Tenure'].gt(10)]

# Step 4: Handle duplicates and prioritize
def prioritize_rows(group):
    """Pick the single row to keep from a group of duplicate policy rows.

    Rows are ranked by, in order: fewest null fields, highest 'booked' value,
    earliest 'Policy Start Date'. The winning row is returned as a Series.

    The ranking is done on a separate helper frame, so `group` is not modified
    and the temporary 'null_count' value does not appear in the returned row
    (previously it leaked into the de-duplicated output as an extra column).
    """
    # Positional 0..n-1 index so the winner can be looked up with iloc even if
    # the group's own index labels are not unique.
    ranking = (
        group[['booked', 'Policy Start Date']]
        .reset_index(drop=True)
        .assign(null_count=group.isnull().sum(axis=1).to_numpy())
        .sort_values(by=['null_count', 'booked', 'Policy Start Date'], ascending=[True, False, True])
    )
    return group.iloc[ranking.index[0]]

# Columns that together identify one policy period.
key_columns = ['Policy No', 'Policy Start Date', 'Policy End Date']

# Every row that shares its key with at least one other row.
duplicates = df[df.duplicated(subset=key_columns, keep=False)]
# dropna=False: duplicated()/drop_duplicates() treat missing keys as equal, but
# groupby drops groups with a missing key by default. Without this, duplicate
# sets with a null key would be removed below and never added back.
cleaned_duplicates = (
    duplicates.groupby(key_columns, dropna=False)
    .apply(prioritize_rows)
    .reset_index(drop=True)
)

# Remove duplicates and append cleaned duplicates
df_cleaned = df.drop_duplicates(subset=key_columns, keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

# Final DataFrame ready for analysis or export
df_cleaned.to_sql('cleaned_overall_merged_data_wod', con=engine, if_exists='replace', index=False)

In [None]:
from datetime import datetime, timedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
# Read the de-duplicated policy table into memory.
query = "SELECT * FROM cleaned_overall_merged_data_wod;"
df = pd.read_sql(sql=query, con=engine)

# Parse both policy date columns; values that cannot be parsed become NaT.
df = df.assign(**{
    'Policy Start Date': pd.to_datetime(df['Policy Start Date'], errors='coerce'),
    'Policy End Date': pd.to_datetime(df['Policy End Date'], errors='coerce'),
})

# Make sure a 'booked' column exists, then normalise it to strings with
# missing values represented as ''.
if 'booked' not in df:
    df['booked'] = None
df['booked'] = df['booked'].fillna('').astype(str)

# Midnight of the current day; used by update_booked to decide whether a
# policy is still running.
today = pd.Timestamp.now().normalize()

def update_booked(group):
    """Fill in unset 'booked' flags for one group of policy rows.

    The rows are ordered by 'Policy Start Date'. A row whose flag is '' or
    'None' becomes '1' when the following row starts on or after the day after
    this row's 'Policy End Date'. The final row, if still unset, becomes '-'
    when its end date is on or after the module-level `today`, otherwise '0'.

    Returns the sorted group with the updated flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    unset_flags = ['', 'None']
    one_day = timedelta(days=1)

    # Walk consecutive (earlier, later) pairs; the final row is handled below.
    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] not in unset_flags:
            continue
        # "Starts exactly the next day" and "starts after that" set the same
        # flag, so a single >= comparison covers both cases.
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1'

    final_row = ordered.iloc[-1]
    if final_row['booked'] in unset_flags:
        still_running = final_row['Policy End Date'] >= today
        ordered.loc[final_row.name, 'booked'] = '-' if still_running else '0'

    return ordered

# Run update_booked once per 'Policy No' group, then replace the index that
# apply produces with a fresh 0..n-1 range.
df = (
    df.groupby('Policy No')
    .apply(update_booked)
    .reset_index(drop=True)
)

def correct_booked(group):
    """Upgrade '0' flags that are followed by a later policy.

    The rows are ordered by 'Policy Start Date'. Any row except the last whose
    'booked' flag is '0' becomes '1' when the following row starts on or after
    the day after this row's 'Policy End Date'.

    Returns the sorted group with the corrected flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    one_day = timedelta(days=1)

    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] != '0':
            continue
        # One >= test covers both "starts the next day" and "starts later".
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1'

    return ordered

# Run correct_booked once per 'Policy No' group and flatten the result.
df_cleaned = (
    df.groupby('Policy No')
    .apply(correct_booked)
    .reset_index(drop=True)
)

# Write the result to the database, replacing any existing table of that name.
df_cleaned.to_sql(
    name='cleaned_overall_merged_base_pr_data',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
from datetime import datetime, timedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
# Read the de-duplicated policy table into memory.
query = "SELECT * FROM cleaned_overall_merged_data_wod;"
df = pd.read_sql(sql=query, con=engine)

# Parse both policy date columns; values that cannot be parsed become NaT.
df = df.assign(**{
    'Policy Start Date': pd.to_datetime(df['Policy Start Date'], errors='coerce'),
    'Policy End Date': pd.to_datetime(df['Policy End Date'], errors='coerce'),
})

# Make sure a 'booked' column exists, then normalise it to strings with
# missing values represented as ''.
if 'booked' not in df:
    df['booked'] = None
df['booked'] = df['booked'].fillna('').astype(str)

# Midnight of the current day; used by update_booked to decide whether a
# policy is still running.
today = pd.Timestamp.now().normalize()

def update_booked(group):
    """Fill in unset 'booked' flags for one group of policy rows.

    The rows are ordered by 'Policy Start Date'. A row whose flag is '' or
    'None' becomes '1.0' when the following row starts on or after the day
    after this row's 'Policy End Date'. The final row, if still unset, becomes
    '-' when its end date is on or after the module-level `today`, otherwise
    '0.0'.

    Returns the sorted group with the updated flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    unset_flags = ['', 'None']
    one_day = timedelta(days=1)

    # Walk consecutive (earlier, later) pairs; the final row is handled below.
    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] not in unset_flags:
            continue
        # "Starts exactly the next day" and "starts after that" set the same
        # flag, so a single >= comparison covers both cases.
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1.0'

    final_row = ordered.iloc[-1]
    if final_row['booked'] in unset_flags:
        if final_row['Policy End Date'] >= today:
            ordered.loc[final_row.name, 'booked'] = '-'
        else:
            ordered.loc[final_row.name, 'booked'] = '0.0'

    return ordered

# Run update_booked once per 'Policy No' group, then replace the index that
# apply produces with a fresh 0..n-1 range.
df = (
    df.groupby('Policy No')
    .apply(update_booked)
    .reset_index(drop=True)
)

def correct_booked(group):
    """Revisit '0.0' flags for one group of policy rows.

    The rows are ordered by 'Policy Start Date'. Any row except the last whose
    'booked' flag is '0.0' becomes '1.0' when the following row starts on or
    after the day after this row's 'Policy End Date'. If the last row is
    flagged '0.0' and its end date is today or later, it becomes '-'.

    Returns the sorted group with the corrected flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    one_day = timedelta(days=1)

    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] != '0.0':
            continue
        # One >= test covers both "starts the next day" and "starts later".
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1.0'

    # A final policy that has not ended yet is marked '-' instead of '0.0'.
    final_row = ordered.iloc[-1]
    if final_row['booked'] == '0.0' and final_row['Policy End Date'] >= pd.Timestamp.now().normalize():
        ordered.loc[final_row.name, 'booked'] = '-'

    return ordered

# Run correct_booked once per 'Policy No' group and flatten the result.
df_cleaned = (
    df.groupby('Policy No')
    .apply(correct_booked)
    .reset_index(drop=True)
)

# Keep 'booked' as a string column in the exported table.
df_cleaned['booked'] = df_cleaned['booked'].astype(str)

# Write the result to the database, replacing any existing table of that name.
df_cleaned.to_sql(
    name='cleaned_overall_merged_base_pr_data',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
# Read the de-duplicated policy table into memory.
query = "SELECT * FROM cleaned_overall_merged_data_wod;"
df = pd.read_sql(sql=query, con=engine)

# Parse both policy date columns (no errors='coerce' here, so an unparseable
# value raises).
df = df.assign(**{
    'Policy Start Date': pd.to_datetime(df['Policy Start Date']),
    'Policy End Date': pd.to_datetime(df['Policy End Date']),
})

# Order rows by policy and start date and renumber the index 0..n-1; the loops
# below address rows by that integer label.
df = df.sort_values(by=['Policy No', 'Policy Start Date'], ignore_index=True)

# Current calendar date, compared against policy end dates below.
today = datetime.now().date()

# Step 2: Process nulls in Booked column
# For each row with a null 'booked' value:
#   - if a later row exists for the same policy: 1.0 when that row starts
#     after this row's end date, otherwise ''
#   - if this is the policy's last row: 0.0 when the policy has ended
#     (end date <= today), otherwise ''
#
# df was sorted by ['Policy No', 'Policy Start Date'] and re-indexed 0..n-1
# above, so the next row of the same policy, if any, is simply row i + 1.
# Checking that neighbour replaces the previous full-frame scan per row, which
# was quadratic in the number of rows.

# Match the other cells: tolerate a source table without a 'booked' column.
if 'booked' not in df.columns:
    df['booked'] = None
# The column ends up holding both floats and ''. Casting to object first avoids
# assigning a string into a float64 column, which newer pandas versions warn
# about.
df['booked'] = df['booked'].astype(object)

for i in range(len(df)):
    if not pd.isnull(df.loc[i, 'booked']):
        continue

    policy_end_date = df.loc[i, 'Policy End Date'].date()
    has_next = i + 1 < len(df) and df.loc[i + 1, 'Policy No'] == df.loc[i, 'Policy No']

    if has_next:
        next_policy_start_date = df.loc[i + 1, 'Policy Start Date'].date()
        if next_policy_start_date > policy_end_date:
            df.loc[i, 'booked'] = 1.0
        else:
            df.loc[i, 'booked'] = ''  # Use empty string instead of -1.0
    else:
        # No next policy
        if policy_end_date <= today:
            df.loc[i, 'booked'] = 0.0
        else:
            df.loc[i, 'booked'] = ''

# Step 3: Additional processing for Booked = 0
# A row flagged 0.0 is upgraded to 1.0 when a later row of the same policy
# starts after this row's end date.
#
# df was sorted by ['Policy No', 'Policy Start Date'] and re-indexed 0..n-1
# above, so the next row of the same policy, if any, is row i + 1. The last row
# of the frame can never have a successor, so the loop stops one row early.
# This replaces the previous full-frame scan per row.
for i in range(len(df) - 1):
    if df.loc[i, 'booked'] != 0.0:
        continue
    if df.loc[i + 1, 'Policy No'] != df.loc[i, 'Policy No']:
        continue

    policy_end_date = df.loc[i, 'Policy End Date'].date()
    next_policy_start_date = df.loc[i + 1, 'Policy Start Date'].date()

    # Compare dates
    if next_policy_start_date > policy_end_date:
        df.loc[i, 'booked'] = 1.0

# Step 4: Export the cleaned DataFrame
# Replaces any existing table of that name; the DataFrame index is not written.
df.to_sql(
    name='cleaned_overall_merged_base_pr_data',
    con=engine,
    if_exists='replace',
    index=False,
)

In [1]:
from datetime import datetime, timedelta
import pandas as pd
from sqlalchemy import create_engine

# Database connection setup
# The password is no longer stored in the notebook: it is read from PGPASSWORD
# or, if that is unset, prompted for interactively. The other settings keep
# their previous values as defaults and can be overridden with the standard
# libpq variable names (PGHOST, PGDATABASE, PGUSER, PGPORT).
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', '10.10.10.71'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# quote_plus keeps characters such as '@', ':' or '/' in the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# Step 1: Load data
# Read the de-duplicated policy table into memory.
query = "SELECT * FROM cleaned_overall_merged_data_wod;"
df = pd.read_sql(sql=query, con=engine)

# Parse both policy date columns; values that cannot be parsed become NaT.
df = df.assign(**{
    'Policy Start Date': pd.to_datetime(df['Policy Start Date'], errors='coerce'),
    'Policy End Date': pd.to_datetime(df['Policy End Date'], errors='coerce'),
})

# Make sure a 'booked' column exists, then normalise it to strings with
# missing values represented as ''.
if 'booked' not in df:
    df['booked'] = None
df['booked'] = df['booked'].fillna('').astype(str)

# Midnight of the current day; used by update_booked to decide whether a
# policy is still running.
today = pd.Timestamp.now().normalize()

def update_booked(group):
    """Fill in unset 'booked' flags for one group of policy rows.

    The rows are ordered by 'Policy Start Date'. A row whose flag is '' or
    'None' becomes '1.0' when the following row starts on or after the day
    after this row's 'Policy End Date'. The final row, if still unset, becomes
    '-' when its end date is on or after the module-level `today`, otherwise
    '0.0'.

    Returns the sorted group with the updated flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    unset_flags = ['', 'None']
    one_day = timedelta(days=1)

    # Walk consecutive (earlier, later) pairs; the final row is handled below.
    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] not in unset_flags:
            continue
        # "Starts exactly the next day" and "starts after that" set the same
        # flag, so a single >= comparison covers both cases.
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1.0'

    final_row = ordered.iloc[-1]
    if final_row['booked'] in unset_flags:
        still_running = final_row['Policy End Date'] >= today
        ordered.loc[final_row.name, 'booked'] = '-' if still_running else '0.0'

    return ordered

# Run update_booked once per 'Policy No' group, then replace the index that
# apply produces with a fresh 0..n-1 range.
df = (
    df.groupby('Policy No')
    .apply(update_booked)
    .reset_index(drop=True)
)

def correct_booked(group):
    """Upgrade '0.0' flags that are followed by a later policy.

    The rows are ordered by 'Policy Start Date'. Any row except the last whose
    'booked' flag is '0.0' becomes '1.0' when the following row starts on or
    after the day after this row's 'Policy End Date'.

    Returns the sorted group with the corrected flags.
    """
    ordered = group.sort_values(by='Policy Start Date')
    one_day = timedelta(days=1)

    for pos in range(1, len(ordered)):
        earlier = ordered.iloc[pos - 1]
        later = ordered.iloc[pos]
        if earlier['booked'] != '0.0':
            continue
        # One >= test covers both "starts the next day" and "starts later".
        if later['Policy Start Date'] >= earlier['Policy End Date'] + one_day:
            ordered.loc[earlier.name, 'booked'] = '1.0'

    return ordered

# Run correct_booked once per 'Policy No' group and flatten the result.
df_cleaned = (
    df.groupby('Policy No')
    .apply(correct_booked)
    .reset_index(drop=True)
)

# Write the result to the database, replacing any existing table of that name.
df_cleaned.to_sql(
    name='cleaned_overall_merged_base_pr_data',
    con=engine,
    if_exists='replace',
    index=False,
)

  df = df.groupby('Policy No').apply(update_booked).reset_index(drop=True)
  df_cleaned = df.groupby('Policy No').apply(correct_booked).reset_index(drop=True)


152