In [1]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
query = "SELECT * FROM appended_base_and_pr;"
df = pd.read_sql(query, con=engine)

len(df)

# Create a DataFrame of column names and their types
#df_info = pd.DataFrame({'Column Name': df.columns, 'Data Type': df.dtypes.astype(str)})

# Display the DataFrame
#print(df_info)

# If you want to save it as a CSV file for easier viewing:
#df_info.to_csv("column_data_types.csv", index=False)

3230310

In [2]:
# Convert date columns to datetime using the names in your dataset
df['policy start date'] = pd.to_datetime(df['policy start date'], errors='coerce')
df['policy end date'] = pd.to_datetime(df['policy end date'], errors='coerce')
df = df.dropna(subset=['policy start date', 'policy end date'])

len(df)

3229882

In [None]:
# ---------------------------
# Step 3: Convert Date Columns Properly
# ---------------------------
# Remove leading/trailing spaces (sometimes SQL data has spaces)
df['policy start date'] = df['policy start date'].astype(str).str.strip()
df['policy end date'] = df['policy end date'].astype(str).str.strip()

# Convert using multiple possible formats
def parse_dates(date_col):
    return pd.to_datetime(date_col, errors='coerce', format='%d-%m-%Y')  # Adjust if necessary

df['policy start date'] = df['policy start date'].apply(parse_dates)
df['policy end date'] = df['policy end date'].apply(parse_dates)

# ---------------------------
# Step 4: Debug NaT Values (Check Invalid Dates)
# ---------------------------
print("Rows with NaT in 'policy start date':", df['policy start date'].isna().sum())
print("Rows with NaT in 'policy end date':", df['policy end date'].isna().sum())




Rows with NaT in 'policy start date': 2759256
Rows with NaT in 'policy end date': 2759256


In [5]:
# Select only 'policy start date' and 'policy end date' where values are NaT
missing_dates_df = df[df['policy start date'].isna() | df['policy end date'].isna()][['policy start date', 'policy end date']]

# Display the result
print(missing_dates_df)

        policy start date policy end date
665                   NaT             NaT
666                   NaT             NaT
669                   NaT             NaT
671                   NaT             NaT
672                   NaT             NaT
...                   ...             ...
3230305               NaT             NaT
3230306               NaT             NaT
3230307               NaT             NaT
3230308               NaT             NaT
3230309               NaT             NaT

[2759256 rows x 2 columns]


In [None]:
# Normalize dates: Convert using `dayfirst=True` to handle both `-` and `/` separators
df['policy start date'] = pd.to_datetime(df['policy start date'], dayfirst=True, errors='coerce')
df['policy end date'] = pd.to_datetime(df['policy end date'], dayfirst=True, errors='coerce')

df = df.dropna(subset=['policy start date', 'policy end date'])

#df = df[~df[['policy start date', 'policy end date']].isnull().all(axis=1)]

len(df)

In [None]:
# ---------------------------
# Step 2: Filter Premium Values
# ---------------------------
# Use the column "total premium payable" as provided in your column list
df['total premium payable'] = pd.to_numeric(df['total premium payable'].astype(str).str.strip(), errors='coerce')
df = df[df['total premium payable'].notnull() & (df['total premium payable'] > 0)]

# ---------------------------
# Step 3: Calculate Policy Tenure & Filter
# ---------------------------
def calculate_tenure_exact(start_date, end_date):
    diff = relativedelta(end_date, start_date)
    return diff.years * 12 + diff.months + (diff.days >= 0)

df['Policy Tenure(check)'] = df.apply(lambda row: calculate_tenure_exact(row['policy start date'], row['policy end date']), axis=1)
df = df[df['Policy Tenure(check)'] > 10]

# ---------------------------
# Step 4: Handle Duplicates and Prioritize
# ---------------------------
def prioritize_rows(group):
    # Count null values in each row to help with prioritization
    group['null_count'] = group.isnull().sum(axis=1)
    group = group.sort_values(by=['null_count', 'booked', 'policy start date'], ascending=[True, False, True])
    return group.iloc[0]

# Identify duplicate rows based on 'policy no', 'policy start date', and 'policy end date'
duplicates = df[df.duplicated(subset=['policy no', 'policy start date', 'policy end date'], keep=False)]
cleaned_duplicates = (
    duplicates.groupby(['policy no', 'policy start date', 'policy end date'])
    .apply(prioritize_rows)
    .reset_index(drop=True)
)
df_cleaned = df.drop_duplicates(subset=['policy no', 'policy start date', 'policy end date'], keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

len(df_cleaned)

In [None]:
# ---------------------------
# Step 5: Handle Duplicates based on 'Trim Policy No', 'policy start date', and 'policy end date'
# ---------------------------
def prioritize_trim_group(group):
    base_values = ['2022_base', '2023_base', '2024_base']
    base_rows = group[group['data'].isin(base_values)]
    if not base_rows.empty:
        selected = base_rows.sort_values(by='total premium payable', ascending=False).iloc[0]
    else:
        selected = group.sort_values(by=['policy issue date', 'total premium payable'], ascending=[False, False]).iloc[0]
    return selected
def assign_trim_group(group):
    if len(group) > 1:
        selected_row = prioritize_trim_group(group)
    else:
        selected_row = group.iloc[0]
    return selected_row
df_final = df_cleaned.groupby(['Trim Policy No', 'policy start date', 'policy end date'], group_keys=False).apply(assign_trim_group).reset_index(drop=True)

len(df_final)

In [None]:
import re

# ---------------------------
# Step 6: Clean Specified Name Columns
# ---------------------------
def clean_name(name):
    return re.sub(r'[^a-zA-Z0-9]', '', str(name)).lower()
columns_to_clean = {
    "insured name": "Cleaned insured name",
    "new branch name 2": "Cleaned Branch Name 2",
    "state2": "Cleaned State2",
    "zone 2": "Cleaned Zone 2",
    "chassis number": "Cleaned Chassis Number",
    "enginenumber": "Cleaned Engine Number",
    "reg no": "Cleaned Reg no"
}
for orig_col, new_col in columns_to_clean.items():
    if orig_col in df_final.columns:
        df_final[new_col] = df_final[orig_col].apply(clean_name)

In [None]:
# ---------------------------
# Final DataFrame ready for analysis or export
# ---------------------------
df_final.to_sql('cleaned_appended_base_and_pr', con=engine, if_exists='replace', index=False)

In [None]:
# ---------------------------
# Step 6: Update 'final old policy no' for Groups (Duplicates and Non-Duplicates)
#         (Only update rows where 'old policy no' is null)
# ---------------------------
def select_policy_no_for_duplicates(group):
    base_values = ['2022_base', '2023_base', '2024_base']
    base_rows = group[group['data'].isin(base_values)]
    if not base_rows.empty:
        selected = base_rows.sort_values(by='total premium payable', ascending=False).iloc[0]
    else:
        selected = group.sort_values(by='total premium payable', ascending=False).iloc[0]
    return selected['policy no']

def assign_final_old_policy_no(group):
    # Compute the replacement value for rows with a null 'old policy no'
    if len(group) > 1:
        computed_policy_no = select_policy_no_for_duplicates(group)
    else:
        computed_policy_no = group.iloc[0]['policy no']
    # For each row, if 'old policy no' is null, assign the computed value;
    # otherwise, keep the existing 'old policy no'.
    group['final old policy no'] = group.apply(
        lambda row: row['old policy no'] if pd.notnull(row['old policy no']) else computed_policy_no,
        axis=1
    )
    return group

# Group by 'Trim Policy No' and 'policy start date' (duplicate groups)
df_final = df_cleaned.groupby(['Trim Policy No', 'policy start date'], group_keys=False).apply(assign_final_old_policy_no)

In [None]:
# ---------------------------
# Final DataFrame ready for analysis or export
# ---------------------------
df_final.to_sql('cleaned_appended_base_and_pr1', con=engine, if_exists='replace', index=False)