In [1]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
db_config = {
    'host': 'localhost',
    'database': 'postgres',
    'user': 'postgres',
    'password': 'kaviyam123',
    'port': '5432'
}
connection_string = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
query = "SELECT * FROM appended_base_and_pr;"
df = pd.read_sql(query, con=engine)

In [2]:
# Convert date columns to datetime using the names in your dataset
df['policy start date'] = pd.to_datetime(df['policy start date'], errors='coerce')
df['policy end date'] = pd.to_datetime(df['policy end date'], errors='coerce')
df = df.dropna(subset=['policy start date', 'policy end date'])

# ---------------------------
# Step 2: Filter Premium Values
# ---------------------------
# Use the column "total premium payable" as provided in your column list
df['total premium payable'] = pd.to_numeric(df['total premium payable'].astype(str).str.strip(), errors='coerce')
df = df[df['total premium payable'].notnull() & (df['total premium payable'] > 0.01)]

# ---------------------------
# Step 3: Calculate Policy Tenure & Filter
# ---------------------------
def calculate_tenure_exact(start_date, end_date):
    diff = relativedelta(end_date, start_date)
    return diff.years * 12 + diff.months + (diff.days >= 0)

df['Policy Tenure(check)'] = df.apply(lambda row: calculate_tenure_exact(row['policy start date'], row['policy end date']), axis=1)
df = df[df['Policy Tenure(check)'] > 10]

len(df)

3169490

In [3]:
# ---------------------------
# Step 4: Handle Duplicates and Prioritize
# ---------------------------
def prioritize_rows(group):
    # Count null values in each row to help with prioritization
    group['null_count'] = group.isnull().sum(axis=1)
    group = group.sort_values(by=['null_count', 'booked', 'policy start date'], ascending=[True, False, True])
    return group.iloc[0]

# Identify duplicate rows based on 'policy no', 'policy start date', and 'policy end date'
duplicates = df[df.duplicated(subset=['policy no', 'policy start date', 'policy end date'], keep=False)]
cleaned_duplicates = (
    duplicates.groupby(['policy no', 'policy start date', 'policy end date'])
    .apply(prioritize_rows)
    .reset_index(drop=True)
)
df_cleaned = df.drop_duplicates(subset=['policy no', 'policy start date', 'policy end date'], keep=False)
df_cleaned = pd.concat([df_cleaned, cleaned_duplicates], ignore_index=True)

len(df_cleaned)

  .apply(prioritize_rows)


2336588

In [5]:
import re

# ---------------------------
# Step 6: Clean Specified Name Columns
# ---------------------------
def clean_name(name):
    return re.sub(r'[^a-zA-Z0-9]', '', str(name)).lower()
columns_to_clean = {
    "insured name": "Cleaned insured name",
    "new branch name 2": "Cleaned Branch Name 2",
    "state2": "Cleaned State2",
    "zone 2": "Cleaned Zone 2",
    "chassis number": "Cleaned Chassis Number",
    "enginenumber": "Cleaned Engine Number",
    "reg no": "Cleaned Reg no"
}
for orig_col, new_col in columns_to_clean.items():
    if orig_col in df_cleaned.columns:
        df_cleaned[new_col] = df_cleaned[orig_col].apply(clean_name)

In [6]:
# ---------------------------
# Final DataFrame ready for analysis or export
# ---------------------------
df_cleaned.to_sql('basiccleaned_appended_base_and_pr', con=engine, if_exists='replace', index=False)

140