In [1]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the standard libpq environment variables so that
# no secret is stored in the notebook. Non-secret settings fall back to the local
# defaults; the password is prompted for when PGPASSWORD is not set.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# User and password are URL-escaped: characters such as '@', ':' or '/' would
# otherwise be read as URL delimiters and corrupt the connection string.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Pull the full source table into memory as a DataFrame.
query = "SELECT * FROM cleanchassisengine_basiccleaned_appended_base_and_pr;"
df = pd.read_sql(query, con=engine)

In [2]:
import pandas as pd

def is_valid_value(val):
    """Return True when `val` holds a usable value.

    A value is rejected when `pd.isna` reports it as missing, when its string
    form is empty after stripping surrounding whitespace, or when that string
    equals 'blank' (compared case-insensitively).
    """
    if pd.isna(val):
        return False
    normalized = str(val).strip().lower()
    return normalized not in ("", "blank")

# Flag rows whose chassis number AND engine number both pass is_valid_value
chassis_ok = df['Cleaned Chassis Number'].apply(is_valid_value)
engine_ok = df['Cleaned Engine Number'].apply(is_valid_value)
mask_valid = chassis_ok & engine_ok

# Keep only the flagged rows, as an independent copy of the loaded frame
df_valid = df.loc[mask_valid].copy()

# Parse both policy date columns; values that cannot be parsed become NaT
for date_col in ('policy start date', 'policy end date'):
    df_valid[date_col] = pd.to_datetime(df_valid[date_col], errors='coerce')

# ---------------------------
# Step 5: Handle Duplicates based on grouping columns
# ---------------------------
def prioritize_trim_group(group):
    """Pick the single record to keep from a group of duplicate rows.

    Rows whose 'data' value is '2022_base', '2023_base' or '2024_base' take
    precedence: if any exist, the one with the highest 'total premium payable'
    is returned. Otherwise the row with the latest 'policy issue date' is
    returned, with 'total premium payable' (highest first) as the tie-breaker.

    Returns the chosen row as a Series.
    """
    is_base = group['data'].isin(['2022_base', '2023_base', '2024_base'])
    base_candidates = group.loc[is_base]

    if base_candidates.empty:
        # No base record: newest issue date wins, then largest premium
        ordered = group.sort_values(
            by=['policy issue date', 'total premium payable'],
            ascending=[False, False],
        )
        return ordered.iloc[0]

    # At least one base record: largest premium among the base records wins
    ordered = base_candidates.sort_values(by='total premium payable', ascending=False)
    return ordered.iloc[0]

def assign_trim_group(group):
    """Reduce a group of rows to the single row that should be kept.

    A group holding one row yields that row unchanged; a larger group is
    resolved by prioritize_trim_group.
    """
    if len(group) <= 1:
        return group.iloc[0]
    return prioritize_trim_group(group)

# Keep exactly one record per (chassis, engine, policy start, policy end) group.
#
# This is a vectorized form of `groupby(...).apply(assign_trim_group)`: rows are
# ranked once for the whole frame and the top-ranked row of every group is kept.
# It avoids calling a Python function once per group, and avoids GroupBy.apply
# operating on the grouping columns (recent pandas versions warn about that).
#
# Ranking within a group, best first:
#   1. base records ('2022_base' / '2023_base' / '2024_base') before all others;
#   2. among non-base records, the latest 'policy issue date';
#   3. the highest 'total premium payable'.
# Missing values rank last. Exact ties on every ranking column are resolved
# arbitrarily, as they were with the per-group sort.
group_cols = ['Cleaned Chassis Number', 'Cleaned Engine Number', 'policy start date', 'policy end date']
is_base_row = df_valid['data'].isin(['2022_base', '2023_base', '2024_base'])

df_final = (
    df_valid
    # groupby() drops rows with a missing key by default, so rows whose dates
    # failed to parse (NaT) were excluded before; keep that behaviour explicit.
    # NOTE(review): confirm that excluding these rows is intended.
    .dropna(subset=group_cols)
    .assign(
        _is_base=is_base_row,
        # Issue date only ranks non-base rows; blank it for base rows so that
        # they are ordered by premium alone.
        _issue_rank=df_valid['policy issue date'].where(~is_base_row),
    )
    .sort_values(
        by=['_is_base', '_issue_rank', 'total premium payable'],
        ascending=False,
        na_position='last',
    )
    .drop_duplicates(subset=group_cols, keep='first')
    .drop(columns=['_is_base', '_issue_rank'])
    # Same row order as the grouped result: sorted by the group keys
    .sort_values(by=group_cols)
    .reset_index(drop=True)
)

print(len(df_final))

  .apply(assign_trim_group)


2086538


In [3]:
# ---------------------------
# Export: write the de-duplicated frame back to the database
# ---------------------------
# An existing table with this name is replaced; the DataFrame index is not written.
df_final.to_sql(
    name='dupclean_cleanchassisengine_basiccleaned_appended_base_and_pr',
    con=engine,
    if_exists='replace',
    index=False,
)

178

In [4]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from sqlalchemy import create_engine

# ---------------------------
# Step 0: Database Connection
# ---------------------------
# Connection settings come from the standard libpq environment variables so that
# no secret is stored in the notebook. Non-secret settings fall back to the local
# defaults; the password is prompted for when PGPASSWORD is not set.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_config = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'postgres'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
    'port': os.environ.get('PGPORT', '5432')
}
# User and password are URL-escaped: characters such as '@', ':' or '/' would
# otherwise be read as URL delimiters and corrupt the connection string.
connection_string = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
)
engine = create_engine(connection_string)

# ---------------------------
# Step 1: Load Data
# ---------------------------
# Pull the full source table into memory as a DataFrame.
query = "SELECT * FROM cleanchassisengine_samechassisno_differregno;"
df = pd.read_sql(query, con=engine)

In [5]:
import pandas as pd

def is_valid_value(val):
    """Return True when `val` holds a usable value.

    A value is rejected when `pd.isna` reports it as missing, when its string
    form is empty after stripping surrounding whitespace, or when that string
    equals 'blank' (compared case-insensitively).
    """
    if pd.isna(val):
        return False
    normalized = str(val).strip().lower()
    return normalized not in ("", "blank")

# Flag rows whose chassis number AND engine number both pass is_valid_value
chassis_ok = df['Cleaned Chassis Number'].apply(is_valid_value)
engine_ok = df['Cleaned Engine Number'].apply(is_valid_value)
mask_valid = chassis_ok & engine_ok

# Keep only the flagged rows, as an independent copy of the loaded frame
df_valid = df.loc[mask_valid].copy()

# Parse both policy date columns; values that cannot be parsed become NaT
for date_col in ('policy start date', 'policy end date'):
    df_valid[date_col] = pd.to_datetime(df_valid[date_col], errors='coerce')

# ---------------------------
# Step 5: Handle Duplicates based on grouping columns
# ---------------------------
def prioritize_trim_group(group):
    """Pick the single record to keep from a group of duplicate rows.

    Rows whose 'data' value is '2022_base', '2023_base' or '2024_base' take
    precedence: if any exist, the one with the highest 'total premium payable'
    is returned. Otherwise the row with the latest 'policy issue date' is
    returned, with 'total premium payable' (highest first) as the tie-breaker.

    Returns the chosen row as a Series.
    """
    is_base = group['data'].isin(['2022_base', '2023_base', '2024_base'])
    base_candidates = group.loc[is_base]

    if base_candidates.empty:
        # No base record: newest issue date wins, then largest premium
        ordered = group.sort_values(
            by=['policy issue date', 'total premium payable'],
            ascending=[False, False],
        )
        return ordered.iloc[0]

    # At least one base record: largest premium among the base records wins
    ordered = base_candidates.sort_values(by='total premium payable', ascending=False)
    return ordered.iloc[0]

def assign_trim_group(group):
    """Reduce a group of rows to the single row that should be kept.

    A group holding one row yields that row unchanged; a larger group is
    resolved by prioritize_trim_group.
    """
    if len(group) <= 1:
        return group.iloc[0]
    return prioritize_trim_group(group)

# Keep exactly one record per (chassis, engine, policy start, policy end) group.
#
# This is a vectorized form of `groupby(...).apply(assign_trim_group)`: rows are
# ranked once for the whole frame and the top-ranked row of every group is kept.
# It avoids calling a Python function once per group, and avoids GroupBy.apply
# operating on the grouping columns (recent pandas versions warn about that).
#
# Ranking within a group, best first:
#   1. base records ('2022_base' / '2023_base' / '2024_base') before all others;
#   2. among non-base records, the latest 'policy issue date';
#   3. the highest 'total premium payable'.
# Missing values rank last. Exact ties on every ranking column are resolved
# arbitrarily, as they were with the per-group sort.
group_cols = ['Cleaned Chassis Number', 'Cleaned Engine Number', 'policy start date', 'policy end date']
is_base_row = df_valid['data'].isin(['2022_base', '2023_base', '2024_base'])

df_final = (
    df_valid
    # groupby() drops rows with a missing key by default, so rows whose dates
    # failed to parse (NaT) were excluded before; keep that behaviour explicit.
    # NOTE(review): confirm that excluding these rows is intended.
    .dropna(subset=group_cols)
    .assign(
        _is_base=is_base_row,
        # Issue date only ranks non-base rows; blank it for base rows so that
        # they are ordered by premium alone.
        _issue_rank=df_valid['policy issue date'].where(~is_base_row),
    )
    .sort_values(
        by=['_is_base', '_issue_rank', 'total premium payable'],
        ascending=False,
        na_position='last',
    )
    .drop_duplicates(subset=group_cols, keep='first')
    .drop(columns=['_is_base', '_issue_rank'])
    # Same row order as the grouped result: sorted by the group keys
    .sort_values(by=group_cols)
    .reset_index(drop=True)
)

print(len(df_final))

  .apply(assign_trim_group)


220420


In [6]:
# ---------------------------
# Export: write the de-duplicated frame back to the database
# ---------------------------
# An existing table with this name is replaced; the DataFrame index is not written.
df_final.to_sql(
    name='dupclean_cleanchassisengine_samechassisno_differregno',
    con=engine,
    if_exists='replace',
    index=False,
)

260