In [1]:
import pandas as pd

# Step 1: Load data from Excel file
df = pd.read_excel("PR_2022.xlsx")

# Step 1a: Keep only the rows where 'NOP' is equal to 1.
# The explicit .copy() makes the filtered subset an independent DataFrame, so
# the column assignments in Step 2 modify it unambiguously instead of going
# through a filtered selection (the pattern behind SettingWithCopyWarning).
df = df[df['NOP'] == 1].copy()

# Step 2: Convert date columns to datetime.
# errors='coerce' turns values that cannot be parsed into NaT instead of
# raising, so later steps must tolerate missing dates.
date_columns = ['Policy Start Date', 'Policy End Date', 'Policy Issue date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 3: Define the function to handle duplicates
def prioritize_duplicates(group):
    """Pick the single row that represents a group of duplicate policy rows.

    Selection order:
      1. Restrict to the rows carrying the latest 'Policy Issue date'. When
         every issue date in the group is missing (NaT), all rows stay
         candidates instead of none.
      2. Among those, keep rows whose 'Net Premium' is non-negative (zero
         counts; missing premiums do not) and return the one with the
         highest premium. Ties go to the first such row.
      3. If no row has a non-negative premium, return the first of the
         candidate rows.

    Args:
        group: Non-empty DataFrame with 'Policy Issue date' and
            'Net Premium' columns. Index labels are expected to be unique,
            because the winning row is looked up by label.

    Returns:
        pandas.Series: The chosen row.
    """
    issue_dates = group['Policy Issue date']
    latest_issue_date = issue_dates.max()

    if pd.isna(latest_issue_date):
        # max() is NaT only when no row has a valid issue date. Comparing
        # against NaT matches nothing, which would leave an empty candidate
        # set and make the fallback below fail with IndexError.
        latest_rows = group
    else:
        latest_rows = group[issue_dates == latest_issue_date]

    # NaN premiums compare False against 0, so they are excluded here too.
    non_negative_rows = latest_rows[latest_rows['Net Premium'] >= 0]

    if not non_negative_rows.empty:
        best_label = non_negative_rows['Net Premium'].idxmax()
        return non_negative_rows.loc[best_label]

    # Every candidate premium is negative or missing: keep the first row.
    return latest_rows.iloc[0]

# Step 4: Handle duplicates
# Rows that share policy number, start date and end date are treated as
# duplicates; prioritize_duplicates reduces each such group to a single row.
# NOTE(review): groupby skips rows whose key columns are missing (NaN/NaT) by
# default, so those rows do not reach the output. Confirm that is intended,
# or pass dropna=False.
duplicate_keys = ['Policy Number', 'Policy Start Date', 'Policy End Date']
df_cleaned = (
    df.groupby(duplicate_keys, group_keys=False)
    .apply(prioritize_duplicates)
    .reset_index(drop=True)
)

# Step 5: Save the cleaned dataset to CSV
output_path = "cleaned_2022_PR.csv"
df_cleaned.to_csv(output_path, index=False)

# Build the message from the same path that was written, so the reported
# file name cannot drift from the actual one (it previously said 2024).
print(f"Cleaned dataset saved to '{output_path}'")

  .apply(prioritize_duplicates)


Cleaned dataset saved to 'cleaned_2024_PR.csv'
