In [None]:
import pandas as pd
import re

# Load the raw claims workbook (an Excel file, not a CSV).
# NOTE(review): no `dtype=` is passed, so every column gets an inferred dtype;
# if large numeric identifiers must stay strings, they have to be requested
# explicitly here — confirm which columns need it.
df = pd.read_excel('Claim.xlsx')

# Function to clean names
def clean_name(name):
    """Normalize an insured name into a compact, case-insensitive matching key.

    Every character other than an ASCII letter or digit is removed and the
    remainder is lower-cased, e.g. '.  S JIMMY LAWRENCE .' -> 'sjimmylawrence'.
    Non-ASCII letters are removed as well, because the pattern only keeps
    a-z, A-Z and 0-9.

    Args:
        name: Raw cell value. Non-string values are converted with ``str``.

    Returns:
        str: The cleaned key, or '' when ``name`` is missing (None / NaN / NaT).
    """
    # A missing cell would otherwise be stringified and returned as the
    # fake name 'nan' (or 'none'), indistinguishable from a real insured name.
    if name is None or (pd.api.types.is_scalar(name) and pd.isna(name)):
        return ''
    return re.sub(r'[^a-zA-Z0-9]', '', str(name)).lower()

# Add the normalized name next to the raw 'Insured Name' column (raw values are kept)
df['CLEANED_INSUREDNAME'] = df['Insured Name'].map(clean_name)

# Persist the frame, now carrying both the raw and the cleaned name, as CSV
df.to_csv(path_or_buf='cleaned_claim_with_old_and_new.csv', index=False)

# Show the leading rows of the raw/cleaned pair as a quick sanity check
print(df.loc[:, ['Insured Name', 'CLEANED_INSUREDNAME']].head())

                        Insured Name           CLEANED_INSUREDNAME
0   .  ASHUTOSH PARSHOTTAMBHAI PATEL   ashutoshparshottambhaipatel
1  .  MADIVALAPPAGOUDA S POLICEPATIL  madivalappagoudaspolicepatil
2              .  S JIMMY LAWRENCE .                sjimmylawrence
3                 .  SYED NIZAMUDDIN                syednizamuddin
4                 .  SYED NIZAMUDDIN                syednizamuddin


In [None]:
import pandas as pd
# Reload the cleaned claims from an Excel workbook.
# NOTE(review): the first cell saves 'cleaned_claim_with_old_and_new.csv', whereas
# this reads an .xlsx with the same stem — confirm the workbook exists.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so `df`
# may still be the frame built by the previous cell — verify before relying on it.
df = pd.read_excel('cleaned_claim_with_old_and_new.xlsx')

KeyboardInterrupt: 

In [None]:
# Step 1: Ensure the columns used for grouping have consistent data types.
# After the cast, missing keys are the literal string 'nan'.
group_cols = ['CLEANED_INSUREDNAME', 'Policy No (Str)', 'Interest Level', 'Product', 'Policy Start Date', 'Policy End Date',
              'Make', 'Model', 'Variant', 'Vehicle Registration No.', 'Engine NO', 'Chassis NO', 'Manf. Year']

df[group_cols] = df[group_cols].astype(str)

# Step 2: Count the number of claim rows in each group (broadcast onto every row)
df['Number of claims'] = df.groupby(group_cols)['Policy No (Str)'].transform('count')

# Step 3: Per-group counts for PAID, WITHDRAWN, CLOSURE OF CLAIM, REPUDIATION, CLOSURE OF CLAIMS
status_cols = ['PAID', 'WITHDRAWN', 'CLOSURE OF CLAIM', 'REPUDIATION', 'CLOSURE OF CLAIMS']

# Count the occurrences of each status before any rows are dropped.
# reindex() guarantees exactly one column per entry of status_cols, even for a
# status that never occurs in the data (the final column selection would
# otherwise raise KeyError). Statuses outside status_cols are not kept; the
# final selection never included them.
df_status = (
    df.groupby(group_cols)['Updated Status']
    .value_counts()
    .unstack()
    .reindex(columns=status_cols)
    .fillna(0)
    .astype(int)
)

# Step 4: Convert 'Settle date' to datetime; unparseable values become NaT
df['Settle date'] = pd.to_datetime(df['Settle date'], errors='coerce')

# Step 5: Keep the row with the latest 'Settle date' in each group.
# NaT sorts last by default, which would make an undated row win over every
# dated one; na_position='first' lets an undated row win only when the group
# has no dated row at all. mergesort is stable, so ties keep their file order.
df_latest = (
    df.sort_values(by='Settle date', na_position='first', kind='mergesort')
    .drop_duplicates(subset=group_cols, keep='last')
)

# Step 6: Attach the status counts to the latest row of each group
df_final = pd.merge(df_latest, df_status, on=group_cols, how='left')

# A group whose statuses are all missing has no row in df_status; report 0
# for it and keep the count columns integer-typed.
df_final[status_cols] = df_final[status_cols].fillna(0).astype(int)

# Step 7: Select ALL original columns + the calculated columns.
# 'Number of claims' is already part of df.columns (added in Step 2), so it is
# filtered out of the base list to avoid emitting the column twice.
all_cols = [c for c in df.columns if c != 'Number of claims'] + status_cols + ['Number of claims']
df_final = df_final[all_cols]


df_final.to_csv('cleaned_claim(final!).csv', index=False)


In [1]:
import pandas as pd
import numpy as np

# Load the dataset
# NOTE(review): the first cell saves 'cleaned_claim_with_old_and_new.csv', whereas
# this reads an .xlsx with the same stem — confirm the workbook exists (e.g. it
# was converted by hand) or read the CSV instead.
df = pd.read_excel('cleaned_claim_with_old_and_new.xlsx')

In [2]:
# Step 1: Ensure the columns used for grouping have consistent data types.
# After the cast, missing keys are the literal string 'nan'.
group_cols = ['CLEANED_INSUREDNAME', 'Policy No (Str)', 'Policy Start Date', 'Policy End Date',
              'Make', 'Model']

df[group_cols] = df[group_cols].astype(str)

# Step 2: Count the number of claim rows in each group (broadcast onto every row)
df['Number of claims'] = df.groupby(group_cols)['Policy No (Str)'].transform('count')

# Step 3: Per-group counts for PAID, WITHDRAWN, CLOSURE OF CLAIM, REPUDIATION, CLOSURE OF CLAIMS
status_cols = ['PAID', 'WITHDRAWN', 'CLOSURE OF CLAIM', 'REPUDIATION', 'CLOSURE OF CLAIMS']

# Count the occurrences of each status before any rows are dropped.
# reindex() guarantees exactly one column per entry of status_cols, even for a
# status that never occurs in the data (selecting status_cols later would
# otherwise raise KeyError). Statuses outside status_cols are not kept.
df_status = (
    df.groupby(group_cols)['Updated Status']
    .value_counts()
    .unstack()
    .reindex(columns=status_cols)
    .fillna(0)
    .astype(int)
)

# Step 4: Convert 'Settle date' to datetime; unparseable values become NaT
df['Settle date'] = pd.to_datetime(df['Settle date'], errors='coerce')

# Step 5: Keep the row with the latest 'Settle date' in each group.
# NaT sorts last by default, which would make an undated row win over every
# dated one; na_position='first' lets an undated row win only when the group
# has no dated row at all. mergesort is stable, so ties keep their file order.
df_latest = (
    df.sort_values(by='Settle date', na_position='first', kind='mergesort')
    .drop_duplicates(subset=group_cols, keep='last')
)

# Step 6: Handling duplicates based on 'Settle date'
def resolve_duplicates(group):
    """Pick one representative row from the rows of a single group.

    Args:
        group: DataFrame holding the candidate rows of one group. It is not
            modified.

    Returns:
        pandas.Series: the only row for a single-row group; otherwise the row
        with the fewest null cells (the first such row on ties). The returned
        row has exactly the columns of ``group`` in both cases.

    Raises:
        IndexError: if ``group`` has no rows.
    """
    if len(group) <= 1:
        return group.iloc[0]
    # Count nulls per row without adding a helper column to the caller's frame,
    # and select by position so repeated index labels cannot yield several rows.
    null_counts = group.isnull().sum(axis=1)
    return group.iloc[null_counts.to_numpy().argmin()]

# Apply the duplicate resolution per group.
# NOTE(review): df_latest was already reduced to one row per group_cols
# combination by drop_duplicates(subset=group_cols), so each group passed to
# resolve_duplicates here holds a single row and is returned unchanged. To
# break ties by null count, the candidates must be taken before the dedup.
df_latest_cleaned = df_latest.groupby(group_cols).apply(resolve_duplicates).reset_index(drop=True)

# Step 7: Merge the status counts back into the filtered DataFrame to retain all columns
df_final = pd.merge(df_latest_cleaned, df_status, on=group_cols, how='left')

# A status that never occurs has no column in the counts, and a group whose
# statuses are all missing has no row in them: report 0 in both cases and keep
# the count columns integer-typed.
missing_status = [c for c in status_cols if c not in df_final.columns]
df_final = df_final.assign(**{c: 0 for c in missing_status})
df_final[status_cols] = df_final[status_cols].fillna(0).astype(int)

# Step 8: Select ALL original columns + the calculated columns.
# 'Number of claims' is already part of df.columns, so it is filtered out of
# the base list to avoid emitting the column twice.
all_cols = [c for c in df.columns if c != 'Number of claims'] + status_cols + ['Number of claims']
df_final = df_final[all_cols]

# Step 9: Replace NaN values with empty strings ('') before saving
df_final = df_final.replace(np.nan, '')

# Save the final DataFrame to a CSV file with empty cells for null values
df_final.to_csv('cleaned_claim_final!!!!.csv', index=False, na_rep='')