In [89]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime

# Display settings
pd.set_option("display.max_columns", None)

# File path (update if needed)
file_path = "Dataset\Healthcare Insurance Claims Management Dataset.xlsx"

# Load workbook
xls = pd.ExcelFile(file_path)

# Load all sheets into a dictionary
dfs = {sheet: pd.read_excel(file_path, sheet_name=sheet) for sheet in xls.sheet_names}

dfs.keys()


dict_keys(['Claims_Fact_raw', 'Provider_Master_raw', 'Member_Master_raw', 'Policy_Master_raw', 'Procedure_Master', 'Diagnosis_Master_ICD'])

In [90]:

for name, df in dfs.items():
    print(f"\n===== {name} =====")
    print("Rows:", len(df))
    print("Columns:", df.columns.tolist())



===== Claims_Fact_raw =====
Rows: 12000
Columns: ['ClaimID', 'MemberID', 'PolicyID', 'ProviderID', 'ClaimType', 'ClaimSubmissionDate', 'AdmissionDate', 'DischargeDate', 'ClaimedAmount', 'ApprovedAmount', 'ICDCode(s)', 'ProcedureCode(s)', 'PreAuthNumber', 'PreAuthStatus', 'QueryRaiseDate', 'SettlementDate', 'TPANotes', 'Currency', 'NetworkStatus']

===== Provider_Master_raw =====
Rows: 350
Columns: ['ProviderID', 'ProviderName', 'Location', 'City', 'State', 'NetworkType', 'PackageRates']

===== Member_Master_raw =====
Rows: 9000
Columns: ['MemberID', 'AgeBand', 'Gender', 'City', 'State', 'PolicyID']

===== Policy_Master_raw =====
Rows: 3000
Columns: ['PolicyID', 'PolicyType', 'CoverageLimit', 'StartDate', 'EndDate', 'PremiumAmount']

===== Procedure_Master =====
Rows: 120
Columns: ['ProcedureCode', 'ProcedureCategory', 'StandardRate', 'LOSStandard']

===== Diagnosis_Master_ICD =====
Rows: 60
Columns: ['ICDCode', 'DiagnosisCategory', 'ValidProcedureCategories']


In [91]:
claims = dfs['Claims_Fact_raw']
providers = dfs['Provider_Master_raw']
members = dfs['Member_Master_raw']
policies = dfs['Policy_Master_raw']
procedures = dfs['Procedure_Master']
diagnosis = dfs['Diagnosis_Master_ICD']

In [92]:
audit_log = []

def log_issue(table, column, issue_type, rule, count, sample=None):
    audit_log.append({
        "table": table,
        "column": column,
        "issue_type": issue_type,
        "rule": rule,
        "rows_affected": count,
        "sample_values": sample
    })


In [93]:
# ----------------------------------------------
# 5. Primary Key Duplicate Checks
# ----------------------------------------------

# Claims_Fact: ClaimID
dups = claims[claims.duplicated(subset=['ClaimID'], keep=False)]
log_issue("Claims_Fact", "ClaimID", "Duplicate Key", "ClaimID must be unique", 
          len(dups), dups['ClaimID'].head().tolist())

# Provider
dups = providers[providers.duplicated(subset=['ProviderID'], keep=False)]
log_issue("Provider_Master", "ProviderID", "Duplicate Key", "ProviderID must be unique", 
          len(dups),dups['ProviderID'].head().tolist())

# Member
dups = members[members.duplicated(subset=['MemberID'], keep=False)]
log_issue("Member_Master", "MemberID", "Duplicate Key", "MemberID must be unique", 
          len(dups),dups['MemberID'].head().tolist())

# Policy
dups = policies[policies.duplicated(subset=['PolicyID'], keep=False)]
log_issue("Policy_Master", "PolicyID", "Duplicate Key", "PolicyID must be unique", 
          len(dups),dups['PolicyID'].head().tolist())


In [94]:
# ----------------------------------------------
# 6. Normalize Dates
# ----------------------------------------------
date_cols = [
    'ClaimSubmissionDate','AdmissionDate','DischargeDate','QueryRaiseDate','SettlementDate'
]

for col in date_cols:
    if col in claims.columns:
        claims[col] = pd.to_datetime(claims[col], errors='coerce')


# Date Logic Audit

In [95]:
invalid_stay = claims[claims['DischargeDate'] < claims['AdmissionDate']]
log_issue("Claims_Fact", "DischargeDate", "Invalid Date Order",
          "DischargeDate < AdmissionDate", len(invalid_stay),
          invalid_stay[['ClaimID','AdmissionDate','DischargeDate']].head().to_dict())


In [96]:
invalid_cycle = claims[claims['SettlementDate'] < claims['ClaimSubmissionDate']]
log_issue("Claims_Fact", "SettlementDate", "Invalid Date Order",
          "SettlementDate < ClaimSubmissionDate", len(invalid_cycle),invalid_cycle[['ClaimID','SettlementDate','ClaimSubmissionDate']].head().to_dict())


In [97]:
missing_discharge = claims[claims['DischargeDate'].isna()]
log_issue("Claims_Fact", "DischargeDate", "Missing", "DischargeDate is NULL",
          len(missing_discharge),missing_discharge['ClaimID'].head().to_dict())

missing_settlement = claims[claims['SettlementDate'].isna()]
log_issue("Claims_Fact", "SettlementDate", "Missing", "SettlementDate is NULL",
          len(missing_settlement),missing_settlement['ClaimID'].head().to_dict())


In [98]:
invalid_amounts = claims[claims['ApprovedAmount'] > claims['ClaimedAmount']]
log_issue("Claims_Fact", "ApprovedAmount", "Invalid Amount",
          "ApprovedAmount > ClaimedAmount", len(invalid_amounts),invalid_amounts[['ClaimID','ApprovedAmount','ClaimedAmount']].head().to_dict())


In [99]:
neg_amount = claims[claims['ClaimedAmount'] < 0]
log_issue("Claims_Fact", "ClaimedAmount", "Negative Amount",
          "ClaimedAmount < 0", len(neg_amount),neg_amount['ClaimedAmount'].head().to_dict())


# Outlier Detection (Z-Score)

In [100]:
claims.columns

Index(['ClaimID', 'MemberID', 'PolicyID', 'ProviderID', 'ClaimType',
       'ClaimSubmissionDate', 'AdmissionDate', 'DischargeDate',
       'ClaimedAmount', 'ApprovedAmount', 'ICDCode(s)', 'ProcedureCode(s)',
       'PreAuthNumber', 'PreAuthStatus', 'QueryRaiseDate', 'SettlementDate',
       'TPANotes', 'Currency', 'NetworkStatus'],
      dtype='object')

In [101]:
claims['Procedure_list'] = claims['ProcedureCode(s)'].astype(str).str.split('|')
claims_proc_exploded = claims.explode('Procedure_list')

In [102]:
claims_proc_exploded

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,PRC0014
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0090
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0025
2,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0102
2,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11998,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,PRC0061
11999,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0038
11999,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0097
11999,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0022


In [103]:
#Merge procedure categories first
claims_proc_exploded = claims_proc_exploded.merge(
    procedures[['ProcedureCode','ProcedureCategory','StandardRate']],
    left_on='Procedure_list', right_on='ProcedureCode',
    how='left'
)

In [104]:
claims_proc_exploded

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,ProcedureCode,ProcedureCategory,StandardRate
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,PRC0014,PRC0014,Pharmacy,9580.69
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0090,PRC0090,Diagnostics,1530.24
2,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0025,PRC0025,Consultation,1754.04
3,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0102,PRC0102,RoomCharges,16410.67
4,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0017,PRC0017,RoomCharges,12867.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23969,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,PRC0061,PRC0061,Diagnostics,4792.17
23970,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0038,PRC0038,Diagnostics,4122.71
23971,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0097,PRC0097,Pharmacy,1124.39
23972,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0022,PRC0022,RoomCharges,12392.34


In [105]:
claims['z_score_claim'] = claims_proc_exploded.groupby('ProcedureCategory')['ClaimedAmount']\
                                .transform(lambda x: (x - x.mean())/x.std(ddof=0))

outliers = claims[claims['z_score_claim'].abs() >= 3]

log_issue("Claims_Fact", "ClaimedAmount", "Outlier",
          "Z-score >= 3", len(outliers))


In [106]:
claims

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,z_score_claim
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,[PRC0014],-0.375332
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,"[PRC0090, PRC0025]",-0.561791
2,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,"[PRC0102, PRC0017, PRC0074]",-0.523397
3,CLM000004,MBR002753,POL002521,PRV000004,Emergency,2023-06-05,2023-06-03,2023-06-06,65845.72,55665.14,K35V|I21,PRC0087|PRC0096|PRC0038,PA-928278,Approved,NaT,2023-07-05,Deductibles applied.,INR,In-Network,"[PRC0087, PRC0096, PRC0038]",-0.227467
4,CLM000005,MBR008783,POL002459,PRV000261,DayCare,2023-09-06,2023-09-06,2023-09-09,15926.97,15119.08,G43E|C50G,PRC0120|PRC0022,PA-648481,Approved,NaT,2023-09-20,Further documents required.,INR,In-Network,"[PRC0120, PRC0022]",-0.227467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CLM011996,MBR000235,POL000815,PRV000182,Hospitalization,2023-07-30,2023-07-30,2023-08-02,28330.43,23121.90,B20B|J18V|G43J,PRC0036|PRC0091|PRC0120,PA-564018,Approved,2023-08-01,2023-08-13,Awaiting itemized bill.,INR,In-Network,"[PRC0036, PRC0091, PRC0120]",0.307537
11996,CLM011997,MBR006547,POL001856,PRV000256,DayCare,2025-10-05,2025-10-05,2025-10-08,145781.14,123431.88,J18V,PRC0030|PRC0106|PRC0029|PRC0061,PA-177925,Approved,NaT,2025-10-19,Non-payable items excluded.,EUR,In-Network,"[PRC0030, PRC0106, PRC0029, PRC0061]",1.143214
11997,CLM011998,MBR002947,POL000356,PRV000288,Hospitalization,2023-08-10,2023-08-10,2023-08-10,1559.05,1346.83,G43E|A09F,PRC0090,PA-342928,Approved,2023-08-15,2023-08-17,Room rent exceeded eligibility.,INR,In-Network,[PRC0090],0.883388
11998,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,[PRC0061],0.742444


# validate ICD Codes

In [107]:
claims_proc_exploded

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,ProcedureCode,ProcedureCategory,StandardRate
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,PRC0014,PRC0014,Pharmacy,9580.69
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0090,PRC0090,Diagnostics,1530.24
2,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0025,PRC0025,Consultation,1754.04
3,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0102,PRC0102,RoomCharges,16410.67
4,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0017,PRC0017,RoomCharges,12867.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23969,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,PRC0061,PRC0061,Diagnostics,4792.17
23970,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0038,PRC0038,Diagnostics,4122.71
23971,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0097,PRC0097,Pharmacy,1124.39
23972,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0022,PRC0022,RoomCharges,12392.34


In [108]:
# ----------------------------------------------
# Validate ICD Codes (multi-code fields)
# ----------------------------------------------

# Step 1: Split ICDCode(s) into list
claims['ICD_list'] = claims['ICDCode(s)'].astype(str).str.split('|')

# Step 2: Explode list into separate rows
claims_exploded = claims.explode('ICD_list')

# Step 3: Validate each ICD code
invalid_icd_rows = claims_exploded[
    ~claims_exploded['ICD_list'].isin(diagnosis['ICDCode'])
]

# Step 4: Group by ClaimID to list invalid codes
invalid_by_claim = invalid_icd_rows.groupby('ClaimID')['ICD_list'].apply(list)

# Step 5: Log issue
log_issue(
    "Claims_Fact",
    "ICDCode(s)",
    "Invalid ICD Code",
    "One or more ICDs do not exist in Diagnosis_Master",
    len(invalid_by_claim),
    invalid_by_claim.head().to_dict()
)

invalid_by_claim.head()

Series([], Name: ICD_list, dtype: object)

# validate Procedure Codes

In [109]:
# ----------------------------------------------------
# Validate Procedure Codes (supports multi-code fields)
# ----------------------------------------------------

# Step 1: Split ProcedureCode(s) into list
# claims['Procedure_list'] = claims['ProcedureCode(s)'].astype(str).str.split('|')

# # Step 2: Explode list into separate rows
# claims_proc_exploded = claims.explode('Procedure_list')

# Step 3: Validate each procedure code against master
invalid_proc_rows = claims_proc_exploded[
    ~claims_proc_exploded['Procedure_list'].isin(procedures['ProcedureCode'])
]

# Step 4: Combine invalid codes per ClaimID
invalid_proc_by_claim = invalid_proc_rows.groupby('ClaimID')['Procedure_list'].apply(list)

#Step 5: Log issue
log_issue(
    "Claims_Fact",
    "ProcedureCode(s)",
    "Invalid Procedure Code",
    "One or more Procedure Codes do not exist in Procedure_Master",
    len(invalid_proc_by_claim),
    invalid_proc_by_claim.head().to_dict()
)

invalid_proc_by_claim.head()

Series([], Name: Procedure_list, dtype: object)

In [110]:
policy_check = claims.merge(
    policies[['PolicyID','StartDate','EndDate']],
    on='PolicyID',
    how='left'
)

invalid_policy = policy_check[
    ((policy_check['ClaimSubmissionDate'] < policy_check['StartDate']) |
      (policy_check['ClaimSubmissionDate'] > policy_check['EndDate']))
]

log_issue("Claims_Fact", "PolicyID", "Invalid Coverage",
          "Admission date outside policy period", len(invalid_policy),invalid_policy[['ClaimSubmissionDate','StartDate','EndDate']].head().to_dict())


In [111]:
len(invalid_policy)

9435

In [112]:
policy_check = claims.merge(
    policies[['PolicyID','StartDate','EndDate']],
    on='PolicyID',
    how='left'
)

valid_policy = policy_check[
    ((policy_check['ClaimSubmissionDate'] >= policy_check['StartDate']) &
      (policy_check['ClaimSubmissionDate'] <= policy_check['EndDate']))
]

log_issue("Claims_Fact", "PolicyID", "valid Coverage",
          "Admission date within policy period", len(valid_policy),valid_policy[['ClaimSubmissionDate','StartDate','EndDate']].head().to_dict())

In [113]:
len(valid_policy)

2565

# Package Rate Cross-Checks

In [114]:
claims_pkg = claims.merge(
    providers[['ProviderID', 'NetworkType', 'PackageRates']],
    on='ProviderID',
    how='left'
)


In [115]:
#claims['Procedure_list'] = claims['ProcedureCode(s)'].astype(str).str.split('|')
#claims_proc_exploded = claims.explode('Procedure_list')
# claims_proc_exploded = claims_proc_exploded.merge(
#     procedures[['ProcedureCode', 'StandardRate']],
#     left_on='Procedure_list',
#     right_on='ProcedureCode',
#     how='left'
# )
claims_proc_exploded

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,ProcedureCode,ProcedureCategory,StandardRate
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,PRC0014,PRC0014,Pharmacy,9580.69
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0090,PRC0090,Diagnostics,1530.24
2,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,PRC0025,PRC0025,Consultation,1754.04
3,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0102,PRC0102,RoomCharges,16410.67
4,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,PRC0017,PRC0017,RoomCharges,12867.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23969,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,PRC0061,PRC0061,Diagnostics,4792.17
23970,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0038,PRC0038,Diagnostics,4122.71
23971,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0097,PRC0097,Pharmacy,1124.39
23972,CLM012000,MBR005075,POL000130,PRV000247,Maternity,2023-10-11,2023-10-11,2023-10-14,34593.01,29326.80,C50,PRC0038|PRC0097|PRC0022|PRC0036,PA-337705,Approved,NaT,2023-11-10,Pre-auth approved as per policy.,INR,In-Network,PRC0022,PRC0022,RoomCharges,12392.34


In [116]:
standard_sum = claims_proc_exploded.groupby('ClaimID')['StandardRate'].sum().reset_index()
standard_sum = standard_sum.rename(columns={"StandardRate": "TotalStandardRate"})

claims_std = claims.merge(standard_sum, on='ClaimID', how='left')

claims_std['min_allowed'] = claims_std['TotalStandardRate'] * 0.9
claims_std['max_allowed'] = claims_std['TotalStandardRate'] * 1.1


package_mismatch = claims_std[
    (claims_std['TotalStandardRate'].notna()) &
    (
        (claims_std['ClaimedAmount'] < claims_std['min_allowed']) |
        (claims_std['ClaimedAmount'] > claims_std['max_allowed'])
    )
]


log_issue(
    "Claims_Fact",
    "ClaimedAmount",
    "StandardRate Package Mismatch",
    "ClaimedAmount not within ±10% of total StandardRate for procedures",
    len(package_mismatch),
    package_mismatch[['ClaimID', 'ClaimedAmount', 'TotalStandardRate', 'min_allowed', 'max_allowed']].head().to_dict()
)


In [117]:
claims.columns

Index(['ClaimID', 'MemberID', 'PolicyID', 'ProviderID', 'ClaimType',
       'ClaimSubmissionDate', 'AdmissionDate', 'DischargeDate',
       'ClaimedAmount', 'ApprovedAmount', 'ICDCode(s)', 'ProcedureCode(s)',
       'PreAuthNumber', 'PreAuthStatus', 'QueryRaiseDate', 'SettlementDate',
       'TPANotes', 'Currency', 'NetworkStatus', 'Procedure_list',
       'z_score_claim', 'ICD_list'],
      dtype='object')

In [118]:
claims["SettlementDate"].isnull().sum()

np.int64(1234)

# Out of network variance vs in-network

In [119]:
claims_std

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,z_score_claim,ICD_list,TotalStandardRate,min_allowed,max_allowed
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,[PRC0014],-0.375332,"[B20B, J18V]",9580.69,8622.621,10538.759
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,"[PRC0090, PRC0025]",-0.561791,[A09T],3284.28,2955.852,3612.708
2,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,"[PRC0102, PRC0017, PRC0074]",-0.523397,[C50],29723.35,26751.015,32695.685
3,CLM000004,MBR002753,POL002521,PRV000004,Emergency,2023-06-05,2023-06-03,2023-06-06,65845.72,55665.14,K35V|I21,PRC0087|PRC0096|PRC0038,PA-928278,Approved,NaT,2023-07-05,Deductibles applied.,INR,In-Network,"[PRC0087, PRC0096, PRC0038]",-0.227467,"[K35V, I21]",55913.66,50322.294,61505.026
4,CLM000005,MBR008783,POL002459,PRV000261,DayCare,2023-09-06,2023-09-06,2023-09-09,15926.97,15119.08,G43E|C50G,PRC0120|PRC0022,PA-648481,Approved,NaT,2023-09-20,Further documents required.,INR,In-Network,"[PRC0120, PRC0022]",-0.227467,"[G43E, C50G]",15498.65,13948.785,17048.515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CLM011996,MBR000235,POL000815,PRV000182,Hospitalization,2023-07-30,2023-07-30,2023-08-02,28330.43,23121.90,B20B|J18V|G43J,PRC0036|PRC0091|PRC0120,PA-564018,Approved,2023-08-01,2023-08-13,Awaiting itemized bill.,INR,In-Network,"[PRC0036, PRC0091, PRC0120]",0.307537,"[B20B, J18V, G43J]",24646.00,22181.400,27110.600
11996,CLM011997,MBR006547,POL001856,PRV000256,DayCare,2025-10-05,2025-10-05,2025-10-08,145781.14,123431.88,J18V,PRC0030|PRC0106|PRC0029|PRC0061,PA-177925,Approved,NaT,2025-10-19,Non-payable items excluded.,EUR,In-Network,"[PRC0030, PRC0106, PRC0029, PRC0061]",1.143214,[J18V],139541.42,125587.278,153495.562
11997,CLM011998,MBR002947,POL000356,PRV000288,Hospitalization,2023-08-10,2023-08-10,2023-08-10,1559.05,1346.83,G43E|A09F,PRC0090,PA-342928,Approved,2023-08-15,2023-08-17,Room rent exceeded eligibility.,INR,In-Network,[PRC0090],0.883388,"[G43E, A09F]",1530.24,1377.216,1683.264
11998,CLM011999,MBR008814,POL001365,PRV000048,Hospitalization,2025-09-27,2025-09-26,2025-09-26,5021.27,4222.51,M17K,PRC0061,PA-148798,Approved,NaT,2025-10-27,Investigation initiated for high-cost claim.,USD,In-Network,[PRC0061],0.742444,[M17K],4792.17,4312.953,5271.387


In [120]:
network_summary = claims_std.groupby('NetworkStatus')['ClaimedAmount'].mean()
in_net_avg = network_summary.get('In-Network', None)

In [121]:
network_summary

NetworkStatus
In-Network        54089.031499
Out-of-Network    54756.415731
Name: ClaimedAmount, dtype: float64

In [122]:
in_net_avg

np.float64(54089.03149895982)

In [123]:
if in_net_avg is not None:
    claims_std['out_net_variance_flag'] = (
        (claims_std['NetworkStatus'] == 'Out-of-Network') &
        (claims_std['ClaimedAmount'] > in_net_avg * 1.20) # 54089.031 * 1.20 = 64906.8372
    )
else:
    claims_std['out_net_variance_flag'] = False

In [124]:
# Boolean Filtering df[condition]
out_net_variance_claims = claims_std[claims_std['out_net_variance_flag']]

In [125]:
log_issue(
    "Claims_Fact",
    "ClaimedAmount",
    "Out-of-Network Variance",
    "Out-of-Network ClaimedAmount > 120% of In-Network average",
    len(out_net_variance_claims),
    out_net_variance_claims[['ClaimID','ClaimedAmount','NetworkStatus']].head().to_dict()
)


In [126]:
claims_std.head(23)

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,z_score_claim,ICD_list,TotalStandardRate,min_allowed,max_allowed,out_net_variance_flag
0,CLM000001,MBR000418,POL001439,PRV000064,OPD,2023-08-07,2023-08-02,2023-08-02,11032.91,9379.54,B20B|J18V,PRC0014,,Not Required,NaT,NaT,Deductibles applied.,INR,In-Network,[PRC0014],-0.375332,"[B20B, J18V]",9580.69,8622.621,10538.759,False
1,CLM000002,MBR001547,POL000009,PRV000281,Hospitalization,2024-12-12,2024-12-02,2024-12-02,3585.74,2825.01,A09T,PRC0090|PRC0025,,Not Required,NaT,2024-12-26,Room rent exceeded eligibility.,INR,Out-of-Network,"[PRC0090, PRC0025]",-0.561791,[A09T],3284.28,2955.852,3612.708,False
2,CLM000003,MBR002778,POL002455,PRV000083,Hospitalization,2025-06-21,2025-06-20,2025-06-23,35010.21,29763.68,C50,PRC0102|PRC0017|PRC0074,PA-654823,Approved,NaT,NaT,Claim under medical review.,INR,In-Network,"[PRC0102, PRC0017, PRC0074]",-0.523397,[C50],29723.35,26751.015,32695.685,False
3,CLM000004,MBR002753,POL002521,PRV000004,Emergency,2023-06-05,2023-06-03,2023-06-06,65845.72,55665.14,K35V|I21,PRC0087|PRC0096|PRC0038,PA-928278,Approved,NaT,2023-07-05,Deductibles applied.,INR,In-Network,"[PRC0087, PRC0096, PRC0038]",-0.227467,"[K35V, I21]",55913.66,50322.294,61505.026,False
4,CLM000005,MBR008783,POL002459,PRV000261,DayCare,2023-09-06,2023-09-06,2023-09-09,15926.97,15119.08,G43E|C50G,PRC0120|PRC0022,PA-648481,Approved,NaT,2023-09-20,Further documents required.,INR,In-Network,"[PRC0120, PRC0022]",-0.227467,"[G43E, C50G]",15498.65,13948.785,17048.515,False
5,CLM000006,MBR007793,POL000644,PRV000070,Hospitalization,2023-10-13,2023-10-11,2023-10-11,1093.58,838.92,G40L,PRC0092,,Not Required,NaT,2023-10-20,Investigation initiated for high-cost claim.,INR,Out-of-Network,[PRC0092],-0.056357,[G40L],1174.07,1056.663,1291.477,False
6,CLM000007,MBR006831,POL000979,PRV000164,Hospitalization,2025-02-21,2025-02-20,2025-02-23,13163.49,12040.09,C50G,PRC0018|PRC0084,PA-458318,Approved,NaT,2025-02-28,Deductibles applied.,INR,In-Network,"[PRC0018, PRC0084]",0.21308,[C50G],13669.59,12302.631,15036.549,False
7,CLM000008,MBR004416,POL001707,PRV000003,Emergency,2023-05-09,2023-05-08,2023-05-15,196014.6,165146.42,M17F,PRC0058|PRC0068,PA-706205,Approved,2023-05-16,2023-05-16,Non-payable items excluded.,INR,In-Network,"[PRC0058, PRC0068]",-0.094273,[M17F],209815.63,188834.067,230797.193,False
8,CLM000009,MBR006056,POL001388,PRV000170,Hospitalization,2025-04-16,2025-04-16,2025-04-23,270121.44,228712.38,A09F|S83,PRC0077|PRC0108|PRC0055,PA-183662,Pending,NaT,2025-04-23,Awaiting itemized bill.,INR,In-Network,"[PRC0077, PRC0108, PRC0055]",0.296084,"[A09F, S83]",257871.21,232084.089,283658.331,False
9,CLM000010,MBR005463,POL002410,PRV000114,Hospitalization,2023-02-17,2023-02-05,2023-02-12,43846.84,33698.34,M17|G40,PRC0081|PRC0071,,Not Required,2023-02-22,2023-03-03,Awaiting itemized bill.,INR,Out-of-Network,"[PRC0081, PRC0071]",-0.391742,"[M17, G40]",40866.4,36779.76,44953.04,False


In [127]:
out_net_variance_claims

Unnamed: 0,ClaimID,MemberID,PolicyID,ProviderID,ClaimType,ClaimSubmissionDate,AdmissionDate,DischargeDate,ClaimedAmount,ApprovedAmount,ICDCode(s),ProcedureCode(s),PreAuthNumber,PreAuthStatus,QueryRaiseDate,SettlementDate,TPANotes,Currency,NetworkStatus,Procedure_list,z_score_claim,ICD_list,TotalStandardRate,min_allowed,max_allowed,out_net_variance_flag
22,CLM000023,MBR003460,POL001707,PRV000262,Maternity,2023-12-06,2023-11-26,2023-12-01,326133.83,249738.95,I21,PRC0077|PRC0015|PRC0105,,Not Required,NaT,2023-12-13,Room rent exceeded eligibility.,INR,Out-of-Network,"[PRC0077, PRC0015, PRC0105]",1.812018,[I21],346696.87,312027.183,381366.557,True
30,CLM000031,MBR002158,POL000092,PRV000199,Hospitalization,2024-08-15,2024-08-06,2024-08-13,93574.65,66319.07,G40|M17,PRC0035|PRC0066|PRC0119,,Not Required,2024-08-25,2024-08-22,Claim under medical review.,INR,Out-of-Network,"[PRC0035, PRC0066, PRC0119]",-0.521375,"[G40, M17]",80790.92,72711.828,88870.012,True
51,CLM000052,MBR002262,POL000957,PRV000231,Hospitalization,2024-05-30,2024-05-25,2024-05-28,94358.34,70251.34,N20S,PRC0093|PRC0084|PRC0001,,Not Required,2024-06-04,2024-06-29,Non-payable items excluded.,INR,Out-of-Network,"[PRC0093, PRC0084, PRC0001]",-1.034079,[N20S],82091.43,73882.287,90300.573,True
70,CLM000071,MBR004619,POL001003,PRV000128,Hospitalization,2024-03-18,2024-03-06,2024-03-13,190028.60,141624.20,M17F,PRC0048|PRC0117|PRC0059,,Not Required,NaT,2024-05-02,Requested discharge summary.,INR,Out-of-Network,"[PRC0048, PRC0117, PRC0059]",-0.534319,[M17F],201459.51,181313.559,221605.461,True
77,CLM000078,MBR007645,POL000742,PRV000140,DayCare,2024-08-22,2024-08-15,2024-08-20,232882.10,164371.32,M17K|G43J,PRC0003,,Not Required,NaT,2024-08-29,Requested discharge summary.,EUR,Out-of-Network,[PRC0003],2.276689,"[M17K, G43J]",226211.94,203590.746,248833.134,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11824,CLM011825,MBR004051,POL000811,PRV000241,DayCare,2024-04-27,2024-04-19,2024-04-22,90821.71,71432.93,J18T|N39,PRC0064|PRC0021|PRC0087,,Not Required,2024-05-04,2024-05-04,Investigation initiated for high-cost claim.,INR,Out-of-Network,"[PRC0064, PRC0021, PRC0087]",-0.652065,"[J18T, N39]",89657.90,80692.110,98623.690,True
11956,CLM011957,MBR002362,POL002127,PRV000198,Hospitalization,2023-06-22,2023-06-17,2023-06-20,114915.66,78499.35,A09T,PRC0026|PRC0110|PRC0062|PRC0090,,Not Required,NaT,2023-06-29,Claim under medical review.,EUR,Out-of-Network,"[PRC0026, PRC0110, PRC0062, PRC0090]",3.349622,[A09T],101961.76,91765.584,112157.936,True
11972,CLM011973,MBR004767,POL000277,PRV000183,DayCare,2023-05-15,2023-05-06,2023-05-13,294731.11,227651.65,M17F,PRC0051|PRC0019,,Not Required,NaT,2023-06-05,Room rent exceeded eligibility.,INR,Out-of-Network,"[PRC0051, PRC0019]",-0.628555,[M17F],252399.40,227159.460,277639.340,True
11973,CLM011974,MBR007924,POL000782,PRV000036,Hospitalization,2023-07-31,2023-07-23,2023-07-26,90387.11,64524.53,I21Y|G43|G40M,PRC0065,,Not Required,NaT,2023-08-30,Non-payable items excluded.,INR,Out-of-Network,[PRC0065],0.008421,"[I21Y, G43, G40M]",86541.89,77887.701,95196.079,True


In [128]:
# Normalize PreAuthStatus to avoid case mismatches
claims['PreAuthStatus'] = claims['PreAuthStatus'].astype(str).str.upper().str.strip()

# Define rejection keywords
rejection_keywords = ["REJECT", "DENY", "NOT APPROVED", "DECLINED"]

# Flag based on ApprovedAmount
flag_amt = (claims['ApprovedAmount'].fillna(0) == 0).astype(int)

# Flag based on PreAuthStatus text
flag_pre_auth = claims['PreAuthStatus'].apply(
    lambda x: 1 if any(keyword in x for keyword in rejection_keywords) else 0
)

# (Optional) TPANotes rejection check
claims['TPANotes'] = claims['TPANotes'].astype(str).str.upper().str.strip()
flag_notes = claims['TPANotes'].apply(
    lambda x: 1 if any(keyword in x for keyword in rejection_keywords) else 0
)

# Final Rejected Flag
claims['Rejected_flag'] = (
    flag_amt | flag_pre_auth | flag_notes
).astype(int)

sample_vals = claims.loc[claims['Rejected_flag'] == 1, 'ClaimID'].head(5).tolist()

# Log issue count for audit
log_issue("Claims_Fact", "Rejected_flag", "Rejection Flag",
          "ApprovedAmount=0 or PreAuthStatus/TPANotes indicates rejection",
          claims['Rejected_flag'].sum(),sample_vals)


In [129]:
audit_df = pd.DataFrame(audit_log)
audit_df.to_csv("audit_log.csv", index=False)
audit_df.head(20)

Unnamed: 0,table,column,issue_type,rule,rows_affected,sample_values
0,Claims_Fact,ClaimID,Duplicate Key,ClaimID must be unique,0,[]
1,Provider_Master,ProviderID,Duplicate Key,ProviderID must be unique,0,[]
2,Member_Master,MemberID,Duplicate Key,MemberID must be unique,0,[]
3,Policy_Master,PolicyID,Duplicate Key,PolicyID must be unique,0,[]
4,Claims_Fact,DischargeDate,Invalid Date Order,DischargeDate < AdmissionDate,0,"{'ClaimID': {}, 'AdmissionDate': {}, 'Discharg..."
5,Claims_Fact,SettlementDate,Invalid Date Order,SettlementDate < ClaimSubmissionDate,0,"{'ClaimID': {}, 'SettlementDate': {}, 'ClaimSu..."
6,Claims_Fact,DischargeDate,Missing,DischargeDate is NULL,0,{}
7,Claims_Fact,SettlementDate,Missing,SettlementDate is NULL,1234,"{0: 'CLM000001', 2: 'CLM000003', 19: 'CLM00002..."
8,Claims_Fact,ApprovedAmount,Invalid Amount,ApprovedAmount > ClaimedAmount,0,"{'ClaimID': {}, 'ApprovedAmount': {}, 'Claimed..."
9,Claims_Fact,ClaimedAmount,Negative Amount,ClaimedAmount < 0,0,{}


In [130]:
claims_std.to_csv("Claims_Fact_clean.csv", index=False)
# providers.to_csv("Provider_Master_clean.csv", index=False)
# members.to_csv("Member_Master_clean.csv", index=False)
# policies.to_csv("Policy_Master_clean.csv", index=False)