In [130]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Load all tables
# Forward slashes keep the path valid on every OS and avoid the invalid "\H"
# escape sequence that the backslash-separated literal contained.
path = "Dataset/Healthcare Insurance Claims Management Dataset.xlsx"

# Open the workbook once and parse every sheet from the same handle instead of
# re-opening and re-parsing the .xlsx file for each table.
with pd.ExcelFile(path) as workbook:
    claims = pd.read_excel(workbook, sheet_name='Claims_Fact_raw', parse_dates=[
        'ClaimSubmissionDate','AdmissionDate','DischargeDate','QueryRaiseDate','SettlementDate'
    ])
    procedures = pd.read_excel(workbook, sheet_name='Procedure_Master')
    policies = pd.read_excel(workbook, sheet_name='Policy_Master_raw')
    providers = pd.read_excel(workbook, sheet_name='Provider_Master_raw')
    diagnosis = pd.read_excel(workbook, sheet_name='Diagnosis_Master_ICD')

# Normalize numeric: values that cannot be parsed as numbers become NaN.
claims['ClaimedAmount'] = pd.to_numeric(claims['ClaimedAmount'], errors='coerce')
claims['ApprovedAmount'] = pd.to_numeric(claims['ApprovedAmount'], errors='coerce')

## Compute LOS (Length of Stay)

In [131]:
# Length of stay in whole days: discharge date minus admission date.
stay_duration = claims['DischargeDate'].sub(claims['AdmissionDate'])
claims['LOS_days'] = stay_duration.dt.days

## Claim Cycle Time

In [132]:
# Whole days elapsed between claim submission and settlement.
settlement_lag = claims['SettlementDate'].sub(claims['ClaimSubmissionDate'])
claims['ClaimCycleTime_days'] = settlement_lag.dt.days

## Amount Variance & Variance %

In [133]:
# Amount claimed minus amount approved.
claims['AmountVariance'] = claims['ClaimedAmount'].sub(claims['ApprovedAmount'])

In [134]:
# NOTE: despite the column name, this is the approved amount expressed as a
# percentage of the claimed amount (ApprovedAmount / ClaimedAmount * 100).
# Rows whose claimed amount is missing or zero get NaN.
claimed_amount = claims['ClaimedAmount']
has_usable_claimed = claimed_amount.notna() & (claimed_amount != 0)
approved_share_pct = claims['ApprovedAmount'].div(claimed_amount).mul(100)
claims['VariancePercent'] = np.where(has_usable_claimed, approved_share_pct, np.nan)

## Compute TotalStandardRate for Multi-Procedure Claims

In [135]:
# Turn the pipe-delimited ProcedureCode(s) text into a list of codes per claim.
procedure_code_text = claims['ProcedureCode(s)'].astype(str)
claims['Procedure_list'] = procedure_code_text.str.split('|')

In [136]:
# One row per (claim, procedure code) pair.
claims_proc_exp = claims.explode(column='Procedure_list')

In [137]:
# Attach the procedure master attributes to every exploded procedure row;
# the left join keeps rows whose code has no match in Procedure_Master.
procedure_lookup = procedures[['ProcedureCode', 'StandardRate', 'LOSStandard', 'ProcedureCategory']]
claims_proc_exp = pd.merge(
    claims_proc_exp,
    procedure_lookup,
    how='left',
    left_on='Procedure_list',
    right_on='ProcedureCode',
)

In [138]:
# Sum StandardRates per ClaimID.
# min_count=1 keeps the total as NaN when none of a claim's procedure codes
# matched Procedure_Master. A plain sum() returns 0 for such all-NaN groups,
# and a TotalStandardRate of 0 makes any positive ClaimedAmount exceed
# 1.10 * TotalStandardRate in the high-bill check.
std_sum = (
    claims_proc_exp.groupby('ClaimID')['StandardRate']
    .sum(min_count=1)
    .reset_index()
    .rename(columns={'StandardRate': 'TotalStandardRate'})
)

In [139]:
# Bring the per-claim TotalStandardRate back onto the claim-level table.
claims = pd.merge(claims, std_sum, how='left', on='ClaimID')

## Excess LOS Flag (LOS vs Standard LOS)

In [140]:
# Per-claim standard length of stay across all of the claim's procedures:
# the mean (AvgLOSStandard) and the total (TotLOSStandard).
los_standard_by_claim = claims_proc_exp.groupby('ClaimID')['LOSStandard']

los_std = (
    los_standard_by_claim.mean()
    .reset_index()
    .rename(columns={'LOSStandard': 'AvgLOSStandard'})
)
# min_count=1 leaves the total as NaN when no procedure of the claim has a
# LOSStandard, matching what mean() does; a plain sum() would report 0.
los_std_2 = (
    los_standard_by_claim.sum(min_count=1)
    .reset_index()
    .rename(columns={'LOSStandard': 'TotLOSStandard'})
)

claims = claims.merge(los_std, on='ClaimID', how='left')
claims = claims.merge(los_std_2, on='ClaimID', how='left')

In [141]:
# Flag a claim (1) when its length of stay is more than 150% of the average
# standard LOS of its procedures; rows missing either value get 0.
excess_los_threshold = 1.5 * claims['AvgLOSStandard']
has_los_inputs = claims['LOS_days'].notna() & claims['AvgLOSStandard'].notna()
stay_exceeds_threshold = claims['LOS_days'] > excess_los_threshold
claims['ExcessLOS_flag'] = np.where(has_los_inputs & stay_exceeds_threshold, 1, 0)

In [142]:
# Number of claims flagged for excess length of stay.
int((claims["ExcessLOS_flag"] == 1).sum())

3705

# High Bill Flag

In [143]:
# Flag a claim (1) when the claimed amount is more than 110% of the summed
# standard rates of its procedures; comparisons involving NaN yield 0.
high_bill_threshold = claims['TotalStandardRate'] * 1.10
claims['HighBill_flag'] = np.where(claims['ClaimedAmount'] > high_bill_threshold, 1, 0)

In [144]:
# Number of claims flagged as high bill.
int((claims["HighBill_flag"] == 1).sum())

4049

# Repeat Claims (member admitted again within 30 days of a previous admission)

In [145]:
# Order each member's claims chronologically by admission date so that
# shift(1) within a member yields the immediately preceding admission.
claims = claims.sort_values(by=['MemberID', 'AdmissionDate'])
claims['PrevClaimDate'] = claims.groupby('MemberID')['AdmissionDate'].shift(1)

# Flag (1) a claim whose admission falls within 30 days of the member's
# previous admission; a member's first claim has no previous date and gets 0.
days_since_previous = (claims['AdmissionDate'] - claims['PrevClaimDate']).dt.days
is_repeat_claim = claims['PrevClaimDate'].notna() & (days_since_previous <= 30)
claims['RepeatClaim_flag'] = np.where(is_repeat_claim, 1, 0)

In [146]:
# Number of claims flagged as repeat claims.
int((claims["RepeatClaim_flag"] == 1).sum())

452

# Coding Consistency Score

In [147]:
# Turn the pipe-delimited ICDCode(s) text into a list of ICD codes per claim.
icd_code_text = claims['ICDCode(s)'].astype(str)
claims['ICD_list'] = icd_code_text.str.split('|')

In [148]:
# One row per (claim, procedure code) pair, built from the enriched claims table.
claims_proc = claims.explode(column='Procedure_list')

In [149]:
# Expand further to one row per (claim, procedure code, ICD code) combination.
claims_proc_icd = claims_proc.explode(column='ICD_list')

In [150]:
# Merge Procedure_Master: attach category, rate and standard LOS for each procedure code.
procedure_attributes = procedures[['ProcedureCode', 'ProcedureCategory', 'StandardRate', 'LOSStandard']]
claims_proc_icd = pd.merge(
    claims_proc_icd,
    procedure_attributes,
    how='left',
    left_on='Procedure_list',
    right_on='ProcedureCode',
)

# Merge Diagnosis_Master: attach the diagnosis category and the procedure
# categories listed as valid for each ICD code.
diagnosis_attributes = diagnosis[['ICDCode', 'DiagnosisCategory', 'ValidProcedureCategories']]
claims_proc_icd = pd.merge(
    claims_proc_icd,
    diagnosis_attributes,
    how='left',
    left_on='ICD_list',
    right_on='ICDCode',
)



In [151]:
# Split the comma-separated ValidProcedureCategories text into a list per row.
# Each entry is stripped so that values written as "A, B" yield 'B' rather
# than ' B'; an untrimmed entry would never equal the ProcedureCategory it is
# compared with and would be counted as an ICD/procedure mismatch.
# astype(str) keeps every cell a list (missing values become ['nan']).
claims_proc_icd['ValidProcCat_list'] = (
    claims_proc_icd['ValidProcedureCategories']
    .astype(str)
    .str.split(',')
    .apply(lambda categories: [category.strip() for category in categories])
)

In [152]:
# Debug export (disabled): uncomment to write the exploded
# claim / procedure / ICD rows to CSV for manual inspection.
# claims_proc_icd.to_csv("claims_proc_icd.csv", index=False)

In [153]:
# For every (claim, procedure) pair, combine the valid-category lists of all
# of the claim's ICD codes; summing list-valued cells concatenates them.
rows_by_claim_procedure = claims_proc_icd.groupby(['ClaimID', 'Procedure_list'])
combined_valid = rows_by_claim_procedure['ValidProcCat_list'].sum().reset_index()



In [154]:
# Collapse each combined list to its distinct categories (order is not preserved).
def _distinct_categories(categories):
    """Return the unique entries of ``categories`` as a list."""
    return list(set(categories))

combined_valid['ValidProcCat_list'] = combined_valid['ValidProcCat_list'].apply(_distinct_categories)

In [155]:
# Attach the combined valid-category list to each (claim, procedure) row.
claims_proc_final = pd.merge(
    claims_proc,
    combined_valid,
    how='left',
    on=['ClaimID', 'Procedure_list'],
)

In [156]:
# Look up the category of each procedure code so it can be compared with the
# valid-category list.
procedure_categories = procedures[['ProcedureCode','ProcedureCategory']]
claims_proc_final = pd.merge(
    claims_proc_final,
    procedure_categories,
    how='left',
    left_on='Procedure_list',
    right_on='ProcedureCode',
)

In [157]:
# Debug export (disabled): uncomment to write the per-procedure rows with
# their valid-category lists to CSV for manual inspection.
# claims_proc_final.to_csv("claims_proc_final.csv", index=False)

In [158]:
def check_icd_proc(row):
    """Return 0 if the row's procedure category is in its valid-category list, else 1.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'ProcedureCategory' and 'ValidProcCat_list'.

    Returns
    -------
    int
        0 when ``row['ProcedureCategory']`` is contained in
        ``row['ValidProcCat_list']``; 1 when it is not, or when the list is
        not a container (e.g. NaN/None because nothing was merged for the row).

    Raises
    ------
    KeyError
        If either expected field is missing from ``row``. This previously
        was swallowed by a bare ``except`` and silently scored as a mismatch.
    """
    try:
        return 0 if row['ProcedureCategory'] in row['ValidProcCat_list'] else 1
    except TypeError:
        # ValidProcCat_list is not iterable (NaN/None): treat as a mismatch.
        return 1

# Score every (claim, procedure) row: 1 = category not valid for the claim's ICD codes.
mismatch_per_row = claims_proc_final.apply(check_icd_proc, axis=1)
claims_proc_final['ICDProcMismatch'] = mismatch_per_row


In [159]:
# Debug export (disabled): uncomment to write the exploded
# claim / procedure / ICD rows to CSV for manual inspection.
# claims_proc_icd.to_csv("claims_proc_icd.csv", index=False)

In [160]:
# Count mismatching procedures per claim, then score: start at 100, subtract
# 30 per mismatch, and floor the result at 0.
coding_score = claims_proc_final.groupby('ClaimID', as_index=False)['ICDProcMismatch'].sum()
mismatch_penalty = coding_score['ICDProcMismatch'] * 30
coding_score['CodingConsistencyScore'] = (100 - mismatch_penalty).clip(lower=0)

In [161]:
# Add the per-claim CodingConsistencyScore to the claim-level table.
coding_score_columns = coding_score[['ClaimID','CodingConsistencyScore']]
claims = pd.merge(claims, coding_score_columns, how='left', on='ClaimID')

In [162]:
# Weighted sum of the risk indicators, capped at 100:
#   excess LOS 30, high bill 40, repeat claim 25,
#   plus 10 when VariancePercent is below 60 (NaN compares False -> 0).
low_approval_points = np.where(claims['VariancePercent'] < 60, 10, 0)
uncapped_fraud_score = (
    30 * claims['ExcessLOS_flag']
    + 40 * claims['HighBill_flag']
    + 25 * claims['RepeatClaim_flag']
    + low_approval_points
)
claims['FraudRiskScore'] = uncapped_fraud_score.clip(upper=100)



In [163]:
# Per-provider averages: mean claim cycle time and the share of claims
# flagged for excess LOS and for high bill.
metric_columns = ['ClaimCycleTime_days', 'ExcessLOS_flag', 'HighBill_flag']
provider_metrics = claims.groupby('ProviderID')[metric_columns].mean().reset_index()

# Efficiency starts at 100 and loses 0.2 per average cycle day, up to 20 for
# the excess-LOS share and up to 30 for the high-bill share; floored at 0.
cycle_time_penalty = provider_metrics['ClaimCycleTime_days'] * 0.2
excess_los_penalty = provider_metrics['ExcessLOS_flag'] * 20
high_bill_penalty = provider_metrics['HighBill_flag'] * 30
efficiency = 100 - cycle_time_penalty - excess_los_penalty - high_bill_penalty

provider_metrics['EfficiencyScore'] = efficiency.clip(lower=0)

In [164]:
# Copy each provider's EfficiencyScore onto all of that provider's claims.
provider_efficiency = provider_metrics[['ProviderID','EfficiencyScore']]
claims = pd.merge(claims, provider_efficiency, how='left', on='ProviderID')

# Missing Documents

In [165]:
# Normalize TPANotes (upper-cased, surrounding whitespace removed).
raw_tpa_notes = claims['TPANotes']
normalized_tpa_notes = raw_tpa_notes.astype(str).str.upper().str.strip()

# Missing documents flag: 1 when the note contains the literal phrase, else 0.
claims['MissingDocuments_flag'] = (
    normalized_tpa_notes
    .str.contains("FURTHER DOCUMENTS REQUIRED", regex=False)
    .astype(int)
)

# Store the normalized text, but keep missing notes missing: astype(str)
# alone would turn NaN into the literal text "NAN" in the exported column.
claims['TPANotes'] = normalized_tpa_notes.where(raw_tpa_notes.notna())

In [166]:
# Final FraudRiskScore including the missing-documents penalty (+10).
# The score is rebuilt from its component flags instead of being incremented
# in place: `+=` made this cell non-idempotent, so re-running it added the
# penalty again each time. For a single run the result is identical, because
# capping at 100 before and after adding a non-negative term equals capping
# once at the end.
# NOTE: the weights repeat those of the earlier FraudRiskScore cell; keep
# both in sync.
claims['FraudRiskScore'] = (
    claims['ExcessLOS_flag'] * 30 +
    claims['HighBill_flag'] * 40 +
    claims['RepeatClaim_flag'] * 25 +
    claims['VariancePercent'].apply(lambda x: 10 if x < 60 else 0) +
    claims['MissingDocuments_flag'] * 10
).clip(upper=100)

In [167]:
# Persist the enriched claim-level table (without the index) and confirm.
enriched_output_file = "Claims_Fact_enriched_FULL_FEATURES.csv"
claims.to_csv(enriched_output_file, index=False)
print("Feature engineering completed. File saved.")

Feature engineering completed. File saved.
