In [None]:
import pandas as pd
import numpy as np
import warnings
# Blanket filter: every warning category is silenced from here on.
warnings.filterwarnings('ignore')

# Read the raw claims table. The three identifier columns are forced to
# str so pandas does not infer a numeric dtype for them.
print("🚀 Loading training_data.csv...")
df = pd.read_csv(
    '/content/training_data.csv',
    dtype={'ClaimID': str, 'BeneID': str, 'Provider': str},
)

print(f"Loaded {len(df)} rows.")

# Convert whichever of the known date columns exist in the file to
# datetime64. Values that do not match YYYY-MM-DD become NaT rather than
# raising, because of errors='coerce'.
date_columns = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt', 'DOB']
present_date_columns = [name for name in date_columns if name in df.columns]
for col in present_date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='coerce')

# ---------------------------------------------------------------------------
# Claim-level feature engineering. Every statement adds or rewrites columns
# on `df` in place; rows are never added or removed here.
# ---------------------------------------------------------------------------

# Number of rows sharing this row's BeneID, broadcast back onto each row.
df['Total_Claims_Per_Bene'] = df.groupby('BeneID')['BeneID'].transform('count')

# Whole days between admission and discharge. Rows where either date is
# missing get 0, and negative spans (discharge before admission) are
# clamped to 0.
df['TimeInHptal'] = (df['DischargeDt'] - df['AdmissionDt']).dt.days.fillna(0).clip(lower=0)

# Number of rows sharing this row's Provider, broadcast back onto each row.
df['Provider_Claim_Frequency'] = df.groupby('Provider')['Provider'].transform('count')

chronic_cols = [
    'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
    'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke'
]
# Recode the chronic-condition flags: value 2 becomes 0, missing becomes 0,
# then cast to int.
# NOTE(review): presumably the raw encoding is 1 = yes / 2 = no -- confirm
# against the data dictionary.
df[chronic_cols] = df[chronic_cols].replace(2, 0).fillna(0).astype(int)

# Alias of the already-recoded stroke flag under the name used in the
# feature list.
df['ChronicCond_stroke_Yes'] = df['ChronicCond_stroke']

# Sum of the two annual deductible columns. NaN in either input propagates
# to the result.
df['DeductibleAmtPaid'] = df['IPAnnualDeductibleAmt'] + df['OPAnnualDeductibleAmt']

# 1 when the row has no (parseable) admission date, else 0.
# NOTE(review): the name suggests this marks outpatient claims -- confirm.
df['OPD_Flag_Yes'] = (df['AdmissionDt'].isna()).astype(int)

# How many of the ten diagnosis-code columns are populated on each row.
diag_cols = [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]
df['Diagnosis_Count'] = df[diag_cols].notna().sum(axis=1)

# Number of chronic-condition flags set (flags are 0/1 after the recode above).
df['ChronicDisease_Count'] = df[chronic_cols].sum(axis=1)

# Age in whole years at claim start (days / 365.25, truncated toward zero).
# A plain .astype(int) raises as soon as one row has a missing or
# unparseable ClaimStartDt/DOB, because NaN cannot be cast to a NumPy
# integer. Truncating first and casting to the nullable 'Int64' dtype keeps
# those rows as <NA> so they can be dropped later instead of crashing here.
age_years = (df['ClaimStartDt'] - df['DOB']).dt.days / 365.25
df['Age'] = np.trunc(age_years).astype('Int64')

# Clamp implausible values (DOB after claim start, or extreme ages).
df['Age'] = df['Age'].clip(lower=0, upper=120)

# Columns written to the output file, in output order.
feature_columns = [
    'Total_Claims_Per_Bene', 'TimeInHptal', 'Provider_Claim_Frequency',
    'ChronicCond_stroke_Yes', 'DeductibleAmtPaid',
    'NoOfMonths_PartBCov', 'NoOfMonths_PartACov',
    'OPD_Flag_Yes', 'Diagnosis_Count', 'ChronicDisease_Count', 'Age',
]

# Keep the features plus the PotentialFraud column, discard every row that
# has a missing value in any of them, and renumber the index from 0.
clean_df = (
    df[feature_columns + ['PotentialFraud']]
    .dropna()
    .reset_index(drop=True)
)

# Write the result without the index column.
output_file = 'fina_claimmodel.csv'
clean_df.to_csv(output_file, index=False)


🚀 Loading training_data.csv...
Loaded 558211 rows.
🔧 Deriving features...

✅ Success! Extracted and saved 558211 rows.
📁 Output saved to '1claim_level_features_with_label1.csv'

📋 First 5 rows of the final dataset:
   Total_Claims_Per_Bene  TimeInHptal  Provider_Claim_Frequency  \
0                      3          4.0                        25   
1                      6          2.0                        25   
2                      1          5.0                        25   
3                     11         14.0                        25   
4                      5          0.0                        25   

   ChronicCond_stroke_Yes  DeductibleAmtPaid  NoOfMonths_PartBCov  \
0                       1               1208                   12   
1                       0               3036                   12   
2                       0               1068                   12   
3                       0               4342                   12   
4                       1            