In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter

df = pd.read_excel(r'C:\Users\ashwi\GUVI_Projects\Job\Tensaw\Ass\AR_performance_review_synthetic.xlsx')

df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
df['denial_reason'] = df['denial_reason'].fillna("Not Denied")
df['denied'] = (df['denial_reason'] != "Not Denied").astype(int)

label_encoders = {}
categorical_cols = ['cpt_code', 'insurance_company', 'physician_name']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df['payment_amount'] = pd.to_numeric(df['payment_amount'], errors='coerce').fillna(0)
df['balance'] = pd.to_numeric(df['balance'], errors='coerce').fillna(0)

features = ['cpt_code', 'insurance_company', 'physician_name', 'payment_amount', 'balance']
X = df[features]
y_binary = df['denied']
y_multiclass = df[df['denied'] == 1]['denial_reason'] 
X_multiclass = df[df['denied'] == 1][features]

X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, y_binary, test_size=0.2, random_state=42)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_multiclass, y_multiclass, test_size=0.2, random_state=42)
clf_bin = RandomForestClassifier(random_state=42, class_weight="balanced")
clf_bin.fit(X_train_bin, y_train_bin)
y_pred_bin = clf_bin.predict(X_test_bin)

print("=== Denied or Not (Binary) ===")
print("Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print(classification_report(y_test_bin, y_pred_bin))

le_reason = LabelEncoder()
y_train_multi_enc = le_reason.fit_transform(y_train_multi)
y_test_multi_enc = le_reason.transform(y_test_multi)

clf_multi = RandomForestClassifier(random_state=42, class_weight="balanced")
clf_multi.fit(X_train_multi, y_train_multi_enc)
y_pred_multi = clf_multi.predict(X_test_multi)

print("\n=== Denial Reason (Multiclass) ===")
print("Accuracy:", accuracy_score(y_test_multi_enc, y_pred_multi))
print(classification_report(y_test_multi_enc, y_pred_multi, target_names=le_reason.classes_))


=== Denied or Not (Binary) ===
Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.93      0.94      0.94        69
           1       0.87      0.84      0.85        31

    accuracy                           0.91       100
   macro avg       0.90      0.89      0.89       100
weighted avg       0.91      0.91      0.91       100


=== Denial Reason (Multiclass) ===
Accuracy: 0.8
                                  precision    recall  f1-score   support

        16 - Missing information       0.00      0.00      0.00         6
45 - Charge exceeds fee schedule       0.58      1.00      0.74         7
        96 - Non-covered service       0.94      1.00      0.97        17

                        accuracy                           0.80        30
                       macro avg       0.51      0.67      0.57        30
                    weighted avg       0.67      0.80      0.72        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
