In [None]:
# Placeholder cell: define `df` (a pandas DataFrame that includes a 'fraud_flag' column) before running the training cell.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

# Baseline fraud classifier: one-hot encoded categoricals + logistic regression.
# `df` must already exist in the kernel before this cell runs, e.g. loaded
# from Delta/CSV, or: df = preview.copy()

assert 'fraud_flag' in df.columns, "df must contain a 'fraud_flag' target column"

# Target as 0/1 integers.
y = df['fraud_flag'].astype(int)
# Features: everything except the target and the columns this model does not
# use as inputs (fraud_type, the *_id columns, and narrative).
X = df.drop(columns=['fraud_flag','fraud_type','claim_id','member_id','provider_id','narrative'])
# Select numeric columns by kind, not by exact width. Matching only
# int64/float64 would push any other numeric dtype (int32, float32, ...) into
# cat_cols, where it would be one-hot encoded instead of used as a number.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# One-hot encode the non-numeric columns; the remaining (numeric) columns are
# passed through unchanged. handle_unknown='ignore' lets categories that only
# appear in the test split be encoded as all zeros rather than raising.
pre = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)], remainder='passthrough')
# class_weight='balanced' reweights classes inversely to their frequency.
clf = LogisticRegression(max_iter=200, class_weight='balanced')
pipe = Pipeline([('pre', pre), ('clf', clf)])

# Stratified 80/20 split with a fixed seed so the reported metrics are reproducible.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipe.fit(Xtr, ytr)
# Column 1 of predict_proba is the probability of the positive class (fraud_flag == 1).
probs = pipe.predict_proba(Xte)[:,1]
print('ROC-AUC', roc_auc_score(yte, probs))
print('PR-AUC', average_precision_score(yte, probs))


In [None]:
# Optional (Databricks): when a Spark session is available, replace the local
# df with the contents of the Delta table; otherwise leave df untouched.
try:
    spark  # bare lookup: raises NameError when no `spark` name is defined
except NameError:
    spark = None

if spark is None:
    # No Spark session: nothing is loaded here.
    print('Spark not available; using existing df variable.')
else:
    # Read the whole table through Spark, then convert it to a pandas DataFrame.
    delta_df = spark.sql("SELECT * FROM aethergen.healthcare_synth_v1")
    import pandas as pd
    df = delta_df.toPandas()
    print('Loaded from Delta:', len(df))
