In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [4]:
# Load the provider feature table from the CSV in the working directory
# and preview its first five rows as the cell output.
DATA_FILE = "provider_features_final.csv"
df = pd.read_csv(DATA_FILE)
df.head(n=5)


Unnamed: 0,ProviderID,INP_LOS_mean,INP_LOS_max,INP_LOS_min,INP_InscClaimAmtReimbursed_mean,INP_InscClaimAmtReimbursed_sum,INP_ClaimID_count,OUT_InscClaimAmtReimbursed_mean,OUT_InscClaimAmtReimbursed_sum,OUT_ClaimID_count,PotentialFraud
0,PRV51001,5.0,14.0,0.0,19400.0,97000.0,5.0,382.0,7640.0,20.0,No
1,PRV51003,5.16129,27.0,1.0,9241.935484,573000.0,62.0,466.714286,32670.0,70.0,Yes
2,PRV51004,0.0,0.0,0.0,0.0,0.0,0.0,350.134228,52170.0,149.0,No
3,PRV51005,0.0,0.0,0.0,0.0,0.0,0.0,241.124464,280910.0,1165.0,Yes
4,PRV51007,5.333333,7.0,4.0,6333.333333,19000.0,3.0,213.188406,14710.0,69.0,No


In [5]:
# Swap the PotentialFraud labels for the integer codes assigned by LabelEncoder.
# The fitted encoder is kept in `le`, so the original labels can be recovered
# through le.classes_ / le.inverse_transform.
le = LabelEncoder()
fraud_codes = le.fit_transform(df["PotentialFraud"])
df["PotentialFraud"] = fraud_codes


In [6]:
# Features are every column except the provider identifier and the target.
target_col = "PotentialFraud"
X = df.drop(columns=["ProviderID", target_col])
y = df[target_col]

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [7]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))


In [8]:
# Fit logistic regression on the standardized training features
# (fit() returns the estimator itself), then predict the held-out rows.
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
logreg_pred = logreg.predict(X_test_scaled)


In [9]:
# Report the four test-set scores for the logistic regression predictions.
print("Logistic Regression Results")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, logreg_pred))


Logistic Regression Results
Accuracy: 0.9453067257945307
Precision: 0.7789473684210526
Recall: 0.5826771653543307
F1 Score: 0.6666666666666666


In [10]:
# Random forest configuration: 300 trees capped at depth 12, a fixed seed,
# and class weights balanced against the label frequencies.
rf_settings = dict(
    n_estimators=300,
    max_depth=12,
    class_weight="balanced",
    random_state=42,
)

# The forest is trained and evaluated on the raw (unscaled) feature frames.
rf = RandomForestClassifier(**rf_settings).fit(X_train, y_train)
rf_pred = rf.predict(X_test)


In [11]:
# Report the four test-set scores for the random forest predictions.
print("Random Forest Results")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, rf_pred))


Random Forest Results
Accuracy: 0.9408721359940873
Precision: 0.688
Recall: 0.6771653543307087
F1 Score: 0.6825396825396826


In [12]:
# Persist the fitted random forest and the fitted StandardScaler to the
# working directory. The final joblib.dump call is the cell's last expression,
# so its return value (the list of written file names) is shown as the output.
#
# NOTE(review): `rf` was fit on the unscaled X_train, whereas `scaler` was only
# used to prepare inputs for `logreg` (which is not saved here). Code that loads
# both files should NOT scale inputs before calling the forest — confirm what
# the downstream consumer expects.
# NOTE(review): consider moving this import up to the notebook's import cell.
import joblib
joblib.dump(rf, "fraud_model_rf.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [13]:
# test cell
#
# Sanity checks over the objects built by the earlier cells: the train/test
# partitions, both fitted models, a single-row prediction, the test-set
# metrics and the forest's feature importances. Besides printing, the cell
# asserts a few cheap invariants so that it fails loudly if the notebook
# state is inconsistent (e.g. cells were run out of order).


def _print_metrics(y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one set of predictions."""
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))


print("===== MODEL TEST BLOCK =====")

# 1) Shapes check
print("\nDataset Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# Every feature row needs exactly one label, and both partitions must
# expose the same number of feature columns.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row mismatch"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row mismatch"
assert X_train.shape[1] == X_test.shape[1], "train / test feature count mismatch"

# 2) Make sure models exist
print("\nModels Loaded:")
print("Logistic Regression:", type(logreg))
print("Random Forest:", type(rf))

# 3) Single prediction test
print("\nSingle Prediction Test:")
sample = X_test.iloc[:1]
# Logistic regression was trained on standardized features, so the sample is
# passed through the same fitted scaler; the forest takes the raw row.
sample_scaled = scaler.transform(sample)

print("LogReg Prediction:", logreg.predict(sample_scaled)[0])
print("RandomForest Prediction:", rf.predict(sample)[0])

# 4) Probability test
print("\nRandomForest Probabilities:")
sample_proba = rf.predict_proba(sample)
print(sample_proba)
# Class probabilities for the single sample row must sum to one.
assert np.isclose(sample_proba.sum(), 1.0), "class probabilities do not sum to 1"

# 5) Metrics summary
print("\nLogistic Regression Metrics:")
_print_metrics(y_test, logreg_pred)

print("\nRandom Forest Metrics:")
_print_metrics(y_test, rf_pred)

# 6) Feature importance check
print("\nFeature Importance (Top 10):")
feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
# One importance value per feature column the forest was trained on.
assert len(feat_imp) == X_train.shape[1], "feature importance length mismatch"
print(feat_imp.sort_values(ascending=False).head(10))


===== MODEL TEST BLOCK =====

Dataset Shapes:
X_train: (4057, 9)
X_test: (1353, 9)
y_train: (4057,)
y_test: (1353,)

Models Loaded:
Logistic Regression: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Random Forest: <class 'sklearn.ensemble._forest.RandomForestClassifier'>

Single Prediction Test:
LogReg Prediction: 0
RandomForest Prediction: 0

RandomForest Probabilities:
[[0.5302132 0.4697868]]

Logistic Regression Metrics:
Accuracy: 0.9453067257945307
Precision: 0.7789473684210526
Recall: 0.5826771653543307
F1 Score: 0.6666666666666666

Random Forest Metrics:
Accuracy: 0.9408721359940873
Precision: 0.688
Recall: 0.6771653543307087
F1 Score: 0.6825396825396826

Feature Importance (Top 10):
INP_LOS_max                        0.275407
INP_InscClaimAmtReimbursed_sum     0.170700
OUT_ClaimID_count                  0.117970
OUT_InscClaimAmtReimbursed_sum     0.109506
INP_LOS_mean                       0.089775
INP_ClaimID_count                  0.088604
INP_InscClaimAmtReimbur