In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, average_precision_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("../notebook/final_data.csv")
df.head()

Unnamed: 0,Provider,In_TotalClaims,In_AvgDiagnosisCount,In_AvgProcedureCount,In_PctFullDiagnosis,In_PctHighProcedure,In_AvgProcDiagRatio,Out_TotalClaims,Out_AvgDiagnosisCount,Out_AvgProcedureCount,Out_PctFullDiagnosis,Out_PctHasProcedure,Out_AvgProcDiagRatio,PotentialFraud,Bene_UniqueCount,Bene_AvgAge,Bene_AvgChronicCount,Bene_PctChronic,Bene_AvgReimbursed,Bene_AvgDeductible
0,PRV51001,5.0,7.2,0.6,0.0,0.0,0.068571,20.0,9.0,0.0,0.0,0.0,0.0,0,5.0,79.2,16.0,1.0,79252.0,2799.2
1,PRV51003,62.0,8.112903,0.774194,0.064516,0.0,0.093941,70.0,8.444444,0.0,0.014286,0.0,0.0,1,53.0,69.283019,17.320755,1.0,13676.792453,2058.339623
2,PRV51004,0.0,0.0,0.0,0.0,0.0,0.0,149.0,7.333333,0.0,0.006711,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,PRV51005,0.0,0.0,0.0,0.0,0.0,0.0,1165.0,8.060976,0.0,0.007725,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0
4,PRV51007,3.0,7.333333,0.333333,0.0,0.0,0.041667,69.0,9.5,0.0,0.028986,0.0,0.0,0,3.0,79.333333,16.333333,1.0,14123.333333,2606.0


In [4]:
df.columns

Index(['Provider', 'In_TotalClaims', 'In_AvgDiagnosisCount',
       'In_AvgProcedureCount', 'In_PctFullDiagnosis', 'In_PctHighProcedure',
       'In_AvgProcDiagRatio', 'Out_TotalClaims', 'Out_AvgDiagnosisCount',
       'Out_AvgProcedureCount', 'Out_PctFullDiagnosis', 'Out_PctHasProcedure',
       'Out_AvgProcDiagRatio', 'PotentialFraud', 'Bene_UniqueCount',
       'Bene_AvgAge', 'Bene_AvgChronicCount', 'Bene_PctChronic',
       'Bene_AvgReimbursed', 'Bene_AvgDeductible'],
      dtype='object')

In [5]:
X = df.drop(columns=['Provider', 'PotentialFraud'])
y = df['PotentialFraud']

In [6]:
X.shape

(5410, 18)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(
        class_weight = 'balanced',
        max_iter = 1000,
        random_state = 42
    ))
])

In [9]:
pipe_lr.fit(X_train, y_train)

In [10]:
y_pred = pipe_lr.predict(X_test)

In [11]:
y_prob = pipe_lr.predict(X_test)
y_prob

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
y_prob = pipe_lr.predict_proba(X_test)[:, 1]
y_prob

array([0.07486461, 0.05152333, 0.06547792, ..., 0.06734694, 0.25026022,
       0.419932  ])

In [13]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.87      0.92       977
           1       0.41      0.82      0.54       105

    accuracy                           0.87      1082
   macro avg       0.69      0.85      0.73      1082
weighted avg       0.92      0.87      0.89      1082



In [14]:
"ROC-AUC:", roc_auc_score(y_test, y_prob)

('ROC-AUC:', 0.9236486815811278)

In [15]:
average_precision_score(y_test, y_prob)

0.6595271126829213

In [16]:
y.value_counts(normalize=True)

PotentialFraud
0    0.90647
1    0.09353
Name: proportion, dtype: float64

# Training model on RANDOM FOREST algorithm

In [17]:
pipe_rf = Pipeline([
    ('rf', RandomForestClassifier(
        n_estimators = 500,
        min_samples_leaf = 5,
        class_weight = 'balanced',
        random_state = 42,
        n_jobs = -1
    ))
])

In [18]:
pipe_rf.fit(X_train, y_train)

In [19]:
y_pred_rf = pipe_rf.predict(X_test)

In [20]:
y_prob_rf = pipe_rf.predict_proba(X_test)[:,1]
y_prob_rf

array([0.00652487, 0.00439634, 0.01464166, ..., 0.01478733, 0.3179243 ,
       0.20523795])

In [21]:
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       977
           1       0.50      0.60      0.55       105

    accuracy                           0.90      1082
   macro avg       0.73      0.77      0.75      1082
weighted avg       0.91      0.90      0.91      1082



In [22]:
"ROC-AUC:", roc_auc_score(y_test, y_prob_rf)

('ROC-AUC:', 0.9179558414972949)

In [23]:
threshold = 0.4
y_pred_rf_tuned = (y_prob_rf >= threshold).astype(int)
print(classification_report(y_test, y_pred_rf_tuned))


              precision    recall  f1-score   support

           0       0.97      0.91      0.94       977
           1       0.46      0.72      0.56       105

    accuracy                           0.89      1082
   macro avg       0.71      0.82      0.75      1082
weighted avg       0.92      0.89      0.90      1082



In [24]:
import joblib

In [25]:
joblib.dump(pipe_rf, "fraud_model_rf.pkl")

['fraud_model_rf.pkl']