In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the pre-cleaned claims table.
# The CSV's first column is the row index written by an earlier DataFrame.to_csv
# call; without index_col=0 it comes back as a spurious 'Unnamed: 0' data column.
df = pd.read_csv('claims_cleaned_data.csv', index_col=0)
df.head()

Unnamed: 0,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,DeductibleAmtPaid,Provider,InscClaimAmtReimbursed,Gender,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,...,46,47,48,49,50,51,0.1,1.1,2.1,3.1
0,-0.465198,-0.484453,0.150719,1543,26000,1,0,12,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.465198,-0.484453,0.150719,1540,5000,1,0,12,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.465198,-0.484453,0.150719,1585,5000,1,0,12,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.417088,-0.220773,0.150719,540,5000,0,0,12,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.450005,-0.452812,0.150719,1753,10000,0,1,12,12,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [5]:
# Columns used as model inputs, in the order they appear in the feature matrix.
# NOTE(review): 'Provider' appears to be an integer-encoded provider identifier
# (see the df.head() output) — confirm it does not leak the fraud label.
top_features = [
    # Provider and reimbursement / deductible amounts
    'Provider',
    'InscClaimAmtReimbursed',
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'TotalReimbursement',
    # Beneficiary health indicators
    'RenalDiseaseIndicator',
    'ChronicCond_Alzheimer',
    'ChronicCond_Heartfailure',
    'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer',
    'ChronicCond_ObstrPulmonary',
    'ChronicCond_Depression',
    'ChronicCond_Diabetes',
    'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke',
]
  

In [6]:
# Feature matrix restricted to the selected columns, plus the fraud label.
X = df.loc[:, top_features]
y = df['PotentialFraud']

# Hold out 20% of the rows for testing; stratify on the label so both
# partitions keep the same class balance, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler learns its statistics from the training
# partition only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Hyperparameter search space for the gradient-boosting classifier
# (2 * 3 * 3 * 3 * 3 = 162 candidate combinations).
param_grid_gb = {
    'n_estimators': [10, 50],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; the seed is fixed so repeated runs give the same fits.
gb = GradientBoostingClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search over the grid, ranking candidates
# by ROC AUC and using every available core.
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='roc_auc',
)
grid_search_gb.fit(X_train_scaled, y_train)

# Winning configuration and its mean cross-validated ROC AUC
print(f"Best Parameters for Gradient Boosting: {grid_search_gb.best_params_}")
print(f"Best ROC AUC for Gradient Boosting: {grid_search_gb.best_score_}")

# No retraining is needed here: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the whole training set.
best_gb = grid_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test_scaled)
print(classification_report(y_test, y_pred_gb))

# Also report the metric that was actually optimised (ROC AUC) on the held-out
# set, using the predicted probability of the greater-labelled class.
y_proba_gb = best_gb.predict_proba(X_test_scaled)[:, 1]
print(f"Test ROC AUC for Gradient Boosting: {roc_auc_score(y_test, y_proba_gb)}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Gradient Boosting: {'learning_rate': 0.2, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Best ROC AUC for Gradient Boosting: 0.9847888636592621
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      3414
           1       0.93      0.97      0.95      4681

    accuracy                           0.95      8095
   macro avg       0.95      0.94      0.94      8095
weighted avg       0.95      0.95      0.94      8095



In [8]:
import joblib

# The model was fit on StandardScaler-transformed features, so the fitted
# scaler has to be saved too; without it, new raw data cannot be put on the
# scale the model expects at prediction time.
joblib.dump(scaler, 'claims_fraud_scaler.pkl')

# Persist the fitted GridSearchCV object (it exposes predict/predict_proba
# through its refit best estimator, and keeps the search results).
joblib.dump(grid_search_gb, 'claims_fraud_detection.pkl')


['claims_fraud_detection.pkl']