In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
import random

# Parameters
n_samples = 10000
fraud_ratio = 0.1

# Generate core features and fraud labels
X, y = make_classification(n_samples=n_samples, 
                           n_features=6, 
                           n_informative=4, 
                           weights=[1 - fraud_ratio, fraud_ratio], 
                           flip_y=0.01, 
                           random_state=42)

df = pd.DataFrame(X, columns=['cost_feature_1', 'util_feature_2', 'usage_score', 'billing_variance',
                              'frequency_of_visits', 'anomaly_score'])

# Add realistic features
np.random.seed(42)

# Patient and Provider Info
df['patient_age'] = np.random.randint(18, 90, size=n_samples)
df['gender'] = np.random.choice(['Male', 'Female', 'Other'], size=n_samples)
df['provider_id'] = np.random.randint(1000, 9999, size=n_samples)
df['hospital_id'] = np.random.randint(100, 500, size=n_samples)

# Claim Info
df['claim_amount'] = np.abs(np.random.normal(loc=8000, scale=4000, size=n_samples)).round(2)
df['num_procedures'] = np.random.poisson(lam=3, size=n_samples)
df['hospital_stay_days'] = np.random.randint(1, 20, size=n_samples)
df['primary_diagnosis'] = np.random.choice(['Diabetes', 'Heart Disease', 'Fracture', 'Cancer', 'Flu', 'Migraine'], size=n_samples)
df['procedure_type'] = np.random.choice(['Surgery', 'Therapy', 'Radiology', 'Consultation', 'Emergency'], size=n_samples)

# Temporal Info
df['claim_day'] = np.random.choice(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'], size=n_samples)
df['claim_month'] = np.random.choice(range(1, 13), size=n_samples)

# Target Label
df['is_fraud'] = y

# Preview and save
print(df.head())
df.to_csv('enhanced_medical_fraud034         -1.479911   

   frequency_of_visits  anomaly_score  patient_age gender  provider_id  \
0             1.232588      -0.665552           69   Male         1264   
1            -0.587576       1.496148           32  Other         4093   
2             0.828609      -2.407889           89  Other         5798   
3            -1.237884       0.265186           78   Male         9143   
4             0.596953      -1.028998           38   Male         1325   

   hospital_id  claim_amount  num_procedures  hospital_stay_days  \
0          171       9066.21               4                   2   _dataset.csv', index=False)
print("✅ Dataset saved as 'enhanced_medical_fraud_dataset.csv'")


   cost_feature_1  util_feature_2  usage_score  billing_variance  \
0       -2.591459        1.264459     0.577366         -2.001795   
1       -0.120761       -0.725202    -0.673109          0.361122   
2        1.052110       -1.050024    -1.097380         -0.235955   
3       -0.538686       -2.239094    -1.158789         -1.860463   
4       -1.163512        0.247373     0.139034         -1.479911   

   frequency_of_visits  anomaly_score  patient_age gender  provider_id  \
0             1.232588      -0.665552           69   Male         1264   
1            -0.587576       1.496148           32  Other         4093   
2             0.828609      -2.407889           89  Other         5798   
3            -1.237884       0.265186           78   Male         9143   
4             0.596953      -1.028998           38   Male         1325   

   hospital_id  claim_amount  num_procedures  hospital_stay_days  \
0          171       9066.21               4                   2   
1         

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("enhanced_medical_fraud_dataset.csv")

# Step 1: Label Encoding for Categorical Variables
categorical_cols = ['gender', 'primary_diagnosis', 'procedure_type', 'claim_day']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # store encoder for future use

# Step 2: Features and Labels
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

# Step 3: Split and balance with SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Step 4: Train model
model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train_bal, y_train_bal)

# Step 5: Save model and encoders to pickle
with open("xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("label_encoders.pkl", "wb") as le_file:
    pickle.dump(label_encoders, le_file)

print("✅ Model and label encoders saved successfully.")

# Optional: Test model
y_pred = model.predict(X_test)
print("📊 Classification Report:\n", classification_report(y_test, y_pred))


✅ Model and label encoders saved successfully.
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97      2685
           1       0.70      0.76      0.73       315

    accuracy                           0.94      3000
   macro avg       0.83      0.86      0.85      3000
weighted avg       0.94      0.94      0.94      3000



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load Dataset
df = pd.read_csv("enhanced_medical_fraud_dataset.csv")
print("📥 Loaded Data:")
print(df.head())

# Step 2: Label Encoding
categorical_cols = ['gender', 'primary_diagnosis', 'procedure_type', 'claim_day']
label_encoders = {}

print("\n🔤 Encoding Categorical Features:")
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f" - {col} classes: {list(le.classes_)}")

# Step 3: Split Features and Target
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]
print("\n📊 Feature Matrix Shape:", X.shape)
print("🎯 Target Distribution:\n", y.value_counts())

# Step 4: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
print("\n🧪 Training set size:", X_train.shape)
print("🧪 Test set size:", X_test.shape)

# Step 5: Apply SMOTE
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("\n⚖️ After SMOTE Balancing:")
print(" - X_train_bal shape:", X_train_bal.shape)
print(" - Class distribution:\n", pd.Series(y_train_bal).value_counts())

# Step 6: Train XGBoost
model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train_bal, y_train_bal)
print("\n🧠 Model Trained Successfully")

# Step 7: Predictions and Evaluation
y_pred = model.predict(X_test)

print("\n📈 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 8: Save Model and Encoders
with open("xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("\n✅ XGBoost model saved as 'xgboost_model.pkl'")

with open("label_encoders.pkl", "wb") as le_file:
    pickle.dump(label_encoders, le_file)
print("✅ Label encoders saved as 'label_encoders.pkl'")


📥 Loaded Data:
   cost_feature_1  util_feature_2  usage_score  billing_variance  \
0       -2.591459        1.264459     0.577366         -2.001795   
1       -0.120761       -0.725202    -0.673109          0.361122   
2        1.052110       -1.050024    -1.097380         -0.235955   
3       -0.538686       -2.239094    -1.158789         -1.860463   
4       -1.163512        0.247373     0.139034         -1.479911   

   frequency_of_visits  anomaly_score  patient_age gender  provider_id  \
0             1.232588      -0.665552           69   Male         1264   
1            -0.587576       1.496148           32  Other         4093   
2             0.828609      -2.407889           89  Other         5798   
3            -1.237884       0.265186           78   Male         9143   
4             0.596953      -1.028998           38   Male         1325   

   hospital_id  claim_amount  num_procedures  hospital_stay_days  \
0          171       9066.21               4                   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [4]:
import pandas as pd
import numpy as np
import pickle

# Step 1: Load the trained model and label encoders
with open("xgboost_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("label_encoders.pkl", "rb") as le_file:
    label_encoders = pickle.load(le_file)

# Step 2: Prepare new data for prediction
# Let's assume new_data is a DataFrame for which we want predictions
new_data = pd.DataFrame({
    'gender': ['Male'], 
    'primary_diagnosis': ['Cancer'], 
    'procedure_type': ['Surgery'],
    'claim_day': ['Monday'],
    'patient_age': [45], 
    'claim_amount': [12000],
    'num_procedures': [3], 
    'hospital_stay_days': [7],
    'cost_feature_1': [0.3], 'util_feature_2': [0.2], 'usage_score': [0.5],
    'billing_variance': [0.1], 'frequency_of_visits': [0.5], 'anomaly_score': [0.3]
})

print("\n📊 New Data (Before Encoding):")
print(new_data)

# Step 3: Apply label encoding to categorical columns
categorical_cols = ['gender', 'primary_diagnosis', 'procedure_type', 'claim_day']
for col in categorical_cols:
    le = label_encoders[col]
    new_data[col] = le.transform(new_data[col])

print("\n🔤 New Data (After Encoding):")
print(new_data)

# Step 4: Make prediction
X_new = new_data  # Features for prediction
predictions = model.predict(X_new)

# Step 5: Display the prediction result
print("\n💡 Predictions:")
print(" - Fraud (1) or Legit (0):", predictions)

# If you want the prediction to be in readable form
if predictions[0] == 1:
    print("⚠️ Fraud detected!")
else:
    print("✅ No fraud detected.")



📊 New Data (Before Encoding):
  gender primary_diagnosis procedure_type claim_day  patient_age  \
0   Male            Cancer        Surgery    Monday           45   

   claim_amount  num_procedures  hospital_stay_days  cost_feature_1  \
0         12000               3                   7             0.3   

   util_feature_2  usage_score  billing_variance  frequency_of_visits  \
0             0.2          0.5               0.1                  0.5   

   anomaly_score  
0            0.3  

🔤 New Data (After Encoding):
   gender  primary_diagnosis  procedure_type  claim_day  patient_age  \
0       1                  0               3          1           45   

   claim_amount  num_procedures  hospital_stay_days  cost_feature_1  \
0         12000               3                   7             0.3   

   util_feature_2  usage_score  billing_variance  frequency_of_visits  \
0             0.2          0.5               0.1                  0.5   

   anomaly_score  
0            0.3  


ValueError: feature_names mismatch: ['cost_feature_1', 'util_feature_2', 'usage_score', 'billing_variance', 'frequency_of_visits', 'anomaly_score', 'patient_age', 'gender', 'provider_id', 'hospital_id', 'claim_amount', 'num_procedures', 'hospital_stay_days', 'primary_diagnosis', 'procedure_type', 'claim_day', 'claim_month'] ['gender', 'primary_diagnosis', 'procedure_type', 'claim_day', 'patient_age', 'claim_amount', 'num_procedures', 'hospital_stay_days', 'cost_feature_1', 'util_feature_2', 'usage_score', 'billing_variance', 'frequency_of_visits', 'anomaly_score']
expected hospital_id, provider_id, claim_month in input data

In [5]:
new_data = pd.DataFrame({
    'gender': ['Male'], 
    'primary_diagnosis': ['Cancer'], 
    'procedure_type': ['Surgery'],
    'claim_day': ['Monday'],
    'patient_age': [45], 
    'claim_amount': [12000],
    'num_procedures': [3], 
    'hospital_stay_days': [7],
    'cost_feature_1': [0.3], 'util_feature_2': [0.2], 'usage_score': [0.5],
    'billing_variance': [0.1], 'frequency_of_visits': [0.5], 'anomaly_score': [0.3],
    
    # These columns were missing earlier but are essential
    'hospital_id': [200],
    'provider_id': [4000],
    'claim_month': [5]  # Add month of the claim
})


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Define number of samples
n_samples = 1000

# Define features and their random ranges
data = {
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'primary_diagnosis': np.random.choice(['Cancer', 'Heart Disease', 'Diabetes', 'Other'], n_samples),
    'procedure_type': np.random.choice(['Surgery', 'Consultation', 'Test', 'Emergency'], n_samples),
    'claim_day': np.random.choice(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], n_samples),
    'patient_age': np.random.randint(18, 80, n_samples),
    'claim_amount': np.random.uniform(1000, 20000, n_samples),
    'num_procedures': np.random.randint(1, 6, n_samples),
    'hospital_stay_days': np.random.randint(1, 14, n_samples),
    'cost_feature_1': np.random.uniform(0, 1, n_samples),
    'util_feature_2': np.random.uniform(0, 1, n_samples),
    'usage_score': np.random.uniform(0, 1, n_samples),
    'billing_variance': np.random.uniform(0, 1, n_samples),
    'frequency_of_visits': np.random.uniform(0, 1, n_samples),
    'anomaly_score': np.random.uniform(0, 1, n_samples),
    'hospital_id': np.random.randint(1, 10, n_samples),
    'provider_id': np.random.randint(1, 10, n_samples),
    'claim_month': np.random.randint(1, 13, n_samples),
    'fraud': np.random.choice([0, 1], n_samples, p=[0.85, 0.15])  # 85% Non-Fraud, 15% Fraud
}

# Create DataFrame
df = pd.DataFrame(data)

# Encode categorical features
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['primary_diagnosis'] = label_encoder.fit_transform(df['primary_diagnosis'])
df['procedure_type'] = label_encoder.fit_transform(df['procedure_type'])
df['claim_day'] = label_encoder.fit_transform(df['claim_day'])

# Save the DataFrame to a CSV file
df.to_csv("health_insurance_synthetic_data.csv", index=False)

print("Synthetic medical fraud dataset generated and saved as 'health_insurance_synthetic_data.csv'.")


Synthetic medical fraud dataset generated and saved as 'health_insurance_synthetic_data.csv'.
