In [2]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# 1. GENERATE SYNTHETIC DATA
# We are simulating 5000 bills to teach the AI what fraud looks like.
print("âš¡ Generating synthetic data...")
np.random.seed(42)
n_samples = 5000

# --- Genuine Bills (Safe) ---
# Low tampering score, normal amounts, lots of medical words
gen_cnn = np.random.uniform(0.0, 0.3, n_samples)
gen_meta = np.random.choice([0, 1], n_samples, p=[0.95, 0.05])
gen_amt = np.random.normal(1.0, 0.05, n_samples)
gen_nlp = np.random.randint(5, 15, n_samples)
genuine_data = np.column_stack((gen_cnn, gen_meta, gen_amt, gen_nlp))
genuine_labels = np.zeros(n_samples) # Label 0 = Real

# --- Fraud Bills (Fake) ---
# High tampering score, amount mismatch, few medical words
fake_cnn = np.random.uniform(0.6, 1.0, n_samples)
fake_meta = np.random.choice([0, 1], n_samples, p=[0.4, 0.6])
fake_amt = np.random.uniform(1.5, 5.0, n_samples)
fake_nlp = np.random.randint(0, 5, n_samples)
fake_data = np.column_stack((fake_cnn, fake_meta, fake_amt, fake_nlp))
fake_labels = np.ones(n_samples) # Label 1 = Fraud

# Combine them
X = np.vstack((genuine_data, fake_data))
y = np.hstack((genuine_labels, fake_labels))

# 2. TRAIN THE MODEL
print("ðŸ§  Training the Random Forest model...")
# Split into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scale the numbers so the AI understands them better
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The AI Brain
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(X_train_scaled, y_train)

# 3. TEST THE RESULTS
print("\n--- Model Report ---")
preds = model.predict(X_test_scaled)
print(classification_report(y_test, preds))

# 4. SAVE THE MODEL
# We need to save this to the 'backend/models' folder
current_dir = os.getcwd()
# If we are in 'notebooks' folder, go up one level, then into backend/models
save_path = os.path.abspath(os.path.join(current_dir, "..", "backend", "models", "ml_model.pkl"))

# Fix for VS Code execution path variations
if "backend" not in save_path:
     save_path = os.path.join(current_dir, "backend", "models", "ml_model.pkl")

os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "wb") as f:
    pickle.dump({'model': model, 'scaler': scaler}, f)
    
print(f"âœ… SUCCESS! Model saved to: {save_path}")
print("ðŸš€ Now restart your backend terminal to use this new brain.")

âš¡ Generating synthetic data...
ðŸ§  Training the Random Forest model...

--- Model Report ---
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1021
         1.0       1.00      1.00      1.00       979

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

âœ… SUCCESS! Model saved to: d:\Desktop\insurance-claim-checker\backend\backend\models\ml_model.pkl
ðŸš€ Now restart your backend terminal to use this new brain.


In [1]:
%pip install pandas numpy matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
