In [None]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
import pickle

# --- Configuration ---
PROJECT_ROOT = "/Users/mumakalobwe/Desktop/Projects/fraudsight-risk-analyzer"
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "cleaned_transactions.csv")
MODEL_DIR = os.path.join(PROJECT_ROOT, "src", "utils")
MODEL_PATH = os.path.join(MODEL_DIR, "model_pipeline.pkl")

# --- Debug: working directory and file checks ---
print("Current working directory:", os.getcwd())
print("Data file exists:", os.path.exists(DATA_PATH))
print("Model directory exists (before):", os.path.exists(MODEL_DIR))
print("Model directory contents (before):", os.listdir(MODEL_DIR) if os.path.exists(MODEL_DIR) else "N/A")

# --- Load and prepare data ---
df = pd.read_csv(DATA_PATH)
df = df.rename(columns={"Class": "Is_Fraud"})

X = df.drop(columns=["Is_Fraud"])
y = df["Is_Fraud"]

# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Define pipeline and parameter grid ---
pipeline = Pipeline([
    ("model", RandomForestClassifier(random_state=42))
])

param_grid = {
    "model__n_estimators": [50, 100],
    "model__max_depth": [5, 10],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt"]
}

grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="recall",
    n_jobs=-1,
    verbose=1
)

# --- Fit model ---
print("Starting model training...")
grid.fit(X_train, y_train)
print("Training complete. Best parameters:", grid.best_params_)
print("Best recall score:", grid.best_score_)

# --- Save model ---
os.makedirs(MODEL_DIR, exist_ok=True)  # Ensure save directory exists
print("Model directory exists (after mkdir):", os.path.exists(MODEL_DIR))

try:
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(grid.best_estimator_, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Model saved successfully at: {MODEL_PATH}")
    print(f"File size (bytes): {os.path.getsize(MODEL_PATH)}")
except Exception as e:
    print(f"Error saving model: {e}")


Current working directory: /Users/mumakalobwe/Desktop/Projects/fraudsight-risk-analyzer/jupyter_notebooks
Data file exists: True
Model directory exists (before): True
Model directory contents (before): []
Starting model training...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [1]:
import pickle
from pathlib import Path

BASE_DIR = Path.cwd().parent  # from jupyter_notebooks â†’ project root
MODEL_PATH = BASE_DIR / "src" / "utils" / "model_pipeline.pkl"

print("Model path:", MODEL_PATH)
print("Exists:", MODEL_PATH.exists())
print("File size:", MODEL_PATH.stat().st_size, "bytes")

with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

print("Loaded model type:", type(model))
print("Has predict:", hasattr(model, "predict"))
print("Has feature_names_in_:", hasattr(model, "feature_names_in_"))

Model path: /Users/mumakalobwe/Desktop/Projects/fraudsight-risk-analyzer/src/utils/model_pipeline.pkl
Exists: True
File size: 515114 bytes
Loaded model type: <class 'sklearn.pipeline.Pipeline'>
Has predict: True
Has feature_names_in_: True


In [2]:
import pandas as pd

# Load a few rows from your cleaned dataset
df = pd.read_csv("../data/cleaned_transactions.csv").head(5)

# Match inference logic
if "Class" in df.columns:
    df = df.drop(columns=["Class"])

df = df[model.feature_names_in_]

preds = model.predict(df)
probs = model.predict_proba(df)[:, 1]

print("Predictions:", preds)
print("Probabilities:", probs)

Predictions: [0 0 0 0 0]
Probabilities: [0.0001437  0.00012463 0.00693927 0.00012164 0.00013394]
