In [1]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import pickle

# Check current working directory
print("Current Working Directory:", os.getcwd())

# Load the cleaned dataset
df = pd.read_csv("../data/cleaned_transactions.csv")

# Standardize fraud label
df = df.rename(columns={"Class": "Is_Fraud"})

# Prepare features and target
X = df.drop(columns=["Is_Fraud"])
y = df["Is_Fraud"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid for tuning
params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False]
}

# Initialize model and GridSearchCV
model = RandomForestClassifier()
grid = GridSearchCV(model, params, cv=5, scoring="recall")

# Fit the model
grid.fit(X_train, y_train)

# Ensure output directory exists
os.makedirs("../src/utils", exist_ok=True)

# Save the best model to disk
model_path = "../src/utils/model_pipeline.pkl"
with open(model_path, "wb") as f:
    pickle.dump(grid.best_estimator_, f)

print("Best Parameters:", grid.best_params_)
print("Best Recall Score:", grid.best_score_)
print(f"Model saved to '{model_path}'")

Current Working Directory: /Users/mumakalobwe/Desktop/Projects/fraudsight-risk-analyzer/jupyter_notebooks


KeyError: "['Is_Fraud'] not found in axis"