In [None]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import pickle

# Print the working directory: every path below is relative to it, so this
# makes a wrong launch location easy to spot.
print("Current Working Directory:", os.getcwd())

# Load the cleaned transactions dataset.
df = pd.read_csv("../../data/cleaned_transactions.csv")

# Separate the feature columns from the "Is_Fraud" target column.
X = df.drop(columns=["Is_Fraud"])
y = df["Is_Fraud"]

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class ratio the same in the train and test splits, so a recall measured on
# either split is comparable (presumably the fraud class is rare -- verify
# against the data). Stratification needs at least two rows of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hyperparameter search space for the random forest.
# 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate combinations; with 5-fold
# cross-validation that is 1620 fits plus the final refit on the full
# training set.
params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
}

# Seed the forest so that the search results and the refit best estimator are
# reproducible from run to run (the train/test split is already seeded).
model = RandomForestClassifier(random_state=42)

# Select the candidate with the best cross-validated recall. n_jobs=-1 runs
# the independent fits on all available cores instead of one after another.
grid = GridSearchCV(model, params, cv=5, scoring="recall", n_jobs=-1)
grid.fit(X_train, y_train)

# Persist the estimator that GridSearchCV refit on the full training set with
# the best parameter combination. The path is defined once so the directory
# creation, the file write and the final message cannot drift apart.
model_path = "../../src/utils/model_pipeline.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(grid.best_estimator_, f)

print("Best Parameters:", grid.best_params_)
# Mean cross-validated recall of the best candidate on the training folds.
print("Best Recall Score:", grid.best_score_)
# Recall on the held-out test split, which the search never saw. grid.score
# applies the search's own scoring ("recall") to the refit best estimator.
print("Test Recall Score:", grid.score(X_test, y_test))
print(f"Model saved to '{model_path}'")