In [1]:
# =========================
# 1. Import Libraries
# =========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# =========================
# 2. Load Dataset
# =========================
# Read the full dataset, then keep a random subset of at most 500 rows for
# quick experimentation. The fixed random_state makes the subset -- and so
# data.csv and every number computed downstream -- identical on each
# Restart & Run All. min() guards against a source file with fewer than
# 500 rows, for which DataFrame.sample would otherwise raise ValueError.
df = pd.read_csv("balanced_fraud_dataset.csv")
df = df.sample(n=min(500, len(df)), random_state=42)
# Persist the sampled subset (without the index column) next to the notebook.
df.to_csv("data.csv", index=False)

# =========================
# 3. Basic Inspection
# =========================
# Quick structural overview: dimensions, per-column dtypes / non-null counts,
# and the number of missing values in each column.
print(df.shape)
# DataFrame.info() writes its report directly to stdout and returns None, so
# it must not be wrapped in print() (that appends a stray "None" line).
df.info()
print(df.isnull().sum())

# =========================
# 4. Drop Unnecessary Columns
# (IDs that don't help model)
# =========================
# Remove the origin / destination account-name columns from the frame.
df = df.drop(columns=["nameOrig", "nameDest"])

# =========================
# 5. Feature Engineering
# =========================
# Balance difference features
# orgBalanceDiff  = origin balance before minus after the transaction
# destBalanceDiff = destination balance after minus before the transaction
df = df.assign(
    orgBalanceDiff=df["oldbalanceOrg"] - df["newbalanceOrig"],
    destBalanceDiff=df["newbalanceDest"] - df["oldbalanceDest"],
)

# =========================
# 6. Encode Categorical Feature
# =========================
# Replace each distinct value of the "type" column with an integer code.
# The fitted encoder stays bound to `le`.
le = LabelEncoder()
le.fit(df["type"])
df["type"] = le.transform(df["type"])

# =========================
# 7. Define Features and Target
# =========================
# Target is the isFraud column; the feature matrix is every remaining column.
# NOTE(review): isFlaggedFraud stays in X -- confirm that is intended.
y = df["isFraud"]
X = df.drop(columns=["isFraud"])

# =========================
# 8. Train-Test Split
# =========================
# Hold out 20% of the rows for evaluation with a fixed seed.
# stratify=y keeps the isFraud class proportions the same in both partitions;
# without it a small held-out set can end up with a different class mix
# than the training data, which distorts the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# =========================
# 9. Feature Scaling
# =========================
# Learn the per-feature mean / standard deviation from the training split
# only, then apply that same transformation to both splits so no statistics
# from the test rows leak into the scaler.
# NOTE(review): the grid-search cell in this notebook fits and predicts on
# X_train / X_test, not on these scaled arrays -- confirm which is intended.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for label, matrix in (("Training data shape:", X_train_scaled),
                      ("Testing data shape:", X_test_scaled)):
    print(label, matrix.shape)

(500, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 11661 to 12805
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            500 non-null    int64  
 1   type            500 non-null    object 
 2   amount          500 non-null    float64
 3   nameOrig        500 non-null    object 
 4   oldbalanceOrg   500 non-null    float64
 5   newbalanceOrig  500 non-null    float64
 6   nameDest        500 non-null    object 
 7   oldbalanceDest  500 non-null    float64
 8   newbalanceDest  500 non-null    float64
 9   isFraud         500 non-null    int64  
 10  isFlaggedFraud  500 non-null    int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 46.9+ KB
None
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [2]:
import mlflow
import mlflow.sklearn
import dagshub

# Point MLflow at the tracking server hosted on DagsHub for this repository.
mlflow.set_tracking_uri(
    'https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow'
)

# Initialise the DagsHub client for the same repository with its MLflow
# integration switched on.
# NOTE(review): presumably this also handles authentication and may set the
# tracking URI itself, making the call above redundant -- confirm against
# the dagshub documentation.
dagshub.init(
    repo_owner='ArchitSaki',
    repo_name='Fraud-Detection-System--End-to-end-ml-project-',
    mlflow=True,
)



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Base estimator for the search. The fixed random_state makes the forest's
# internal randomness -- and so the cross-validation scores and the selected
# hyperparameters -- repeatable from run to run.
model = RandomForestClassifier(random_state=42)

# Hyperparameter grid
# Candidate values for each random-forest hyperparameter; the search tries
# every combination (3 * 4 * 3 * 3 = 108 settings).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid Search
# Exhaustive search over param_grid, scoring each candidate with 5-fold
# cross-validation on the weighted F1 metric; n_jobs=-1 runs the fits in
# parallel.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)

# All runs from this cell are grouped under one MLflow experiment.
mlflow.set_experiment("Hyperparameter_Tuning")

with mlflow.start_run(run_name="RandomForest_GridSearch"):

    # Cross-validated hyperparameter search on the training split only.
    grid_search.fit(X_train, y_train)

    # grid_search is built with the default refit behaviour, so
    # best_estimator_ is the winning configuration refit on all of X_train.
    best_model = grid_search.best_estimator_

    # Evaluate once on the held-out split.
    y_pred = best_model.predict(X_test)

    # Held-out metrics; precision / recall / F1 are averaged across classes
    # weighted by class support.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log best parameters
    mlflow.log_params(grid_search.best_params_)

    # Log all metrics in one batched call. The mean cross-validated score of
    # the winning configuration is recorded too: it is the quantity the
    # search actually optimised, so the run shows both the selection score
    # and the held-out score.
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "best_cv_f1_weighted": float(grid_search.best_score_),
    })

    # Log model
    mlflow.sklearn.log_model(best_model, "best_model")

    print("Best Parameters:", grid_search.best_params_)



Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
🏃 View run RandomForest_GridSearch at: https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow/#/experiments/2/runs/bbb8d6d70cd14afc9c3a2e834452da8e
🧪 View experiment at: https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow/#/experiments/2
