In [4]:
import pandas as pd
import mlflow
import dagshub



In [10]:
# =========================
# 1. Import Libraries
# =========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# =========================
# 2. Load Dataset
# =========================
df = pd.read_csv("balanced_fraud_dataset.csv")
df=df.sample(500)
df.to_csv("data.csv",index=False)

# =========================
# 3. Basic Inspection
# =========================
print(df.shape)
print(df.info())
print(df.isnull().sum())

# =========================
# 4. Drop Unnecessary Columns
# (IDs that don't help model)
# =========================
df = df.drop(["nameOrig", "nameDest"], axis=1)

# =========================
# 5. Feature Engineering
# =========================
# Balance difference features
df["orgBalanceDiff"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["destBalanceDiff"] = df["newbalanceDest"] - df["oldbalanceDest"]

# =========================
# 6. Encode Categorical Feature
# =========================
le = LabelEncoder()
df["type"] = le.fit_transform(df["type"])

# =========================
# 7. Define Features and Target
# =========================
X = df.drop("isFraud", axis=1)
y = df["isFraud"]

# =========================
# 8. Train-Test Split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# =========================
# 9. Feature Scaling
# =========================
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training data shape:", X_train_scaled.shape)
print("Testing data shape:", X_test_scaled.shape)

(500, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 2150 to 14937
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            500 non-null    int64  
 1   type            500 non-null    object 
 2   amount          500 non-null    float64
 3   nameOrig        500 non-null    object 
 4   oldbalanceOrg   500 non-null    float64
 5   newbalanceOrig  500 non-null    float64
 6   nameDest        500 non-null    object 
 7   oldbalanceDest  500 non-null    float64
 8   newbalanceDest  500 non-null    float64
 9   isFraud         500 non-null    int64  
 10  isFlaggedFraud  500 non-null    int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 46.9+ KB
None
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [11]:
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow')
dagshub.init(repo_owner='ArchitSaki', repo_name='Fraud-Detection-System--End-to-end-ml-project-', mlflow=True)

# mlflow.set_experiment("Logistic Regression Baseline")
mlflow.set_experiment("Logistic Regression Baseline")

2026-02-24 15:28:19,012 - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project- "HTTP/1.1 200 OK"


2026-02-24 15:28:19,038 - INFO - Initialized MLflow to track repo "ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-"


2026-02-24 15:28:19,041 - INFO - Repository ArchitSaki/Fraud-Detection-System--End-to-end-ml-project- initialized!


<Experiment: artifact_location='mlflow-artifacts:/1bfce518139543729ddc15f7019888e2', creation_time=1771924932306, experiment_id='0', last_update_time=1771924932306, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}, workspace='default'>

In [12]:
import mlflow
import logging
import os
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

logging.info("Starting MLflow run...")

with mlflow.start_run():
    start=time.time()
    try:
        logging.info("logging preprocessing parameters")
        mlflow.log_param("test_size", 0.25)
        # mlflow.log_param("")
        logging.info("Initializing Logistic Regression model...")
        model = LogisticRegression(max_iter=1000)

        logging.info("fitting the model")
        model.fit(X_train,y_train)

        logging.info("Model training complete.")

        logging.info("Logging model parameters...")
        mlflow.log_param("model", "Logistic Regression")

        logging.info("Making predictions...")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info("Logging evaluation metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model, "model")

        # Log execution time
        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")

        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)

2026-02-24 15:28:21,366 - INFO - Starting MLflow run...
2026-02-24 15:28:21,928 - INFO - logging preprocessing parameters
2026-02-24 15:28:22,281 - INFO - Initializing Logistic Regression model...
2026-02-24 15:28:22,282 - INFO - fitting the model
2026-02-24 15:28:22,397 - INFO - Model training complete.
2026-02-24 15:28:22,398 - INFO - Logging model parameters...
2026-02-24 15:28:22,754 - INFO - Making predictions...
2026-02-24 15:28:22,758 - INFO - Calculating evaluation metrics...
2026-02-24 15:28:22,786 - INFO - Logging evaluation metrics...
2026-02-24 15:28:24,205 - INFO - Saving and logging the model...
2026-02-24 15:28:46,430 - ERROR - An error occurred: name 'start_time' is not defined
Traceback (most recent call last):
  File "C:\Users\itsar\AppData\Local\Temp\ipykernel_12268\2646488953.py", line 49, in <module>
    logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")
NameError: name 'start_time' is not defined


üèÉ View run casual-bee-106 at: https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow/#/experiments/0/runs/213474013fac494ea9174491f9646686
üß™ View experiment at: https://dagshub.com/ArchitSaki/Fraud-Detection-System--End-to-end-ml-project-.mlflow/#/experiments/0


In [13]:
df['isFraud'].value_counts()

isFraud
1    254
0    246
Name: count, dtype: int64