In [2]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.20.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.1 (from mlflow)
  Downloading mlflow_skinny-2.20.1-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.1->mlflow)
  Downloading databricks_sdk-0.43.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

In [40]:
# 🔹 Import necessary libraries
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import logging
import joblib

In [41]:
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM

In [23]:
#  Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
#  Define save directory in Google Drive
MODEL_DIR = "/content/drive/My Drive/models/"
os.makedirs(MODEL_DIR, exist_ok=True)

In [43]:
#  Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


In [44]:
#  Load Data
def load_data():
    fraud_data = pd.read_csv("/content/drive/MyDrive/weak_8_data/Data-20250205T200552Z-001/Data/Preprocessed_Fraud_Data (1).csv")
    credit_data = pd.read_csv("/content/drive/MyDrive/weak_8_data/Data-20250205T200552Z-001/Data/creditcard.csv")
    return fraud_data, credit_data

In [45]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np

def prepare_data(df, target_col):
    #  Drop datetime columns (e.g., timestamps)
    if 'timestamp' in df.columns:
        df = df.drop(columns=['timestamp'])

    #  Convert categorical columns to numeric using Label Encoding
    label_encoders = {}
    for col in df.select_dtypes(include=['object']).columns:
        if col != target_col:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le

    #  Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    #  Normalize numerical features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
#  Train, Evaluate & Save ML Models
def train_and_evaluate_model(model, model_name, X_train, X_test, y_train, y_test, results):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    model_path = os.path.join(MODEL_DIR, f"{model_name}.pkl")
    joblib.dump(model, model_path)
    logger.info(f"Model saved to {model_path}")

    results.append([model_name, acc, prec, rec, f1])

In [47]:
#  Train ML Models
def train_ml_models(X_train, X_test, y_train, y_test, results):
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
    }
    for name, model in models.items():
        train_and_evaluate_model(model, name, X_train, X_test, y_train, y_test, results)

In [48]:
#  Train and Save Deep Learning Models
def train_dl_model(model, model_name, X_train, X_test, y_train, y_test, results):
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=0)

    _, acc = model.evaluate(X_test, y_test, verbose=0)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    model_path = os.path.join(MODEL_DIR, f"{model_name}.h5")
    model.save(model_path)
    logger.info(f"Model saved to {model_path}")

    results.append([model_name, acc, prec, rec, f1])

In [49]:
#  Full Training Pipeline
def fraud_detection_pipeline():
    fraud_data, credit_data = load_data()
    X_train, X_test, y_train, y_test = prepare_data(fraud_data, "class")

    results = []
    train_ml_models(X_train, X_test, y_train, y_test, results)

    # Prepare data for deep learning models
    fraud_shape = (X_train.shape[1], 1)
    X_train_dl = X_train.reshape(-1, fraud_shape[0], 1)
    X_test_dl = X_test.reshape(-1, fraud_shape[0], 1)

    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=fraud_shape),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    train_dl_model(cnn_model, "CNN", X_train_dl, X_test_dl, y_train, y_test, results)

    lstm_model = Sequential([
        LSTM(50, activation='relu', input_shape=fraud_shape),
        Dense(1, activation='sigmoid')
    ])
    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    train_dl_model(lstm_model, "LSTM", X_train_dl, X_test_dl, y_train, y_test, results)
    # Save results
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
    results_df = results_df.sort_values(by="F1-Score", ascending=False)
    results_path = os.path.join(MODEL_DIR, "model_performance.csv")
    results_df.to_csv(results_path, index=False)
    logger.info(f"Model evaluation results saved to {results_path}")

    top_2_models = results_df.head(2)
    logger.info("\nTop 2 Best Performing Models:\n")
    logger.info(top_2_models)
    return top_2_models

In [50]:
#  Run the pipeline
if __name__ == "__main__":
    top_models = fraud_detection_pipeline()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


  super().__init__(**kwargs)


[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step


