<a href="https://colab.research.google.com/github/DevDevOpsVaibhavPandey/Notebooks/blob/main/mlflow_00_lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Handling packages

In [2]:
!unzip -C "/content/ieee-fraud-detection.zip"

Archive:  /content/ieee-fraud-detection.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test_identity.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test_transaction.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace train_identity.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace train_transaction.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [3]:
!pip install --upgrade pip setuptools wheel
!pip install -q scikit-learn mlflow dagshub



### Handling imports

In [7]:
# Import Libraries
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
import mlflow
from mlflow.models import infer_signature

# Suppress warnings
warnings.filterwarnings("ignore")

### Util method for memory management

In [5]:
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in ['object', 'category']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Avoid converting to float16 if the column is used in one-hot encoding
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Mem. usage decreased to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

### Data Pre-processing

In [6]:
from sklearn.preprocessing import LabelEncoder

def load_data_with_scaling_and_encoding():
    train_transaction = pd.read_csv('/content/train_transaction.csv')
    train_identity = pd.read_csv('/content/train_identity.csv')
    test_transaction = pd.read_csv('/content/test_transaction.csv')
    test_identity = pd.read_csv('/content/test_identity.csv')

    # Standardize column names
    test_identity.columns = test_identity.columns.str.replace('-', '_')
    test_transaction.columns = test_transaction.columns.str.replace('-', '_')

    train = train_transaction.merge(train_identity, how='left', on='TransactionID')
    test = test_transaction.merge(test_identity, how='left', on='TransactionID')

    # Free up memory
    del train_transaction, train_identity, test_transaction, test_identity
    gc.collect()

    # Apply memory optimization
    train = reduce_memory_usage(train)
    test = reduce_memory_usage(test)

    # Handle missing values
    train.fillna(-999, inplace=True)
    test.fillna(-999, inplace=True)

    # Define categorical features
    categorical_features = [
        'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
        'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
        'id_33', 'id_34', 'DeviceType', 'DeviceInfo'
    ]
    categorical_features += [f'id_{i}' for i in range(12, 39)]

    # Encode categorical features
    for col in categorical_features:
        if col in train.columns:
            # Convert all values to strings to handle mixed data types
            train[col] = train[col].astype(str)
            test[col] = test[col].astype(str)

            le = LabelEncoder()
            combined_data = pd.concat([train[col], test[col]], axis=0)
            le.fit(combined_data)
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

    return train, test

# Load data
train, test = load_data_with_scaling_and_encoding()

Mem. usage decreased to 1044.70 Mb (46.6% reduction)
Mem. usage decreased to 895.89 Mb (46.5% reduction)


In [8]:
# Prepare features and target
X = train.drop(columns=['isFraud', 'TransactionID'])
y = train['isFraud']
X_test = test.drop(columns=['isFraud', 'TransactionID'], errors='ignore')

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### Model Training

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

params = {
            'solver': 'liblinear',
            'random_state': 42
        }

def train_and_evaluate_logistic_regression(X_train, y_train, X_val, y_val):

    global params

    if params is None:
        params = {}

    # Initialize Logistic Regression model
    model = LogisticRegression(**params)

    # Train the model
    model.fit(X_train, y_train)

    # Predict probabilities
    val_pred_prob = model.predict_proba(X_val)[:, 1]  # Get probabilities for the positive class

    # Compute AUC score
    auc_score = roc_auc_score(y_val, val_pred_prob)
    print(f'Logistic Regression Validation AUC: {auc_score:.4f}')

    # Convert probabilities to binary class predictions (threshold = 0.5)
    val_pred_class = (val_pred_prob > 0.5).astype(int)

    # Print classification report & confusion matrix
    print("\nClassification Report for Logistic Regression:\n")
    print(classification_report(y_val, val_pred_class))
    print(f"Confusion Matrix for Logistic Regression:\n{confusion_matrix(y_val, val_pred_class)}")

    return model, auc_score

# Train and evaluate the Logistic Regression model
logistic_model, logistic_score = train_and_evaluate_logistic_regression(X_train, y_train, X_val, y_val)

Logistic Regression Validation AUC: 0.7443

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    113975
           1       0.14      0.00      0.00      4133

    accuracy                           0.96    118108
   macro avg       0.55      0.50      0.49    118108
weighted avg       0.94      0.96      0.95    118108

Confusion Matrix for Logistic Regression:
[[113957     18]
 [  4130      3]]


Making a simple prediction from the above tarined model

In [10]:
y_pred = logistic_model.predict(X_val)

### mlflow setup

In [11]:
import dagshub
dagshub.init(repo_owner='DevDevOpsVaibhavPandey', repo_name='Notebooks', mlflow=True)

In [12]:
import mlflow
with mlflow.start_run():
  mlflow.log_params(params)
  mlflow.log_metric("Logistic Regression Validation AUC", logistic_score)

  signature = infer_signature(X_val, y_pred)

  mlflow.sklearn.log_model(
        sk_model=logistic_model,
        artifact_path="logistic-model",
        signature=signature,
        registered_model_name="sk-learn-logistic-model",
    )

Successfully registered model 'sk-learn-logistic-model'.
2025/02/07 21:35:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-logistic-model, version 1
Created version '1' of model 'sk-learn-logistic-model'.


🏃 View run vaunted-sheep-206 at: https://dagshub.com/DevDevOpsVaibhavPandey/Notebooks.mlflow/#/experiments/0/runs/bbc94d0a87c3401589a9bd6e97c83abd
🧪 View experiment at: https://dagshub.com/DevDevOpsVaibhavPandey/Notebooks.mlflow/#/experiments/0
