# **MLFlow with Scikit-Learn**

# Data Processing

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn #
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, plot_roc_curve,confusion_matrix
from sklearn.model_selection import KFold
import mlflow
import mlflow.sklearn
print("Numpy: {}".format(np.__version__))
print("Pandas: {}".format(pd.__version__))
print("matplotlib: {}".format(matplotlib.__version__))
print("seaborn: {}".format(sns.__version__))
print("Scikit-Learn: {}".format(sklearn.__version__))
print("MLFlow: {}".format(mlflow.__version__))

Numpy: 1.23.5
Pandas: 1.5.2
matplotlib: 3.5.3
seaborn: 0.12.1
Scikit-Learn: 1.1.3
MLFlow: 2.7.0


In [3]:
data_path = "Data/creditcard.csv"
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df = df.drop("Time", axis=1)

In [5]:
# Moving on, you will split the normal points and the anomalies:
normal = df[df.Class == 0].sample(frac=0.5,
random_state=2020).reset_index(drop=True)
anomaly = df[df.Class == 1]

In [6]:
# Let’s print out their respective shapes:
print(f"Normal: {normal.shape}")
print(f"Anomaly: {anomaly.shape}")

Normal: (142158, 30)
Anomaly: (492, 30)


Randomly sampling 50% of all the normal data points
in the data frame and picking out all of the anomalies from the data
frame as separate data frames. Then, you print the shapes of both
data sets. As you can see, the normal points massively outnumber the
anomaly points

In [7]:
normal_train, normal_test = train_test_split(normal,test_size = 0.2, random_state = 2020)
anomaly_train, anomaly_test = train_test_split(anomaly, test_size = 0.2, random_state = 2020)
normal_train, normal_validate = train_test_split(normal_train,test_size = 0.25, random_state = 2020)
anomaly_train, anomaly_validate = train_test_split(anomaly_train, test_size = 0.25, random_state = 2020)

 Partitioning the normal and anomaly data frames
separately into train, test, and validation splits. Initially, 20% of
the normal and anomaly points are used as the test split. From
the remaining 80% of data, 25% of that train split is used as the
validation split, meaning the validation split is 20% of the original
data. This leaves the final training split at 60% of the original data. In
the end, the train-test-validate split has a 60-20-20 ratio, respectively

In [8]:
# Now, you can process these sets and create the x-y splits:
x_train = pd.concat((normal_train, anomaly_train))
x_test = pd.concat((normal_test, anomaly_test))
x_validate = pd.concat((normal_validate, anomaly_validate))
y_train = np.array(x_train["Class"])
y_test = np.array(x_test["Class"])
y_validate = np.array(x_validate["Class"])
x_train = x_train.drop("Class", axis=1)
x_test = x_test.drop("Class", axis=1)
x_validate = x_validate.drop("Class", axis=1)

Creating the respective x and y splits of the training,
testing, and validation sets by concatenating the respective normal
and anomaly sets. You drop Class from the x-sets because it would be
cheating otherwise to give it the label directly. You are trying to get the
model to learn the labels by reading the x-data, not learn how to read
the Class column in the x-data

In [9]:
print("Training sets:\nx_train: {} \ny_train:{}".format(x_train.shape, y_train.shape))
print("\nTesting sets:\nx_test: {} \ny_test:{}".format(x_test.shape, y_test.shape))
print("\nValidation sets:\nx_validate: {} \ny_validate: {}".format(x_validate.shape, y_validate.shape))

Training sets:
x_train: (85588, 29) 
y_train:(85588,)

Testing sets:
x_test: (28531, 29) 
y_test:(28531,)

Validation sets:
x_validate: (28531, 29) 
y_validate: (28531,)


In [10]:
# Finally, you scale your data using scikit-learn’s standard scaler:
scaler = StandardScaler()
scaler.fit(pd.concat((normal, anomaly)).drop("Class", axis=1))
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_validate = scaler.transform(x_validate)

 Fitting the scaler on the superset of normal and anomaly
points after dropping Class to scale the x-sets

## Training and Evaluating with MLFlow

In [11]:
def train(sk_model, x_train, y_train):
    sk_model = sk_model.fit(x_train, y_train)
    train_acc = sk_model.score(x_train, y_train)
    mlflow.log_metric("train_acc", train_acc)
    print(f"Train Accuracy: {train_acc:.3%}")

Defining the train function to better organize the code.
Additionally, you are defining a training accuracy metric that will be
logged by MLFlow

In [12]:
mlflow.log_metric("train_acc", train_acc)

NameError: name 'train_acc' is not defined

In [13]:
def evaluate(sk_model, x_test, y_test):
    eval_acc = sk_model.score(x_test, y_test)
    preds = sk_model.predict(x_test)
    auc_score = roc_auc_score(y_test, preds)
    mlflow.log_metric("eval_acc", eval_acc)
    mlflow.log_metric("auc_score", auc_score)
    print(f"Auc Score: {auc_score:.3%}")
    print(f"Eval Accuracy: {eval_acc:.3%}")
    roc_plot = plot_roc_curve(sk_model, x_test, y_test,
    name='Scikit-learn ROC Curve')
    plt.savefig("sklearn_roc_plot.png")
    plt.show()
    plt.clf()
    conf_matrix = confusion_matrix(y_test, preds)
    ax = sns.heatmap(conf_matrix, annot=True,fmt='g')
    ax.invert_xaxis()
    ax.invert_yaxis()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title("Confusion Matrix")
    plt.savefig("sklearn_conf_matrix.png")
    mlflow.log_artifact("sklearn_roc_plot.png")
    mlflow.log_artifact("sklearn_conf_matrix.png")

A function to calculate the evaluation metrics for the
AUC score and accuracy. Plots for the confusion matrix and the ROC
curve are generated, and both the metrics and the graphs are logged
to MLFlow

In [14]:
mlflow.log_metric("eval_acc", eval_acc)
mlflow.log_metric("auc_score", auc_score)

NameError: name 'eval_acc' is not defined

In [15]:
mlflow.log_artifact("sklearn_roc_plot.png")
mlflow.log_artifact("sklearn_conf_matrix.png")

FileNotFoundError: [Errno 2] No such file or directory: 'sklearn_roc_plot.png'

## Logging and Viewing MLFlow Runs

In [16]:
sk_model = LogisticRegression(random_state=None,
max_iter=400, solver='newton-cg')
mlflow.set_experiment("scikit_learn_experiment")
with mlflow.start_run():
    train(sk_model, x_train, y_train)
    evaluate(sk_model, x_test, y_test)
    mlflow.sklearn.log_model(sk_model, "log_reg_model")
    print("Model run: ", mlflow.active_run().info.run_uuid)
mlflow.end_run()

2024/01/08 22:32:44 INFO mlflow.tracking.fluent: Experiment with name 'scikit_learn_experiment' does not exist. Creating a new experiment.


Exception: Run with UUID b0df6fa0ecd84eccbbf003055f19ac4b is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [None]:
mlflow.set_experiment("scikit_learn_experiment")