In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Step 1: create an imbalanced binary classification dataset
# (weights=[0.9,0.1] with flip_y=0 gives roughly a 90/10 class split).
X,y=make_classification(n_samples=1000,n_features=10,n_informative=2,n_redundant=8,
                       weights=[0.9,0.1],flip_y=0,random_state=42)
# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the (classes, counts) pair to make the imbalance check visible.
print(np.unique(y,return_counts=True))

# Split the dataset into training and test sets (70/30); stratify=y is passed
# so both splits are drawn with the same class proportions.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)

# Define the model hyperparams (the same dict is logged to MLflow further down).
params={
    "solver":"lbfgs",
    "max_iter":1000,
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # confirm it is still accepted by the installed version before upgrading.
    "multi_class":"auto",
    "random_state":8888,
}

# Train the model
lr=LogisticRegression(**params)
lr.fit(X_train,y_train)
# Estimators are not callable: predictions come from .predict(), not lr(...).
y_pred=lr.predict(X_test)
report=classification_report(y_test,y_pred)
print(report) # precision, recall, F1, support

# Same report as a nested dict, so individual metrics can be looked up by key.
report_dict=classification_report(y_test,y_pred,output_dict=True)
print(report_dict)

# Use MLflow to log the experiment
import mlflow
# Point the client at the tracking server first, so that the experiment below
# is looked up / created on that server and not on the default local store.
mlflow.set_tracking_uri("http://127.0.0.1:5000") # the localhost if MLFlow is run on local
mlflow.set_experiment("First Experiment")
with mlflow.start_run():
    # Hyperparameters used to build the LogisticRegression above.
    mlflow.log_params(params)
    # Selected entries of the classification report, flattened to metric names.
    mlflow.log_metrics({
        'accuracy':report_dict['accuracy'],
        'recall_class_0':report_dict['0']['recall'],
        'recall_class_1':report_dict['1']['recall'],
        'f1_score_macro':report_dict['macro avg']['f1-score']
    })
    mlflow.sklearn.log_model(lr,"LR_1") # for logistic regression

# Then, in the list of experiments, we will see "First Experiment" in the user interface!
# Inside the run's page, we will see all the parameters, the model, the metrics, etc.
# Under the artifacts section, we can see model.pkl, so that we can package or download and deploy it via Docker later,
# along with conda.yaml, python_env.yaml and requirements.txt.

In [None]:
# Experiment 2: XGBoost classifier trained on the same train/test split.
xgb_clf=XGBClassifier(use_label_encoder=False,eval_metric='logloss')
xgb_clf.fit(X_train,y_train)
y_pred_xgb=xgb_clf.predict(X_test)
# Names are case-sensitive: the predictions live in `y_pred_xgb`, not `Y_pred_xgb`.
report=classification_report(y_test,y_pred_xgb)
# Show the report, as done for the first experiment; otherwise it is computed and discarded.
print(report)

# similarly, we can make Experiment 3 & 4, etc.

# Create a new, resampled training set with SMOTETomek.
# Only the training split is resampled; the test split is left untouched.
from imblearn.combine import SMOTETomek
smt=SMOTETomek(random_state=42)
X_train_res,y_train_res=smt.fit_resample(X_train,y_train)
# A bare expression in the middle of a cell is not displayed, so print the
# (classes, counts) pair to make the class balance after resampling visible.
print(np.unique(y_train_res,return_counts=True))

# We can create a list of models to compare.
# Each entry is a 5-tuple:
#   (display name, hyperparameter dict, estimator instance, (X, y) train set, (X, y) test set)
# The hyperparameter dicts must be real dict literals ("key": value); the
# keyword-style `{C=1, ...}` form is not valid Python.
models=[
    (
        "Logistic Regression",
        {"C":1,"solver":'liblinear'},
        LogisticRegression(),
        (X_train,y_train),
        (X_test,y_test)
    ),
    (
        "Random Forest",
        {"n_estimators":30,"max_depth":3},
        RandomForestClassifier(),
        (X_train,y_train),
        (X_test,y_test)
    ),
    (
        "XGBClassifier",
        {"use_label_encoder":False,"eval_metric":'logloss'},
        XGBClassifier(),
        (X_train,y_train),
        (X_test,y_test)
    ),
    (
        # Same estimator settings as above, but trained on the resampled training set.
        "XGBClassifier With SMOTE",
        {"use_label_encoder":False,"eval_metric":'logloss'},
        XGBClassifier(),
        (X_train_res,y_train_res),
        (X_test,y_test)
    )
]

# Go through all models, fit each one and collect its classification report.
# `reports[i]` is the output_dict=True report of `models[i]`.
reports=[]
# Unpack into loop-local names: rebinding X_train / y_train / X_test / y_test / params
# here would leave the notebook-level variables pointing at the last entry's
# (resampled) data and hyperparameters, silently changing what earlier cells do on re-run.
for model_name,model_params,model,(X_fit,y_fit),(X_eval,y_eval) in models:
    model.set_params(**model_params)
    model.fit(X_fit,y_fit)
    y_pred=model.predict(X_eval)
    report=classification_report(y_eval,y_pred,output_dict=True)
    reports.append(report)

# use git/dagshub
import dagshub
# we need envs for connection
import os
# Provide the real values through the environment (never paste credentials into
# the notebook). setdefault only fills in the empty placeholder when a variable
# is missing, so values that are already exported are not overwritten.
for _env_name in ('MLFLOW_TRACKING_USERNAME','MLFLOW_TRACKING_PASSWORD','MLFLOW_TRACKING_URI'):
    os.environ.setdefault(_env_name,'')
dagshub.init(repo_owner='learnpythonlanguage',repo_name='mlflow_dagshub_demo',mlflow=True)
# we need to change the url below to deploy to dagshub

# Set the tracking URI before selecting the experiment so the experiment is
# looked up / created on that server.
mlflow.set_tracking_uri("http://127.0.0.1:5000") # the localhost if MLFlow is run on local
mlflow.set_experiment("Anomaly Detection")

# Log one MLflow run per model: its name, hyperparameters, key metrics and the fitted model.
# `models` and `reports` are parallel lists, so they are walked together.
# (No trailing commas on the unpacked names: `x = value,` would create a 1-tuple.)
for (model_name,model_params,model,_train_set,_test_set),report in zip(models,reports):
    with mlflow.start_run(run_name=model_name):
        # log_param / log_params: a single key-value pair vs. a whole dict.
        mlflow.log_param('model_name',model_name)
        mlflow.log_params(model_params)
        # log_metrics takes a dict of metric name -> value.
        mlflow.log_metrics({
            'accuracy':report['accuracy'],
            'recall_0':report['0']['recall'],
            'recall_1':report['1']['recall'],
            'f1_score_macro':report['macro avg']['f1-score'],
        })
        # Pick the MLflow flavor from the display name of the model.
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model,"model")
        else:
            mlflow.sklearn.log_model(model,"model")

# now we can see everything we logged on the MLFlow interface
# we can see all the metrics and even compare them across different models!

In [None]:
# Register a model from an existing run under the name "XGB-Smote".
model_name="XGB-Smote"
# Strip stray whitespace/newlines that often come with a copy-pasted run ID.
run_id=input("Enter run ID:").strip()
# A run-relative model URI has the form "runs:/<run_id>/<artifact_path>".
# The artifact path is "model" because that is the name passed to log_model
# in the logging loop; it is not the registered model name.
model_uri=f"runs:/{run_id}/model"
result=mlflow.register_model(
    model_uri,model_name
)

# Load a registered model by name and version number.
# An alias can optionally be used instead of the version, e.g. f"models:/{model_name}@challenge".
model_version=1
model_uri_to_load=f"models:/{model_name}/{model_version}"
loaded_model=mlflow.xgboost.load_model(model_uri_to_load)

# Sanity check: print the first four predictions on the test set.
first_predictions=loaded_model.predict(X_test)[:4]
print(first_predictions)

# Transition from dev to prod: copy the dev model version carrying the
# "challenge" alias into the production registered model.
dev_model_uri=f"models:/{model_name}@challenge"
prod_model='anomaly-detection-prod'
# The client class is spelled `MlflowClient` (lower-case "lflow").
client=mlflow.MlflowClient()
client.copy_model_version(src_model_uri=dev_model_uri,dst_name=prod_model)

# NOTE(review): this load resolves the "champion" alias on the prod model; nothing
# in this cell assigns that alias, so presumably it is set in the UI first — confirm.
prod_model_uri=f"models:/{prod_model}@champion"
loaded_model_prod=mlflow.xgboost.load_model(prod_model_uri)

