In [2]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

# Generate dataset
x, y = make_classification(
    n_samples=1000,          # Number of samples
    n_features=10,           # Total number of features
    n_informative=2,         # Number of informative features
    n_redundant=8,           # Number of redundant features
    n_classes=2,             # Number of classes
    weights=[0.9, 0.1],      # Class weights
    flip_y=0,                # Noise in labels
    random_state=42          # Random state for reproducibility
)

np.unique(y,return_counts=True)

(array([0, 1]), array([900, 100], dtype=int64))

In [3]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=42)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Define Logistic Regression with more parameters
log_reg = LogisticRegression(
    penalty='l2',           # L2 regularization
    C=0.1,                  # Regularization strength
    solver='liblinear',     # Optimization solver
    max_iter=200,           # Maximum iterations
    random_state=42         # Random state for reproducibility
)

# Train the model
log_reg.fit(x_train, y_train)

# Make predictions
y_pred_log_reg = log_reg.predict(x_test)

# Classification report as a dictionary
log_reg_report = classification_report(y_test, y_pred_log_reg, output_dict=True)
print("Logistic Regression Report (Dictionary):\n", log_reg_report)


Logistic Regression Report (Dictionary):
 {'0': {'precision': 0.9510869565217391, 'recall': 0.9831460674157303, 'f1-score': 0.9668508287292817, 'support': 178.0}, '1': {'precision': 0.8125, 'recall': 0.5909090909090909, 'f1-score': 0.6842105263157895, 'support': 22.0}, 'accuracy': 0.94, 'macro avg': {'precision': 0.8817934782608696, 'recall': 0.7870275791624106, 'f1-score': 0.8255306775225356, 'support': 200.0}, 'weighted avg': {'precision': 0.9358423913043478, 'recall': 0.94, 'f1-score': 0.9357603954637976, 'support': 200.0}}


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define Random Forest with more parameters
rf = RandomForestClassifier(
    n_estimators=200,       # Number of trees
    max_depth=10,           # Maximum depth of trees
    min_samples_split=5,    # Minimum samples required to split an internal node
    min_samples_leaf=2,     # Minimum samples required at a leaf node
    bootstrap=True,         # Use bootstrap samples
    random_state=42         # Random state for reproducibility
)

# Train the model
rf.fit(x_train, y_train)

# Make predictions
y_pred_rf = rf.predict(x_test)

# Classification report as a dictionary
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
print("Random Forest Report (Dictionary):\n", rf_report)


Random Forest Report (Dictionary):
 {'0': {'precision': 0.967391304347826, 'recall': 1.0, 'f1-score': 0.9834254143646409, 'support': 178.0}, '1': {'precision': 1.0, 'recall': 0.7272727272727273, 'f1-score': 0.8421052631578947, 'support': 22.0}, 'accuracy': 0.97, 'macro avg': {'precision': 0.9836956521739131, 'recall': 0.8636363636363636, 'f1-score': 0.9127653387612678, 'support': 200.0}, 'weighted avg': {'precision': 0.9709782608695652, 'recall': 0.97, 'f1-score': 0.9678801977318989, 'support': 200.0}}


In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Define XGBoost with default parameters
xgb = XGBClassifier(
    n_estimators=100,       # Number of boosting rounds
    learning_rate=0.1,      # Learning rate
    max_depth=6,            # Maximum tree depth
    subsample=0.8,          # Subsample ratio of the training instance
    colsample_bytree=0.8,   # Subsample ratio of columns when constructing each tree
    random_state=42,        # Random state for reproducibility
    use_label_encoder=False, 
    eval_metric='logloss'
)

# Train the model
xgb.fit(x_train, y_train)

# Make predictions
y_pred_xgb = xgb.predict(x_test)

# Classification report as a dictionary
xgb_report = classification_report(y_test, y_pred_xgb, output_dict=True)
print("XGBoost Report (Dictionary):\n", xgb_report)


XGBoost Report (Dictionary):
 {'0': {'precision': 0.9726775956284153, 'recall': 1.0, 'f1-score': 0.9861495844875346, 'support': 178.0}, '1': {'precision': 1.0, 'recall': 0.7727272727272727, 'f1-score': 0.8717948717948718, 'support': 22.0}, 'accuracy': 0.975, 'macro avg': {'precision': 0.9863387978142076, 'recall': 0.8863636363636364, 'f1-score': 0.9289722281412032, 'support': 200.0}, 'weighted avg': {'precision': 0.9756830601092896, 'recall': 0.975, 'f1-score': 0.9735705660913417, 'support': 200.0}}


Parameters: { "use_label_encoder" } are not used.



In [7]:
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Apply SMOTETomek
smote_tomek = SMOTETomek(random_state=42)
x_train_balanced, y_train_balanced = smote_tomek.fit_resample(x_train, y_train)

# Define XGBoost
xgb_balanced = XGBClassifier(
    n_estimators=150,       # Number of boosting rounds
    learning_rate=0.05,     # Learning rate
    max_depth=5,            # Maximum tree depth
    subsample=0.9,          # Subsample ratio of the training instance
    colsample_bytree=0.9,   # Subsample ratio of columns when constructing each tree
    random_state=42,        # Random state for reproducibility
    use_label_encoder=False, 
    eval_metric='logloss'
)

# Train the model on balanced data
xgb_balanced.fit(x_train_balanced, y_train_balanced)

# Make predictions
y_pred_xgb_balanced = xgb_balanced.predict(x_test)

# Classification report as a dictionary
xgb_balanced_report = classification_report(y_test, y_pred_xgb_balanced, output_dict=True)
print("XGBoost (Balanced Data) Report (Dictionary):\n", xgb_balanced_report)


XGBoost (Balanced Data) Report (Dictionary):
 {'0': {'precision': 0.9777777777777777, 'recall': 0.9887640449438202, 'f1-score': 0.9832402234636871, 'support': 178.0}, '1': {'precision': 0.9, 'recall': 0.8181818181818182, 'f1-score': 0.8571428571428571, 'support': 22.0}, 'accuracy': 0.97, 'macro avg': {'precision': 0.9388888888888889, 'recall': 0.9034729315628192, 'f1-score': 0.9201915403032721, 'support': 200.0}, 'weighted avg': {'precision': 0.9692222222222223, 'recall': 0.97, 'f1-score': 0.9693695131683958, 'support': 200.0}}


Parameters: { "use_label_encoder" } are not used.



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report

# Apply SMOTETomek to balance the data
smote_tomek = SMOTETomek(random_state=42)
x_train_balanced, y_train_balanced = smote_tomek.fit_resample(x_train, y_train)

# Define models with parameters
models = [
    (
        'Logistic Regression',
        LogisticRegression,
        {'penalty': 'l2', 'C': 0.1, 'solver': 'liblinear', 'max_iter': 200, 'random_state': 42},
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        'Random Forest',
        RandomForestClassifier,
        {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 
         'bootstrap': True, 'random_state': 42},
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        'XGBoost',
        XGBClassifier,
        {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 6, 'subsample': 0.8, 
         'colsample_bytree': 0.8, 'random_state': 42, 'use_label_encoder': False, 'eval_metric': 'logloss'},
        (x_train, y_train),
        (x_test, y_test)
    ),
    (
        'XGBoost (Balanced Data)',
        XGBClassifier,
        {'n_estimators': 150, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.9, 
         'colsample_bytree': 0.9, 'random_state': 42, 'use_label_encoder': False, 'eval_metric': 'logloss'},
        (x_train_balanced, y_train_balanced),
        (x_test, y_test)
    )
]

report_final=[]
# Train each model and print classification report
for name, model_class, params, train_data, test_data in models:
    x_train_model, y_train_model = train_data
    x_test_model, y_test_model = test_data
    
    # Initialize the model with parameters
    model = model_class(**params)
    
    # Train the model
    model.fit(x_train_model, y_train_model)
    
    # Make predictions
    y_pred = model.predict(x_test_model)
    
    # Classification report
    report = classification_report(y_test_model, y_pred, output_dict=True)
    report_final.append(report)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [9]:
import mlflow as mf

# Set MLflow experiment and tracking URI
experiment_name = 'anomaly_detection'
mf.set_tracking_uri('http://127.0.0.1:5000')

# Ensure the experiment exists and is correctly set
try:
    mf.set_experiment(experiment_name)
except Exception as e:
    print(f"Error setting experiment: {e}")

# Iterate over models and log parameters, metrics, and models to MLflow
for i, element in enumerate(models):
    model_name = element[0]         # Extract model name
    model_class = element[1]        # Extract model class
    params = element[2]             # Extract parameters
    train_data = element[3]         # Extract train data
    test_data = element[4]          # Extract test data
    report = report_final[i]        # Extract classification report for the model

    # Initialize the model with parameters
    model = model_class(**params)
    x_train_model, y_train_model = train_data
    x_test_model, y_test_model = test_data
    
    # Train the model
    model.fit(x_train_model, y_train_model)
    
    # Log model details in MLflow
    with mf.start_run(run_name=model_name):
        # Log model name as a parameter
        mf.log_param('model_name', model_name)

        # Log all parameters of the model
        for param_name, param_value in params.items():
            mf.log_param(param_name, param_value)

        # Log performance metrics
        mf.log_metrics({
            'accuracy': report['accuracy'],
            'recall_0': report['0']['recall'],
            'recall_1': report['1']['recall'],
            'macro_f1_score': report['macro avg']['f1-score']
        })

        # Log the trained model
        if 'XGB' in model_name:
            mf.xgboost.log_model(model, 'model')
        else:
            mf.sklearn.log_model(model, 'model')


Error setting experiment: Cannot set a deleted experiment 'anomaly_detection' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.




🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/0/runs/fab2329a2c834579a17326e289d9b5e8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Parameters: { "use_label_encoder" } are not used.



🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/0/runs/bc52f561671145d9b724254ff1de5f26
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Parameters: { "use_label_encoder" } are not used.



🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/0/runs/2109c58a0d454298a665c75fec8f4628
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0




🏃 View run XGBoost (Balanced Data) at: http://127.0.0.1:5000/#/experiments/0/runs/fca128ec58574445b6f9a6f72bfa3e4d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


### Register model

In [10]:
import mlflow.xgboost

model_name=input('model_name')
run_id=input('enter run id')
uri=f'runs:/{run_id}/model'
result=mlflow.register_model(model_uri=uri,name=model_name)

model_name xgboost
enter run id 62228c56cf13416f8d3f933d09961b6a


Registered model 'xgboost' already exists. Creating a new version of this model...
2024/12/13 14:26:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost, version 2
Created version '2' of model 'xgboost'.


### Load model

In [11]:
model_name=input('enter model name')
model_version=1
uri=f'models:/{model_name}@challenger'
model=mlflow.xgboost.load_model(uri)
y_pred_xg=model.predict(x_test)

enter model name xgboost


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0])

### Production

In [14]:
uri=f'models:/{model_name}@challenger'
m_name='xg123'
client=mlflow.MlflowClient()
client.copy_model_version(src_model_uri=uri,dst_name=m_name)

Successfully registered model 'xg123'.
Copied version '1' of model 'xgboost' to version '1' of model 'xg123'.


<ModelVersion: aliases=[], creation_timestamp=1734080562713, current_stage='None', description='', last_updated_timestamp=1734080562713, name='xg123', run_id='c5f1a586c1364e36bf9355d6cab1c229', run_link='', source='models:/xgboost/1', status='READY', status_message='', tags={}, user_id='', version='1'>

In [18]:
uri=f'models:/{m_name}@champion'
model=mlflow.xgboost.load_model(uri)
y_pred_xg_prod=model.predict(x_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
y_pred_xg_prod

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0])