In [1]:
import pickle
import sys
import warnings
from itertools import cycle

# Configure the warning filter before the third-party imports so that
# import-time warnings are silenced as well.
warnings.filterwarnings("ignore")

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import xgboost
from mlflow.models import infer_signature
from sklearn import datasets
from sklearn.linear_model import ElasticNet
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

np.random.seed(40)

In [2]:
# Read the train/test CSVs and separate the feature columns from the target column.
TARGET_COL = 'log_total_users'

train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

X_train, y_train = train.drop(columns=TARGET_COL), train[TARGET_COL]
X_test, y_test = test.drop(columns=TARGET_COL), test[TARGET_COL]

In [3]:
# Load the pickled preprocessing object (it is used through `.transform` below).
# NOTE(review): unpickling executes code stored in the file -- only load
# 'preprocess.pkl' from a trusted source.
with open('preprocess.pkl', 'rb') as pkl_file:
    pre = pickle.load(pkl_file)

In [4]:
# Run both feature frames through the loaded preprocessor (transform only, train first).
X_train_tf, X_test_tf = (pre.transform(frame) for frame in (X_train, X_test))

In [5]:
def eval_metrics(actual, pred):
    """Return ``(rmse, mape)`` for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned element-wise with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mape)`` where ``rmse`` is the square root of
        ``mean_squared_error(actual, pred)`` and ``mape`` is
        ``mean_absolute_percentage_error(actual, pred)``.
    """
    # Take the root explicitly: the function reports RMSE, not raw MSE.
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mape = mean_absolute_percentage_error(actual, pred)
    return rmse, mape

In [6]:
# Name of the MLflow experiment that is looked up and activated in the cells below
experiment_name = "end-to-end-mlops"

In [7]:
# Look up an existing experiment with this name; the result is None when
# MLflow has no experiment by that name.
experiment = mlflow.get_experiment_by_name(experiment_name)

In [8]:
# Activate the experiment. `mlflow.set_experiment` creates the experiment when
# no experiment with this name exists, so no separate create step is needed.
# A check-then-create based on a variable assigned in another cell is
# deliberately avoided: that variable goes stale, and re-running this cell
# would then call `create_experiment` for a name that already exists.
experiment = mlflow.set_experiment(experiment_name)

# Display the active experiment
experiment

<Experiment: artifact_location='file:///D:/praxis/Term%202/mlops/end%20to%20end%20mlops/end_to_end_mlops_project/End_to_End_MLops/mlruns/374554894569616120', creation_time=1704970263148, experiment_id='374554894569616120', last_update_time=1704970263148, lifecycle_stage='active', name='end-to-end-mlops', tags={}>

In [None]:

# Load the diabetes dataset through the `datasets` module
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Assemble features and target into one DataFrame; the target is appended
# as the last column under the name "progression"
Y = y.reshape(-1, 1)
d = np.hstack((X, Y))
cols = [*diabetes.feature_names, "progression"]
data = pd.DataFrame(d, columns=cols)

def eval_metrics(actual, pred):
    """Return ``(rmse, mae, r2)`` for a set of predictions.

    NOTE: this shadows the two-value ``eval_metrics`` defined earlier in the
    notebook; after this cell runs, callers receive three values.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned with ``actual``.

    Returns
    -------
    tuple
        Square root of ``mean_squared_error``, ``mean_absolute_error`` and
        ``r2_score``, in that order.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

# Hold out a test set. (0.75, 0.25) split.
# NOTE: this rebinds `train` and `test`, which earlier held the CSV frames.
train, test = train_test_split(data)

# "progression" is the predicted column: a quantitative measure of disease
# progression one year after baseline. Every other column is a feature.
target_col = "progression"
train_x, train_y = train.drop(columns=[target_col]), train[[target_col]]
test_x, test_y = test.drop(columns=[target_col]), test[[target_col]]

# Define the experiment name
experiment_name = "diaclf_experiment"

# Activate the experiment. `mlflow.set_experiment` creates the experiment when
# no experiment with this name exists and returns the Experiment object, so a
# separate lookup + create step is unnecessary and `experiment` is never left
# as None after a first-time creation.
experiment = mlflow.set_experiment(experiment_name)

# Hyperparameter grid: every (alpha, l1_ratio) pair is trained in its own MLflow run
alpha_values = [0.01, 0.05]
l1_ratio_values = [0.01, 0.05]
param_grid = [(a, r) for a in alpha_values for r in l1_ratio_values]

for alpha, l1_ratio in param_grid:
    with mlflow.start_run(run_name=f"alpha_{alpha}_l1_ratio_{l1_ratio}"):
        # Fit ElasticNet on the training split and score it on the test split
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)
        predicted_qualities = lr.predict(test_x)
        rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

        # Report the metrics for this configuration
        print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R2: {r2}")

        # Build the model signature from the training inputs and the
        # model's predictions on those inputs
        predictions = lr.predict(train_x)
        signature = infer_signature(train_x, predictions)

        # Record parameters, metrics and the fitted model on the active run
        for key, value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
            mlflow.log_param(key, value)
        for key, value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
            mlflow.log_metric(key, value)
        mlflow.sklearn.log_model(lr, "model", signature=signature)

# Each run is started in a `with` block above; this explicit call is kept as a
# final safeguard so that no run is left active after the loop.
mlflow.end_run()

# Resolve the ID of the experiment named by `experiment_name`
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# Fetch all runs of that experiment. `experiment_ids` is documented as a list
# of experiment IDs, so the single ID is wrapped in a list.
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Display the runs DataFrame
runs
