# Experiments with MLflow (training model using Linear Regression)

**Readme.md contains additional details - please take a look**

In [None]:
# install mlflow if its missing 
!pip install mlflow

In [None]:
# imports
import pandas as pd
import mlflow.sklearn
import missingno as msno
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Notebook-wide constants.
# FIG_SIZE is handed to matplotlib as `figsize`, i.e. (width, height) in inches.
FIG_SIZE = (8, 6)

### Loading data

You can use stored csv or get data from the source

In [None]:
# Option 1 (offline): read the copy stored in a local folder instead, e.g.
#   raw_ccpp_data = pd.read_csv('MLFLow_Experiments/static/data/CCPP_data.csv')
# Option 2 (used here): download the CSV from the source.
# skipinitialspace=True tells pandas to skip spaces that follow a delimiter.
raw_ccpp_data = pd.read_csv('https://storage.googleapis.com/aipi_datasets/CCPP_data.csv',
                            skipinitialspace=True)

In [None]:
# Let us examine the resulting dataframe: column names, dtypes and non-null counts.
# info must be *called*; the bare attribute would only display the bound-method repr.
raw_ccpp_data.info()

In [None]:
# Display the raw dataframe's summary statistics; as the last expression of the
# cell the resulting table is rendered as the cell output.
raw_ccpp_data.describe()

### Data cleaning step

We want to ensure that the data falls within the following specifications:
- Temperature (AT) in the range 1.81°C to 37.11°C
- Ambient Pressure (AP) in the range 992.89-1033.30 millibar
- Relative Humidity (RH) in the range 25.56% to 100.16%
- Exhaust Vacuum (V) in the range 25.36-81.56 cm Hg
- Net hourly electrical energy output (PE) 420.26-495.76 MW (Target we are trying to predict)

In [None]:
# Build a boolean mask that is True for rows whose four readings all lie inside
# the specified ranges (both bounds inclusive), then select those rows into a
# new dataframe. The raw dataframe is left untouched.
in_spec = (
    raw_ccpp_data['AT'].between(1.81, 37.11)
    & raw_ccpp_data['V'].between(25.36, 81.56)
    & raw_ccpp_data['AP'].between(992.89, 1033.30)
    & raw_ccpp_data['RH'].between(25.56, 100.16)
)
ccpp_data = raw_ccpp_data[in_spec]

In [None]:
# Display the filtered dataframe's summary statistics, including all columns.
# The min/max rows let us confirm that the values stay inside the ranges outlined above.
ccpp_data.describe(include='all')

### Check for missing data

using this helpful utility https://github.com/ResidentMario/missingno

```...missingno provides a small toolset of flexible and easy-to-use missing data visualizations and utilities that allows you to get a quick visual summary of the completeness (or lack thereof) of your dataset. Just pip install missingno to get started.```

I am using ```msno.matrix```; as per the official documentation, "The msno.matrix nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion." Also take a look at the sparkline on the right, as it's helpful as well: "The sparkline at right summarizes the general shape of the data completeness and points out the rows with the maximum and minimum nullity in the dataset."


In [None]:
# Draw the nullity matrix at the notebook's standard figure size.
# The size has to be given to msno.matrix itself: calling plt.figure() after the
# plot was drawn would only open a second, empty figure and leave the matrix at
# missingno's default size.
msno.matrix(ccpp_data, figsize=FIG_SIZE)
plt.show()
# Absence of gaps in this plot is actually a good thing - the data is complete!

### Features and target selection

In [None]:
# Target variable - the one we want to predict
y = ccpp_data['PE']
# Every remaining column (all 4 of them) serves as a selected feature
X = ccpp_data.drop(columns='PE')

Split the data into training and testing sets

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Let us see what is in each set

In [None]:
# Summary statistics of the training features, printed under a heading
print("Training", X_train.describe(include='all'), sep="\n")

In [None]:
# Summary statistics of the held-out test features, printed under a heading
print("Evaluation", X_test.describe(include='all'), sep="\n")

Prepare for training model and capturing experiment info with mlflow

MUST start mlflow prior to training the model. Use the following command
```mlflow ui```

To check that the service is running, go to
http://127.0.0.1:5000

Few important points
1. mlflow is set to autolog for sklearn
2. log_models is set to True
3. log_datasets is set to True

Note: If you use set_tracking_uri(), you should set_experiment() after that.

In [None]:
# Address of the MLflow tracking server (started with `mlflow ui`).
# UPDATE this if your server listens on a different host or port.
mlflow.set_tracking_uri('http://127.0.0.1:5000')

# The model trained in this notebook is a LinearRegression, so the experiment
# (and the registered model below) are labelled accordingly.
experiment_name = 'Training_Linear_Regression_model_on_CCPP_data'

experiment = None
try:
    # set_experiment creates the experiment when it does not exist yet, makes it
    # the active one and returns it, so no separate lookup is normally needed.
    experiment = mlflow.set_experiment(experiment_name)
except Exception as ex:
    # api sometimes might throw an error: report it and fall back to a plain lookup
    print("error occured ", ex)
    experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    # Stop here with an actionable message; otherwise the training cell would
    # fail later with an AttributeError on `experiment.experiment_id`.
    raise RuntimeError(
        f"MLflow experiment '{experiment_name}' is not available. "
        "Check that the tracking server is running (`mlflow ui`) and reachable "
        "at the configured tracking URI."
    )

print(f"Experiment {experiment}")

# Enable mlflow autologging for scikit-learn: fitted models and the datasets
# they were trained on are logged, and each model is registered under the given name.
mlflow.sklearn.autolog(disable=False,
                       log_models=True,
                       log_datasets=True,
                       registered_model_name="CCPP Linear Regression",
                       )

Set tags that will be useful for ML runs experiments

In [None]:
# Set tags - metadata about the model that is attached to every run.
# MLflow stores tag values as strings, so the feature names are joined into a
# readable comma-separated list instead of passing the pandas Index object
# (which would be stored as its repr, e.g. "Index([...], dtype='object')").
tags = {"team": "Engineering Team Name",
        "dataset": "CCPP model",
        "release.version": "1.2.3",
        "inputs": ", ".join(map(str, X_train.columns)),
        "target": "PE"}

# Timestamp used to give each run a distinguishable name
now = datetime.now() # current date and time
experiment_date = now.strftime("%m/%d/%Y, %H:%M:%S")

Run our experiment with linear regression model

Few points here
1. setup run and use datetime stamp or some other unique identifier

In [None]:
# Train and evaluate a linear regression model inside a single MLflow run.
# The run is named with the timestamp so repeated executions can be told apart.
with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name='linear_regression_exp__' + experiment_date):
    # save tags
    mlflow.set_tags(tags)
    # instantiate the model; n_jobs is passed through to scikit-learn unchanged
    model = LinearRegression(n_jobs=5)
    model.fit(X_train, y_train)

    # Model evaluation on the held-out test set
    y_pred = model.predict(X_test)

    # calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is the square root of MSE. It is derived from `mse` directly because
    # the `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    # save the metrics for future comparison in MLflow
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

Display metrics collected

1. MAE - measures the average absolute difference between the predicted values and the actual values. It gives you a sense of how far off your predictions are from the true values, without considering the direction (overestimation or underestimation).
1. MSE - calculates the average of the squared differences between predicted values and actual values. It emphasizes larger errors more than smaller ones due to the squaring.
1. RMSE - is the square root of the MSE. It provides a measure of the average magnitude of the errors in the same unit as the dependent variable, making it more interpretable.
1. R-squared - also known as the coefficient of determination, measures the proportion of the variance in the dependent variable that is predictable from the independent variables in a regression model. It usually ranges from 0 to 1, where 1 indicates a perfect fit and 0 means the model does no better than always predicting the mean; on held-out data it can even be negative when the model does worse than that.

In [None]:
# Report the four evaluation metrics, one per line
print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R-squared: {r2}",
    sep="\n",
)

Let us use the model we just trained and predict the output PE using sample data

In [None]:
# Let's try to predict.
# Input features for prediction (replace with your own feature values);
# a single-row dataframe with the same feature columns the model was trained on.
new_data = pd.DataFrame({
    'AT': 14.86,
    'V': 41.66,
    'AP': 1024.06,
    'RH': 73.16,
}, index=[0])

# PE's expected output is about
expected_output = 463.26

# Predict the electrical energy output on the new data
predicted_energy_output = model.predict(new_data)
predicted_value = predicted_energy_output[0]

# Accuracy is reported as 100% minus the relative error, so both over- and
# under-prediction lower it. The plain ratio expected/predicted would exceed
# 100% whenever the model under-predicts.
relative_error = abs(predicted_value - expected_output) / expected_output
accuracy_pct = (1 - relative_error) * 100

print(f"Predicted Electrical Energy Output: {predicted_value} and expected {expected_output}. Accuracy of {accuracy_pct}%")

Visualize predicted vs actual

In [None]:
# Scatter plot to visualize predicted vs actual energy output.
# note: y_test holds the actual energy output values (x axis) and y_pred the
# predicted values (y axis).

fig, ax = plt.subplots(figsize=FIG_SIZE)
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual Energy Output')
ax.set_ylabel('Predicted Energy Output')
ax.set_title('Actual vs. Predicted Energy Output')
ax.grid(True)
plt.show()