# Welcome to lean MLOps @ DevConf 2023!



## Quick Overview 
- What is MLOps? 
- Comparison of SW products and ML products 
- A taster of unqiue aspects of operating machine learning products
- Demo 
- Q&A





### Lets pretend environment variables have been setup and injected into our container 

In [41]:
import os
import warnings
warnings.filterwarnings('ignore')

os.environ["MLFLOW_URI"] = "mlruns"
os.environ["LOCAL_DATASTORE"] = "data_local"
os.environ["REMOTE_DATASTORE"] = "data_dvc_remote"
os.environ["DUCK_DB_DWH"] = "data_local/feature_store.duckdb"

In [2]:
from mlops.featurestore import duck_db_conn

## DuckDB: a data scientist's and data engineers best friend!

A quick explore with duckdb, an analytics database that runs in-process and has first class features


In [3]:
import mlflow
import dvc
import duckdb

In [4]:
conn = duck_db_conn()


In [5]:
conn.sql("""
  CREATE SCHEMA IF NOT EXISTS raw;
  CREATE SCHEMA IF NOT EXISTS features_california_housing;
  CREATE TABLE IF NOT EXISTS raw.california_housing AS 
     (SELECT * FROM "starter_data/california_housing.parquet"
     );
  CREATE TABLE IF NOT EXISTS raw.german_credit AS 
     (SELECT * FROM "starter_data/german_credit.csv"
     );
""")


In [6]:
conn.sql("""
   select * from raw.california_housing limit 10
""").to_df()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.12,37.89,30.0,3227.0,733.0,1260.0,684.0,4.125,257100.0
1,-122.12,37.81,26.0,4048.0,513.0,1486.0,498.0,7.6717,416500.0
2,-122.12,37.7,41.0,3495.0,787.0,1849.0,750.0,2.679,144900.0
3,-122.12,37.69,35.0,2681.0,508.0,1580.0,536.0,4.1042,179100.0
4,-122.12,37.64,40.0,432.0,102.0,264.0,77.0,3.8875,228100.0
5,-122.12,37.48,36.0,880.0,177.0,795.0,188.0,3.8194,159400.0
6,-122.12,37.41,33.0,2892.0,617.0,1250.0,581.0,5.3727,360900.0
7,-122.12,37.16,32.0,1602.0,317.0,752.0,275.0,5.1664,185100.0
8,-122.13,39.74,20.0,1401.0,280.0,668.0,250.0,2.2569,94300.0
9,-122.13,37.74,41.0,4400.0,666.0,1476.0,648.0,5.0,248900.0


In [47]:
conn.sql("""
   CREATE OR REPLACE TABLE features_california_housing.social_feats AS
   (
      select 
          round(population/households, 4) as household_density,
          (median_income * 10000)/population as median_income_per_person,
          round(total_bedrooms/total_rooms) as bed_room_ratio,
          households,
          population,
          median_house_value
      from raw.california_housing

   );

   CREATE  OR REPLACE  TABLE features_california_housing.location_features AS
   (
      select 
           latitude,
           longitude,
           round(total_rooms/households) as mean_rooms,
           round(total_bedrooms/households) as mean_bedrooms,
           median_house_value,
      from raw.california_housing

   );


""")

In [8]:
from mlops.utils import DuckDBUtils
db_utils = DuckDBUtils(conn)

In [7]:
import plotly.express as px
df = conn.sql("select * from features_california_housing.social_feats").to_df()
px.scatter_3d(df, x='household_density', 
              y='median_house_value', 
              z='median_income_per_person', 
              size='households')

### Lets get training

Think of the cell below as a module in our code base

In [51]:
from mlops.pipeline import MLPipeline, MLProduct, MLProductVariant
from mlops.ml_common import get_random_name, get_next_model_id_for
from mlops.utils import get_logger
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from typing import Tuple


social_variant = MLProductVariant(MLProduct("california_housing"), variant_name="social_features")

def social_features()->duckdb.DuckDBPyRelation:
    conn = duck_db_conn()
    return conn.sql("""
                      DROP SEQUENCE indexer;
                      CREATE SEQUENCE indexer START 1;
                      select
                          nextval('indexer') as idx,
                          round(population/households, 4) as household_density,
                          (median_income * 10000)/population as median_income_per_person,
                          round(total_bedrooms/total_rooms) as bed_room_ratio,
                          households,
                          population,
                          median_house_value
                      from raw.california_housing""")


def train_test_split(featset: duckdb.DuckDBPyRelation, frac: float)->Tuple[duckdb.DuckDBPyRelation, duckdb.DuckDBPyRelation]:
    conn = duck_db_conn()
    train_data = conn.sql(f"""
                         select * from featset using sample {frac*100}%
                          """)
    test_data = conn.sql("""
                         select f.* from featset f 
                         anti join train_data tr
                         on f.idx = tr.idx
                         """)
    return (train_data, test_data)



def log_datasets(train_data: duckdb.DuckDBPyRelation, test_data: duckdb.DuckDBPyRelation, ml_prod_info: MLProductVariant):
    conn = duck_db_conn()
    path = f"""data_local/features/{ml_prod_info.ml_product.ml_product_name}/{ml_prod_info.variant_name}"""
    conn.sql(f"""COPY train_data TO '{path}/tr_data.parquet' 
                 (FORMAT PARQUET);
               """)
    conn.sql(f"""COPY test_data TO '{path}/te_data.parquet' 
                 (FORMAT PARQUET);
               """)
    

def train_model(featset: duckdb.DuckDBPyRelation):

    pred_vars = ["household_density", "median_income_per_person", "bed_room_ratio", "households", "population"]
    response_var = "median_house_value"
    conn = duck_db_conn()
    
    rand_name = get_random_name()
    mlflow.set_experiment(social_variant.ml_product.ml_product_name)
    with mlflow.start_run(run_name=rand_name) as active_run:
        mlflow.set_tag('training.source', 'lean MLops @ devconf')
        mlflow.set_tag('variant_name', social_variant.variant_name)
        mlflow.set_tag('model_version', get_next_model_id_for(social_variant.ml_product.ml_product_name, social_variant.variant_name))
        mlflow.set_tag('model_type', "LinearRegressor")
        
        train_data, test_data = train_test_split(featset, 0.7)
        train_df = train_data.to_df()
        test_df = test_data.to_df()

        log_datasets(train_data, test_data, social_variant)
        
        X_tr = train_df[pred_vars]
        X_te = test_df[pred_vars]
        y_tr = train_df[response_var]
        y_te = test_df[response_var]
        mlflow.log_metric("training obersavations", X_tr.shape[0])
        mlflow.log_metric("test obersavations", X_te.shape[0])
        
        
        fitted_model = LinearRegression().fit( X_tr, y_tr)
        y_tr_pred = fitted_model.predict(X_tr)
        y_te_pred = fitted_model.predict(X_te)
        test_r2 = r2_score(y_te, y_te_pred).round(3)
        test_rmse = round(np.sqrt(mean_squared_error(y_te,y_te_pred)),0)
        mlflow.log_metric("test r2_score", test_r2)
        mlflow.log_metric("test rmse", test_rmse)

        mlflow.sklearn.log_model(fitted_model, 'model')


In [52]:
rel = social_features()
train_model(rel)

## Its all about location!
![map of california with area median property values](images/location_location.png "Title")

Lets use some lcoation oriented features with a Random Forrest Regressor.




In [53]:
from mlops.pipeline import MLPipeline


class CaliforniaHousingLocationPipeline(MLPipeline):

    def __init__(self):
        super().__init__(product_name="california_housing",  variant_name="location_features")
        
    def location_features(self)->duckdb.DuckDBPyRelation:
        return self._duck_db.sql("""
                          DROP SEQUENCE indexer;
                          CREATE SEQUENCE indexer START 1;
                           select 
                               nextval('indexer') as idx,
                               latitude,
                               longitude,
                               round(total_rooms/households) as mean_rooms,
                               round(total_bedrooms/households) as mean_bedrooms,
                               median_house_value,
                          from raw.california_housing""")

    def train_model(self, featset: duckdb.DuckDBPyRelation):
    
        pred_vars = ["latitude", "longitude","mean_rooms", "mean_bedrooms"]
        response_var = "median_house_value"
        
        rand_name = get_random_name()
        mlflow.set_experiment(self._info.ml_product.ml_product_name)
        with mlflow.start_run(run_name=rand_name) as active_run:
            mlflow.set_tag('training.source', 'lean MLops @ devconf')
            mlflow.set_tag('variant_name', self._info.variant_name)
            mlflow.set_tag('model_version', get_next_model_id_for(self._info.ml_product.ml_product_name, self._info.variant_name))
            mlflow.set_tag('model_type', "RandomForestRegressor")
            
            train_data, test_data = train_test_split(featset, 0.7)
            train_df = train_data.to_df()
            test_df = test_data.to_df()
    
            log_datasets(train_data, test_data, self._info)
            
            X_tr = train_df[pred_vars]
            X_te = test_df[pred_vars]
            y_tr = train_df[response_var]
            y_te = test_df[response_var]
            mlflow.log_metric("training obersavations", X_tr.shape[0])
            mlflow.log_metric("test obersavations", X_te.shape[0])
            
            
            fitted_model = RandomForestRegressor(n_estimators=10).fit( X_tr, y_tr)
            y_tr_pred = fitted_model.predict(X_tr)
            y_te_pred = fitted_model.predict(X_te)
            test_r2 = r2_score(y_te, y_te_pred).round(3)
            test_rmse = round(np.sqrt(mean_squared_error(y_te,y_te_pred)),0)
            mlflow.log_metric("test r2_score", test_r2)
            mlflow.log_metric("test rmse", test_rmse)
    
            mlflow.sklearn.log_model(fitted_model, 'model')

    def run(self):
        location_features = self.location_features()
        self.train_model(location_features)
         


In [54]:
location_training_pipe = CaliforniaHousingLocationPipeline()
location_training_pipe.run()

## Prediction 


In [44]:
conn.sql("""
   select * from raw.california_housing limit 10
""").to_df()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.12,37.89,30.0,3227.0,733.0,1260.0,684.0,4.125,257100.0
1,-122.12,37.81,26.0,4048.0,513.0,1486.0,498.0,7.6717,416500.0
2,-122.12,37.7,41.0,3495.0,787.0,1849.0,750.0,2.679,144900.0
3,-122.12,37.69,35.0,2681.0,508.0,1580.0,536.0,4.1042,179100.0
4,-122.12,37.64,40.0,432.0,102.0,264.0,77.0,3.8875,228100.0
5,-122.12,37.48,36.0,880.0,177.0,795.0,188.0,3.8194,159400.0
6,-122.12,37.41,33.0,2892.0,617.0,1250.0,581.0,5.3727,360900.0
7,-122.12,37.16,32.0,1602.0,317.0,752.0,275.0,5.1664,185100.0
8,-122.13,39.74,20.0,1401.0,280.0,668.0,250.0,2.2569,94300.0
9,-122.13,37.74,41.0,4400.0,666.0,1476.0,648.0,5.0,248900.0


In [55]:
import mlflow
import pandas as pd
logged_model = 'runs:/7ec0022616b7427a8db04963288e5bee/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
out_of_sample_test = pd.DataFrame([ {"latitude": 32.843, "longitude": 117.270,"mean_rooms": 10 , "mean_bedrooms": 5 },
                                   {"latitude": 32.84, "longitude": -117.46 ,"mean_rooms": 5, "mean_bedrooms": 3 }   
                                 ])
loaded_model.predict(out_of_sample_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([ 62030.        , 384446.66666667])

In [49]:
import plotly.express as px

df = conn.sql("""
    select * from features_california_housing.location_features 
""").to_df()

px.histogram(df, x="median_house_value")


## DVC 
Data Version Control is nascent tool from iterative.ai that uses git to track dataset and commit manifest information to your ml product code repositories

In [33]:
!dvc add data_local/features/california_housing/social_features/tr_data.parquet

[?25l                                                                          ⠋ Checking graph
Adding...                                                                       
!
  0% Checking cache in '/Users/austin/dev/ee/dev-conf-2023-lean-mlops/.dvc/cache
                                                                                
!
  0%|          |Transferring                          0/? [00:00<?,     ?file/s]
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s]
                                                                                
!
  0%|          |Checking out data_local/features/calif0/? [00:00<?,    ?files/s]
  0%|          |Checking out data_local/features/calif0/1 [00:00<?,    ?files/s]
100% Adding...|████████████████████████████████████████|1/1 [00:00, 49.48file/s]


In [None]:
### DVC

```shell
dvc init
dvc add <dataset>

```

## German credit - AI governance and data healing

This dataset from 1994 originally contained some bias encoding features, such as:

![map of california with area median property values](images/bias_features.jpg "Title")

In [50]:
conn.sql("""
            select *
            from raw.german_credit limit 10
        """).to_df()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
5,35,male,1,free,,,9055,36,education,good
6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
7,35,male,3,rent,little,moderate,6948,36,car,good
8,61,male,1,own,rich,,3059,12,radio/TV,good
9,28,male,3,own,little,moderate,5234,30,car,bad


## AI Governance concerns

![governance considerations](images/ai_governance.png "AI Governance")


## Model evaluaton and performance 

![model performance](images/model_eval_and_performance.png "Model performance")

