Let's train the model with some observability: we'll use MLflow to record each run's tags, evaluation metric, and model artifact.

In [12]:
# **DATA PROCESSING**
import numpy as np # Array Processing
import pandas as pd # Data Processing
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

# **MACHINE LEARNING MODELS**
from sklearn.linear_model import LinearRegression

# **METRICS**
from sklearn.metrics import r2_score

# **EXPERIMENT TRACKING**
import mlflow
import pickle

In [13]:
# Where runs are recorded and which experiment groups them.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Bodyfat-Percent-Prediction-Experiment"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///C:/Users/Odiaka/My_Shit/stadium/artifacts/1', creation_time=1686651226992, experiment_id='1', last_update_time=1686651226992, lifecycle_stage='active', name='Bodyfat-Percent-Prediction-Experiment', tags={}>

In [14]:
def read_dataframe(filename):
    """Load the body-measurement CSV and overwrite out-of-range readings.

    The rules in the table below are applied one after another. For each
    rule, every value of the column on the wrong side of the threshold is
    replaced by the column's mean as it stands at that moment: the mean is
    computed before the replacement (so it still includes the out-of-range
    values), and a second rule on the same column works on the result of
    the first.

    Parameters
    ----------
    filename : str, path object or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the rule columns rewritten.
    """
    frame = pd.read_csv(filename)

    # (column, comparison, threshold); order matters for columns listed twice.
    # No rule is applied to "Density" or "Age".
    outlier_rules = (
        ("Weight", "gt", 250),
        ("Height", "lt", 30),
        ("Neck", "lt", 32.5),
        ("Neck", "gt", 45),
        ("Chest", "gt", 125),
        ("Abdomen", "gt", 120),
        ("Hip", "gt", 115),
        ("Thigh", "gt", 72),
        ("Knee", "gt", 44),
        ("Biceps", "gt", 40),
        ("Forearm", "gt", 34),
        ("Forearm", "lt", 25),
        ("Wrist", "gt", 20.5),
        ("Wrist", "lt", 16),
    )

    for column, comparison, threshold in outlier_rules:
        values = frame[column]
        if comparison == "gt":
            out_of_range = values > threshold
        else:
            out_of_range = values < threshold
        frame[column] = np.where(out_of_range, values.mean(), values)

    return frame

def split_dataframe(dataframe, train_frac=0.8, random_state=None):
    """Shuffle ``dataframe`` and split it into train and test partitions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to shuffle and split.
    train_frac : float, default 0.8
        Fraction of rows placed in the first (train) partition; the row
        count is truncated with ``int``.
    random_state : int, numpy RandomState/Generator or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the
        same split on every call; ``None`` keeps the previous behaviour of
        drawing from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; together they contain every input row once.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    shuffled = dataframe.sample(frac=1, random_state=random_state)
    cut = int(train_frac * len(shuffled))
    # Positional slicing instead of np.split: np.split on a DataFrame goes
    # through DataFrame.swapaxes, which recent pandas versions deprecate.
    return shuffled.iloc[:cut], shuffled.iloc[cut:]

def prepare_features(dataframe):
    """Separate the ``BodyFat`` target column from the remaining columns.

    Returns a ``(features, target)`` pair: a new frame holding every column
    except ``BodyFat``, and the ``BodyFat`` column itself. The input frame
    is not modified.
    """
    target = dataframe["BodyFat"]
    features = dataframe.drop(columns=["BodyFat"])
    return features, target

def prepare_dictionaries(df):
    """Convert a DataFrame into a list with one ``{column: value}`` dict per row."""
    return df.to_dict(orient='records')



In [15]:
# split_dataframe shuffles with DataFrame.sample and no explicit random_state,
# which draws from NumPy's global RNG. Seeding it here makes the
# train/validation split -- and the r2 score computed from it -- repeatable.
np.random.seed(42)

# Load the cleaned data, split it 80/20, separate the BodyFat target from the
# features, and convert the features into per-row dicts.
data = read_dataframe('data/bodyfat.csv')
train, val = split_dataframe(data)
Xtrain, ytrain = prepare_features(train)
Xval, yval = prepare_features(val)
train_dicts = prepare_dictionaries(Xtrain)
val_dicts = prepare_dictionaries(Xval)

In [16]:
# Train and evaluate the model inside a single MLflow run so that the tags,
# metric and model artifact logged below are recorded together.
with mlflow.start_run():
    mlflow.set_tag('Developer', '🅱🅻🅰🆀')

    mlflow.set_tag('Description', 'Second training with (dv+model) pipeline')
    
    # One pipeline object holds both the DictVectorizer and the regressor, so
    # it is fitted on, and predicts from, the per-row feature dicts directly.
    pipeline = make_pipeline(
        DictVectorizer(),
        LinearRegression()
    )

    pipeline.fit(train_dicts, ytrain)
    y_pred = pipeline.predict(val_dicts)
    

    # Score the predictions against the held-out validation targets.
    score = r2_score(yval, y_pred)
    print(f'These model gave this r2 score {score}')
    mlflow.log_metric('r2 score', score)
    

    # Log the whole pipeline (vectorizer + model) under the "model" artifact path.
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

These model gave this r2 score 0.9792049200840283


In [None]:
# Serialise the pipeline to disk with pickle; the file is opened for binary writing.
with open('models/pipeline.bin', 'wb') as model_file:
    pickle.dump(pipeline, model_file)