In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import root_mean_squared_error, r2_score
import pickle
import mlflow


%matplotlib inline

In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("duration-prediction")

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=duration-prediction (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7be4b4c4c980>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [3]:
df = pd.read_parquet("../data/yellow_tripdata_2024-07.parquet")
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2024-07-01 00:34:56,2024-07-01 00:46:49,1.0,3.2,1.0,N,140,79,1,15.6,3.5,0.5,3.5,0.0,1.0,24.1,2.5,0.0
1,2,2024-06-30 23:48:58,2024-07-01 00:28:04,1.0,19.48,2.0,N,132,113,2,70.0,0.0,0.5,0.0,0.0,1.0,75.75,2.5,1.75
2,2,2024-07-01 00:23:18,2024-07-01 00:29:51,1.0,1.18,1.0,N,237,145,1,8.6,1.0,0.5,2.72,0.0,1.0,16.32,2.5,0.0
3,1,2024-07-01 00:10:33,2024-07-01 00:27:31,0.0,9.1,1.0,N,138,164,1,36.6,10.25,0.5,12.05,0.0,1.0,60.4,2.5,1.75
4,1,2024-07-01 00:07:55,2024-07-01 00:34:34,1.0,17.7,2.0,N,132,263,1,70.0,1.75,0.5,10.0,6.94,1.0,90.19,0.0,1.75


In [4]:
del df

Read Data

In [5]:
categories = ["PULocationID", "DOLocationID"]
numerics = (
    []
)  # ['trip_distance','passenger_count','fare_amount','tip_amount','congestion_surcharge','Airport_fee','tpep_pickup_datetime']


def read_data(filename):
    df = pd.read_parquet(filename)
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

    df["duration"] = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = df["duration"].map(lambda x: x.total_seconds() / 60)

    df = df[(df["duration"] >= 0) & (df["duration"] <= 60)]
    df[categories] = df[categories].astype(object)

    df["tpep_pickup_datetime"] = df["tpep_pickup_datetime"].astype(int)

    prepped = df[categories + numerics + ["duration"]].dropna()
    return prepped

In [6]:
train_data = read_data("../data/yellow_tripdata_2024-07.parquet")

In [7]:
train_data["duration"].describe(percentiles=[0.5, 0.95, 0.99])

count    3.022755e+06
mean     1.549464e+01
std      1.111104e+01
min      0.000000e+00
50%      1.258333e+01
95%      3.926667e+01
99%      5.356667e+01
max      6.000000e+01
Name: duration, dtype: float64

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(train_data["duration"])
plt.show()

NameError: name 'train_data' is not defined

In [8]:
train_data.head()

Unnamed: 0,PULocationID,DOLocationID,duration
0,140,79,11.883333
1,132,113,39.1
2,237,145,6.55
3,138,164,16.966667
4,132,263,26.65


Training pipeline


In [9]:
dv = DictVectorizer()
x_train = dv.fit_transform(train_data[categories + numerics].to_dict(orient="records"))

In [10]:
dv.feature_names_

['DOLocationID', 'PULocationID']

In [11]:
target = "duration"
y_train = train_data[target].values
y_train

array([11.88333333, 39.1       ,  6.55      , ..., 22.        ,
       18.1       , 18.41666667])

In [12]:
test_data = read_data("../data/yellow_tripdata_2024-08.parquet")
x_test_dict = test_data[categories + numerics].to_dict(orient="records")
x_test = dv.transform(x_test_dict)
y_test = test_data[target].values

Linear Regression

In [13]:
%pwd

'/home/codespace/mlops-zoomcamp/chapter2'

In [14]:
with mlflow.start_run():
    mlflow.set_tag("developer", value="Derek")
    mlflow.set_tag("model", value="linear_regression")

    lm = LinearRegression()
    lm.fit(x_train, y_train)
    preds = lm.predict(x_test)
    rmse = root_mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(
        f"The model has an average error of {rmse:.2f} minutes,while explaining {r2:.2f} of the variance"
    )
    mlflow.log_params(params=lm.get_params())
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_artifact("../models/intro_lm.bin", "model")

The model has an average error of 11.08 minutes,while explaining 0.02 of the variance


2024/10/28 15:50:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run learned-skunk-377 at: http://127.0.0.1:5000/#/experiments/1/runs/05a14157c63b453fa14168426b18573b.
2024/10/28 15:50:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [15]:
resids = pd.DataFrame({"actual": y_test, "predicted": preds})
resids["residual"] = resids["actual"] - resids["predicted"]
resids["abs_resid"] = resids["residual"].abs()
resids.sort_values("abs_resid", ascending=False)

Unnamed: 0,actual,predicted,residual,abs_resid
2434206,59.933333,12.168918,47.764415,47.764415
1533981,59.916667,12.168918,47.747748,47.747748
335859,60.000000,12.267402,47.732598,47.732598
861565,59.766667,12.168918,47.597748,47.597748
1165082,59.683333,12.193165,47.490169,47.490169
...,...,...,...,...
2158445,14.900000,14.900020,-0.000020,0.000020
2158444,14.900000,14.900020,-0.000020,0.000020
1266413,17.800000,17.799982,0.000018,0.000018
2783444,17.433333,17.433338,-0.000004,0.000004


In [16]:
with open("../models/intro_lm.bin", "wb") as f_out:
    pickle.dump((dv, lm), f_out)

Ridge Regression

In [17]:
with mlflow.start_run():
    mlflow.set_tag("developer", value="Derek")
    mlflow.set_tag("model", value="ridge_regression")
    alpha = 0.03
    mlflow.log_param("alpha", alpha)

    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train, y_train)
    preds = ridge.predict(x_test)
    rmse = root_mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(
        f"The model has an average error of {rmse:.2f} minutes,while explaining {r2:.2f} of the variance"
    )
    mlflow.log_params(params=ridge.get_params())
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_artifact("../models/intro_ridge.bin", "model")

2024/10/28 15:50:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-tern-750 at: http://127.0.0.1:5000/#/experiments/1/runs/9cf13a3510f34780b0c51007ca3d022e.
2024/10/28 15:50:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


The model has an average error of 11.08 minutes,while explaining 0.02 of the variance


In [18]:
with open("../models/intro_ridge.bin", "wb") as f_out:
    pickle.dump((dv, ridge), f_out)

Lasso Regression

In [19]:
with mlflow.start_run():
    mlflow.set_tag("developer", value="Derek")
    mlflow.set_tag("model", value="lasso_regression")
    alpha = 0.03
    mlflow.log_param("alpha", alpha)

    lasso = Lasso(alpha=alpha)
    lasso.fit(x_train, y_train)
    preds = lasso.predict(x_test)
    rmse = root_mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(
        f"The model has an average error of {rmse:.2f} minutes,while explaining {r2:.2f} of the variance"
    )
    mlflow.log_params(params=lasso.get_params())
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_artifact("../models/intro_lasso.bin", "model")

2024/10/28 15:50:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run upset-carp-263 at: http://127.0.0.1:5000/#/experiments/1/runs/90ffd292c7884eb4ac8f2c5383958b14.
2024/10/28 15:50:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


The model has an average error of 11.08 minutes,while explaining 0.02 of the variance


In [20]:
with open("../models/intro_lasso.bin", "wb") as f_out:
    pickle.dump((dv, lasso), f_out)

# Hyperparameter tuning with mlflow and hyperopt

In [21]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

In [22]:
train = xgb.DMatrix(x_train, label=y_train)
valid = xgb.DMatrix(x_test, label=y_test)

objective function for xgboost

In [23]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50,
        )
        preds = booster.predict(valid)
        rmse = np.sqrt(root_mean_squared_error(y_test, preds))
        mlflow.log_metric("rmse", rmse)

    return {"loss": rmse, "status": STATUS_OK}

search space
area that hyperopt will search for the best parameters
all are xgboost parameters
read more here: https://hyperopt.github.io/hyperopt/getting-started/search_spaces/#parameter-expressions
and here: https://xgboost.readthedocs.io/en/latest/parameter.html

In [24]:
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_child_weight": scope.int(hp.loguniform("min_child_weight", -1, 3)),
    "objective": "reg:squarederror",
    "seed": 192,
}
# each one is a range that hyperopt will use

optimisation using fmin
minimises the metric provided using the search space

In [25]:
best_result = fmin(
    objective, space=search_space, algo=tpe.suggest, max_evals=10, trials=Trials()
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

[0]	validation-rmse:7.67575                           
[1]	validation-rmse:6.85029                           
[2]	validation-rmse:6.68857                           
[3]	validation-rmse:6.65806                           
[4]	validation-rmse:6.65172                           
[5]	validation-rmse:6.65004                           
[6]	validation-rmse:6.64926                           
[7]	validation-rmse:6.64901                           
[8]	validation-rmse:6.64880                           
[9]	validation-rmse:6.64860                           
[10]	validation-rmse:6.64856                          
[11]	validation-rmse:6.64848                          
[12]	validation-rmse:6.64846                          
[13]	validation-rmse:6.64847                          
[14]	validation-rmse:6.64843                          
[15]	validation-rmse:6.64839                          
[16]	validation-rmse:6.64838                          
[17]	validation-rmse:6.64836                          
[18]	valid

2024/10/28 15:51:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run classy-flea-640 at: http://127.0.0.1:5000/#/experiments/1/runs/7aa1223cd0de4c2081d417622fb5c2f0.

2024/10/28 15:51:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:9.99029                                                    
[1]	validation-rmse:9.07122                                                    
[2]	validation-rmse:8.38424                                                    
[3]	validation-rmse:7.87948                                                    
[4]	validation-rmse:7.51430                                                    
[5]	validation-rmse:7.25359                                                    
[6]	validation-rmse:7.06944                                                    
[7]	validation-rmse:6.94041                                                    
[8]	validation-rmse:6.85055                                                    
[9]	validation-rmse:6.78822                                                    
[10]	validation-rmse:6.74512                                                   
[11]	validation-rmse:6.71535                                                   
[12]	validation-rmse:6.69482            

2024/10/28 15:52:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-quail-450 at: http://127.0.0.1:5000/#/experiments/1/runs/a4dd402a92d84c90b17225ad40d5bd08.

2024/10/28 15:52:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:6.98972                                                    
[1]	validation-rmse:6.67265                                                    
[2]	validation-rmse:6.65256                                                    
[3]	validation-rmse:6.65037                                                    
[4]	validation-rmse:6.64944                                                    
[5]	validation-rmse:6.64906                                                    
[6]	validation-rmse:6.64866                                                    
[7]	validation-rmse:6.64863                                                    
[8]	validation-rmse:6.64845                                                    
[9]	validation-rmse:6.64837                                                    
[10]	validation-rmse:6.64829                                                   
[11]	validation-rmse:6.64823                                                   
[12]	validation-rmse:6.64821            

2024/10/28 15:53:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run melodic-pig-158 at: http://127.0.0.1:5000/#/experiments/1/runs/99c2c5fe36d04def99e8741c95855b9e.

2024/10/28 15:53:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:7.05456                                                    
[1]	validation-rmse:6.67849                                                    
[2]	validation-rmse:6.65114                                                    
[3]	validation-rmse:6.64895                                                    
[4]	validation-rmse:6.64872                                                    
[5]	validation-rmse:6.64869                                                    
[6]	validation-rmse:6.64868                                                    
[7]	validation-rmse:6.64868                                                    
[8]	validation-rmse:6.64868                                                    
[9]	validation-rmse:6.64868                                                    
[10]	validation-rmse:6.64868                                                   
[11]	validation-rmse:6.64868                                                   
[12]	validation-rmse:6.64868            

2024/10/28 15:53:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-dog-72 at: http://127.0.0.1:5000/#/experiments/1/runs/a710f800d6fa41e3857920ff1e3bac46.

2024/10/28 15:53:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:10.57853                                                   
[1]	validation-rmse:10.03335                                                   
[2]	validation-rmse:9.55588                                                    
[3]	validation-rmse:9.13961                                                    
[4]	validation-rmse:8.77690                                                    
[5]	validation-rmse:8.46331                                                    
[6]	validation-rmse:8.19281                                                    
[7]	validation-rmse:7.96063                                                    
[8]	validation-rmse:7.76128                                                    
[9]	validation-rmse:7.59142                                                    
[10]	validation-rmse:7.44621                                                   
[11]	validation-rmse:7.32273                                                   
[12]	validation-rmse:7.21758            

2024/10/28 15:56:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-boar-996 at: http://127.0.0.1:5000/#/experiments/1/runs/8e918dc6a366404d801d44c306a4e6f7.

2024/10/28 15:56:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:10.69592                                                   
[1]	validation-rmse:10.24176                                                   
[2]	validation-rmse:9.83231                                                    
[3]	validation-rmse:9.46414                                                    
[4]	validation-rmse:9.13394                                                    
[5]	validation-rmse:8.83859                                                    
[6]	validation-rmse:8.57508                                                    
[7]	validation-rmse:8.34059                                                    
[8]	validation-rmse:8.13244                                                    
[9]	validation-rmse:7.94810                                                    
[10]	validation-rmse:7.78521                                                   
[11]	validation-rmse:7.64160                                                   
[12]	validation-rmse:7.51522            

2024/10/28 15:59:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run colorful-bear-173 at: http://127.0.0.1:5000/#/experiments/1/runs/e8afca84f969400d8b093bc4339378bc.

2024/10/28 15:59:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:9.96958                                                     
[1]	validation-rmse:9.04007                                                     
[2]	validation-rmse:8.34973                                                     
[3]	validation-rmse:7.84607                                                     
[4]	validation-rmse:7.48443                                                     
[5]	validation-rmse:7.22827                                                     
[6]	validation-rmse:7.04876                                                     
[7]	validation-rmse:6.92399                                                     
[8]	validation-rmse:6.83778                                                     
[9]	validation-rmse:6.77845                                                     
[10]	validation-rmse:6.73773                                                    
[11]	validation-rmse:6.70983                                                    
[12]	validation-rmse:6.69071

2024/10/28 16:00:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-bear-390 at: http://127.0.0.1:5000/#/experiments/1/runs/6f8be46908ca46f289506b012a4d270e.

2024/10/28 16:00:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:8.58805                                                     
[1]	validation-rmse:7.81390                                                     
[2]	validation-rmse:7.47299                                                     
[3]	validation-rmse:7.26651                                                     
[4]	validation-rmse:7.15539                                                     
[5]	validation-rmse:7.09309                                                     
[6]	validation-rmse:6.96779                                                     
[7]	validation-rmse:6.92849                                                     
[8]	validation-rmse:6.87633                                                     
[9]	validation-rmse:6.85077                                                     
[10]	validation-rmse:6.83407                                                    
[11]	validation-rmse:6.82130                                                    
[12]	validation-rmse:6.80613

2024/10/28 16:01:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-moth-750 at: http://127.0.0.1:5000/#/experiments/1/runs/b0f86be990c14da699746c6ac50e90bf.

2024/10/28 16:01:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:9.41495                                                     
[1]	validation-rmse:8.27887                                                    
[2]	validation-rmse:7.58574                                                    
[3]	validation-rmse:7.17808                                                    
[4]	validation-rmse:6.94467                                                    
[5]	validation-rmse:6.81331                                                    
[6]	validation-rmse:6.74014                                                    
[7]	validation-rmse:6.69954                                                    
[8]	validation-rmse:6.67706                                                    
[9]	validation-rmse:6.66461                                                    
[10]	validation-rmse:6.65768                                                   
[11]	validation-rmse:6.65382                                                   
[12]	validation-rmse:6.65165           

2024/10/28 16:01:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-crab-956 at: http://127.0.0.1:5000/#/experiments/1/runs/71c1a788771444d6af944ed301d22f00.

2024/10/28 16:01:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



[0]	validation-rmse:10.75662                                                   
[1]	validation-rmse:10.35328                                                   
[2]	validation-rmse:9.98597                                                    
[3]	validation-rmse:9.65170                                                    
[4]	validation-rmse:9.34846                                                    
[5]	validation-rmse:9.07341                                                    
[6]	validation-rmse:8.82502                                                    
[7]	validation-rmse:8.60079                                                    
[8]	validation-rmse:8.39828                                                    
[9]	validation-rmse:8.21562                                                    
[10]	validation-rmse:8.05151                                                   
[11]	validation-rmse:7.90472                                                   
[12]	validation-rmse:7.77314            

2024/10/28 16:04:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-pig-53 at: http://127.0.0.1:5000/#/experiments/1/runs/c61feffd27d8425e9f2e634d0f858512.

2024/10/28 16:04:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.



100%|██████████| 10/10 [13:58<00:00, 83.85s/trial, best loss: 2.578424661145273]


In [26]:
best_result

{'learning_rate': np.float64(0.7642238261917301),
 'max_depth': np.float64(89.0),
 'min_child_weight': np.float64(12.978126207985504),
 'reg_alpha': np.float64(0.046390865549832794),
 'reg_lambda': np.float64(0.03898891619999199)}

In [28]:
best_result["max_depth"] = int(best_result["max_depth"])

best model

In [29]:
with mlflow.start_run():
    mlflow.set_tag("model", "xgboost")
    mlflow.log_params(best_result)

    booster = xgb.train(
        params=best_result,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

[0]	validation-rmse:6.99003
[1]	validation-rmse:6.67306
[2]	validation-rmse:6.65297
[3]	validation-rmse:6.65064
[4]	validation-rmse:6.64966
[5]	validation-rmse:6.64923
[6]	validation-rmse:6.64896
[7]	validation-rmse:6.64885
[8]	validation-rmse:6.64866
[9]	validation-rmse:6.64857
[10]	validation-rmse:6.64842
[11]	validation-rmse:6.64833
[12]	validation-rmse:6.64825
[13]	validation-rmse:6.64830
[14]	validation-rmse:6.64825
[15]	validation-rmse:6.64818
[16]	validation-rmse:6.64812
[17]	validation-rmse:6.64814
[18]	validation-rmse:6.64813
[19]	validation-rmse:6.64810
[20]	validation-rmse:6.64809
[21]	validation-rmse:6.64812
[22]	validation-rmse:6.64809
[23]	validation-rmse:6.64808
[24]	validation-rmse:6.64809
[25]	validation-rmse:6.64806
[26]	validation-rmse:6.64807
[27]	validation-rmse:6.64806
[28]	validation-rmse:6.64810
[29]	validation-rmse:6.64808
[30]	validation-rmse:6.64807
[31]	validation-rmse:6.64805
[32]	validation-rmse:6.64810
[33]	validation-rmse:6.64807
[34]	validation-rmse:6.6

2024/10/28 16:06:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run bald-worm-469 at: http://127.0.0.1:5000/#/experiments/1/runs/922cbeea08dd404ebf31e87789ed3470.
2024/10/28 16:06:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


auto-logging
read more here: https://mlflow.org/docs/latest/tracking.html#automatic-logging

In [30]:
best_result["max_depth"] = int(best_result["max_depth"])

In [31]:
mlflow.xgboost.autolog(disable=True)

In [32]:
mlflow.xgboost.autolog()
booster = xgb.train(
    params=best_result,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)

2024/10/28 16:07:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e47a325749af45219c77c78cfbcee18e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:6.99003
[1]	validation-rmse:6.67306
[2]	validation-rmse:6.65297
[3]	validation-rmse:6.65064
[4]	validation-rmse:6.64966
[5]	validation-rmse:6.64923
[6]	validation-rmse:6.64896
[7]	validation-rmse:6.64885
[8]	validation-rmse:6.64866
[9]	validation-rmse:6.64857
[10]	validation-rmse:6.64842
[11]	validation-rmse:6.64833
[12]	validation-rmse:6.64825
[13]	validation-rmse:6.64830
[14]	validation-rmse:6.64825
[15]	validation-rmse:6.64818
[16]	validation-rmse:6.64812
[17]	validation-rmse:6.64814
[18]	validation-rmse:6.64813
[19]	validation-rmse:6.64810
[20]	validation-rmse:6.64809
[21]	validation-rmse:6.64812
[22]	validation-rmse:6.64809
[23]	validation-rmse:6.64808
[24]	validation-rmse:6.64809
[25]	validation-rmse:6.64806
[26]	validation-rmse:6.64807
[27]	validation-rmse:6.64806
[28]	validation-rmse:6.64810
[29]	validation-rmse:6.64808
[30]	validation-rmse:6.64807
[31]	validation-rmse:6.64805
[32]	validation-rmse:6.64810
[33]	validation-rmse:6.64807
[34]	validation-rmse:6.6

2024/10/28 16:07:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-koi-987 at: http://127.0.0.1:5000/#/experiments/1/runs/e47a325749af45219c77c78cfbcee18e.
2024/10/28 16:07:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [33]:
mlflow.xgboost.autolog(disable=True)

In [38]:
with mlflow.start_run():
    booster = xgb.train(
        params=best_result,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )
    mlflow.log_params(params=best_result)
    preds = booster.predict(valid)
    rmse = np.sqrt(root_mean_squared_error(y_test, preds))
    mlflow.log_metric("rmse", rmse)
    mlflow.set_tag("model", "xgboost")
    mlflow.xgboost.log_model(booster, artifact_path="models")
    with open("../models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("../models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:6.99003
[1]	validation-rmse:6.67306
[2]	validation-rmse:6.65297
[3]	validation-rmse:6.65064
[4]	validation-rmse:6.64966
[5]	validation-rmse:6.64923
[6]	validation-rmse:6.64896
[7]	validation-rmse:6.64885
[8]	validation-rmse:6.64866
[9]	validation-rmse:6.64857
[10]	validation-rmse:6.64842
[11]	validation-rmse:6.64833
[12]	validation-rmse:6.64825
[13]	validation-rmse:6.64830
[14]	validation-rmse:6.64825
[15]	validation-rmse:6.64818
[16]	validation-rmse:6.64812
[17]	validation-rmse:6.64814
[18]	validation-rmse:6.64813
[19]	validation-rmse:6.64810
[20]	validation-rmse:6.64809
[21]	validation-rmse:6.64812
[22]	validation-rmse:6.64809
[23]	validation-rmse:6.64808
[24]	validation-rmse:6.64809
[25]	validation-rmse:6.64806
[26]	validation-rmse:6.64807
[27]	validation-rmse:6.64806
[28]	validation-rmse:6.64810
[29]	validation-rmse:6.64808
[30]	validation-rmse:6.64807
[31]	validation-rmse:6.64805
[32]	validation-rmse:6.64810
[33]	validation-rmse:6.64807
[34]	validation-rmse:6.6

2024/10/28 16:19:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-stag-340 at: http://127.0.0.1:5000/#/experiments/1/runs/8f80585992ee47cfb3a3a7f19ccc4ab4.
2024/10/28 16:19:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


In [39]:
logged_model = "runs:/730ce618f1d5499590a3008b364a0c14/model"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.xgboost
  run_id: 730ce618f1d5499590a3008b364a0c14

In [40]:
xgboost_model = mlflow.xgboost.load_model(logged_model)
xgboost_model
# load as a xgboost model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

<xgboost.core.Booster at 0x7939c6239970>

In [None]:
preds = xgboost_model.predict(valid)