In [1]:
!python -V

Python 3.12.3


In [2]:
import pandas as pd
import numpy as np
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nyc-taxi-experiment")

2024/06/10 18:02:39 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/235990381873781123', creation_time=1718042559014, experiment_id='235990381873781123', last_update_time=1718042559014, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

artifact_location='/workspaces/MLOpps24/02-experiment-tracking/mlruns/1': Provides the artifact location in the MLFlow URI


In [7]:
df1 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
df2 = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

## Q1. Downloading the data

We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".

Download the data for January and February 2022.

Read the data for January. How many columns are there?

In [8]:
print("There are a total of", len(df1.columns),"columns.")

There are a total of 19 columns.


## Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trips duration in January?

    32.59
    42.59
    52.59
    62.59

    

In [9]:
df1["duration"] = df1["tpep_dropoff_datetime"] - df1['tpep_pickup_datetime']
df1.duration = df1.duration.apply(lambda td: td.total_seconds() / 60)
df1['PUMonth'] = df1['tpep_pickup_datetime'].dt.month 


In [10]:
df1.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,PUMonth
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8.433333,1
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,...,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6.316667,1
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,...,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,12.75,1
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,...,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,9.616667,1
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,...,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,10.833333,1


In [11]:
df1 = df1[df1.PUMonth==1]

dur_std = df1['duration'].std() 

print("The Standard Deviation of Trip Duration is", round(dur_std,2) )

The Standard Deviation of Trip Duration is 42.59


## Q3. Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [12]:
df1_filter = df1[(df1.duration >= 1) & (df1.duration <= 60)]

In [13]:
print("The original dataset was",len(df1),"rows, and the newer dataset was", len(df1_filter),"rows. So we are left with",round(len(df1_filter)/len(df1),2)*100 ,"percent of the original dataset.")

The original dataset was 3066718 rows, and the newer dataset was 3009136 rows. So we are left with 98.0 percent of the original dataset.


## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

    Turn the dataframe into a list of dictionaries
    Fit a dictionary vectorizer
    Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

    2
    155
    345
    515
    715


In [14]:
# Convert columns to 'category' if they have a limited number of unique values
df1_filter[["PULocationID","DOLocationID"]] = df1_filter[["PULocationID","DOLocationID"]].astype(str)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_filter[["PULocationID","DOLocationID"]] = df1_filter[["PULocationID","DOLocationID"]].astype(str)


In [15]:

q4_one_hot = df1_filter[["PULocationID", "DOLocationID"]]
q4_one_hot = q4_one_hot.to_dict(orient = "records")
v = DictVectorizer()


In [16]:
X = v.fit_transform(q4_one_hot)

In [17]:
X.shape

(3009136, 515)

## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

    Train a plain linear regression model with default parameters
    Calculate the RMSE of the model on the training data

What's the RMSE on train?



In [18]:
X_train = X
y_train = df1_filter['duration'].values

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

In [19]:
mean_squared_error(y_train, y_pred, squared=False)



7.649229612202591

## Put the functions to read in the data all together. 

In [20]:
def taxi_data_prep(filename, month, low_bnd,upp_bnd):
     df = pd.read_parquet(filename)
     # Start prep of duration
     df["duration"] = df["tpep_dropoff_datetime"] - df['tpep_pickup_datetime']
     df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
     df['PUMonth'] = df['tpep_pickup_datetime'].dt.month 

     # Filter out the month and durations
     df = df[df.PUMonth==month]
     df_filter = df[(df.duration >= low_bnd) & (df.duration <= upp_bnd)]
     # Convert columns to 'category' if they have a limited number of unique values
     df_filter[["PULocationID","DOLocationID"]] = df_filter[["PULocationID","DOLocationID"]].astype(str)

     return df_filter

In [21]:
df_train = taxi_data_prep("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
                1, 1,60)
df_val = taxi_data_prep("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
                2, 1,60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter[["PULocationID","DOLocationID"]] = df_filter[["PULocationID","DOLocationID"]].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filter[["PULocationID","DOLocationID"]] = df_filter[["PULocationID","DOLocationID"]].astype(str)


## Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [22]:
v = DictVectorizer()

# Create one hot encoding from df
categorical = ["PULocationID", "DOLocationID"]
train_dicts = df_train[categorical].to_dict(orient = "records")
val_dicts = df_val[categorical].to_dict(orient = "records")

X_train = v.fit_transform(train_dicts)
X_val = v.transform(val_dicts)

In [23]:
# Statistical Model Prep
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [24]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)



7.811825363985524

# Save output and model into the model folder

In [25]:
with open('models/lin_reg.bin','wb') as f_out:
    pickle.dump((v,lr),f_out)

# What do we do to update this? 

We can start tracking a new history using MLFlow
- Features like `mlflow.start_run()` can be incredibly helpful in this process. Let's try to update our model by using lasso. 

In [26]:
lasso = Lasso(0.1)
lasso.fit(X_train, y_train)

y_pred = lasso.predict(X_val)

mean_squared_error(y_val, y_pred, squared = False) 



8.75055890627609

This did a bit worse so let's try to update this with MLFlow's Processes!

In [27]:
with mlflow.start_run():
    # Log developer
    mlflow.set_tag("developer", "Emmanuel")
    # Set training and validation data pths 
    mlflow.log_param("train-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")
    
    alpha = 0.01
    # Log alpha parameters to evaluate model effectiveness
    mlflow.log_param("alpha", alpha)
    lasso = Lasso(alpha)
    lasso.fit(X_train, y_train)
    
    y_pred = lasso.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared = False)
    mlflow.log_metric("rmse", rmse)



# Hyperparameter Optimization with XGBoost 



In [28]:
import xgboost as xgb

# fMin is the object that minimizes the error obkects 
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [29]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [30]:
def objective(params):
    """
    params: Set of parameters for XGBOOST that get logged to mlflow and generates booster object. 
    """
    with mlflow.start_run():
        mlflow.set_tag("model","xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            # max iterations 
            num_boost_round=20,
            # Validation set gets used to control the optimization algorithm
            evals=[(valid,"validation")], 
            # if there are 50 rounds without better optimization then it stops 
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        
    return {'loss': rmse, 'status':STATUS_OK}


        

In [38]:
search_space = {
    # controls the depth of the trees: 4-100 levels; returns a real number but we want an integer
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0), # exp(-3), exp(0) - [0.05, 1]
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear', 
    'seed': 42
}

best_result = fmin(
    fn=objective, 
    space = search_space,
    algo = tpe.suggest,
    max_evals = 10, 
    trials = Trials()
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:9.23092                           
[1]	validation-rmse:8.61622                           
[2]	validation-rmse:7.94468                           
[3]	validation-rmse:7.62770                           
[4]	validation-rmse:7.38103                           
[5]	validation-rmse:6.98315                           
[6]	validation-rmse:6.86052                           
[7]	validation-rmse:6.58428                           
[8]	validation-rmse:6.51241                           
[9]	validation-rmse:6.45218                           
[10]	validation-rmse:6.40253                          
[11]	validation-rmse:6.21093                          
[12]	validation-rmse:6.18179                          
[13]	validation-rmse:6.14987                          
[14]	validation-rmse:6.12556                          
[15]	validation-rmse:6.09729                          
[16]	validation-rmse:5.95732                          
[17]	validation-rmse:5.94490                          
[18]	valid





[0]	validation-rmse:9.69812                                                    
[1]	validation-rmse:9.36905                                                    
[2]	validation-rmse:9.06308                                                    
[3]	validation-rmse:8.79746                                                    
[4]	validation-rmse:8.49455                                                    
[5]	validation-rmse:8.28544                                                    
[6]	validation-rmse:8.10547                                                    
[7]	validation-rmse:7.84334                                                    
[8]	validation-rmse:7.61690                                                    
[9]	validation-rmse:7.48434                                                    
[10]	validation-rmse:7.25731                                                   
[11]	validation-rmse:7.05891                                                   
[12]	validation-rmse:6.97282            





[0]	validation-rmse:7.87977                                                    
[1]	validation-rmse:7.49390                                                    
[2]	validation-rmse:7.00924                                                    
[3]	validation-rmse:6.88993                                                    
[4]	validation-rmse:6.74428                                                    
[5]	validation-rmse:6.63107                                                    
[6]	validation-rmse:6.56193                                                    
[7]	validation-rmse:6.52419                                                    
[8]	validation-rmse:6.46914                                                    
[9]	validation-rmse:6.31886                                                    
[10]	validation-rmse:6.27122                                                   
[11]	validation-rmse:6.25453                                                   
[12]	validation-rmse:6.13743            





[0]	validation-rmse:9.77578                                                     
[1]	validation-rmse:9.50435                                                     
[2]	validation-rmse:9.25998                                                     
[3]	validation-rmse:9.02416                                                     
[4]	validation-rmse:8.81748                                                     
[5]	validation-rmse:8.62311                                                     
[6]	validation-rmse:8.40615                                                     
[7]	validation-rmse:8.25098                                                     
[8]	validation-rmse:8.04128                                                     
[9]	validation-rmse:7.91303                                                     
[10]	validation-rmse:7.79693                                                    
[11]	validation-rmse:7.58415                                                    
[12]	validation-rmse:7.38914





[0]	validation-rmse:8.62086                                                     
[1]	validation-rmse:7.98359                                                     
[2]	validation-rmse:7.62616                                                     
[3]	validation-rmse:7.47036                                                     
[4]	validation-rmse:7.14608                                                     
[5]	validation-rmse:7.06964                                                     
[6]	validation-rmse:7.00934                                                     
[7]	validation-rmse:6.87710                                                     
[8]	validation-rmse:6.82932                                                     
[9]	validation-rmse:6.66968                                                     
[10]	validation-rmse:6.63249                                                    
[11]	validation-rmse:6.60886                                                    
[12]	validation-rmse:6.38934





[0]	validation-rmse:9.51005                                                     
[1]	validation-rmse:9.11655                                                     
[2]	validation-rmse:8.83924                                                     
[3]	validation-rmse:8.64357                                                     
[4]	validation-rmse:8.50466                                                     
[5]	validation-rmse:8.40168                                                     
[6]	validation-rmse:8.32992                                                     
[7]	validation-rmse:8.27462                                                     
[8]	validation-rmse:8.22871                                                     
[9]	validation-rmse:8.18698                                                     
[10]	validation-rmse:8.15959                                                    
[11]	validation-rmse:8.13263                                                    
[12]	validation-rmse:8.11321





[0]	validation-rmse:9.76065                                                     
[1]	validation-rmse:9.48096                                                     
[2]	validation-rmse:9.23062                                                     
[3]	validation-rmse:9.00354                                                     
[4]	validation-rmse:8.76658                                                     
[5]	validation-rmse:8.57612                                                     
[6]	validation-rmse:8.35105                                                     
[7]	validation-rmse:8.19133                                                     
[8]	validation-rmse:7.96053                                                     
[9]	validation-rmse:7.75296                                                     
[10]	validation-rmse:7.62713                                                    
[11]	validation-rmse:7.44033                                                    
[12]	validation-rmse:7.33383





[0]	validation-rmse:9.45616                                                     
[1]	validation-rmse:8.96434                                                     
[2]	validation-rmse:8.57249                                                     
[3]	validation-rmse:8.07139                                                     
[4]	validation-rmse:7.73790                                                     
[5]	validation-rmse:7.53643                                                     
[6]	validation-rmse:7.21754                                                     
[7]	validation-rmse:7.08868                                                     
[8]	validation-rmse:6.97319                                                     
[9]	validation-rmse:6.88937                                                     
[10]	validation-rmse:6.67195                                                    
[11]	validation-rmse:6.61692                                                    
[12]	validation-rmse:6.56762





[0]	validation-rmse:7.79024                                                     
[1]	validation-rmse:6.16139                                                     
[2]	validation-rmse:5.90409                                                     
[3]	validation-rmse:5.63844                                                     
[4]	validation-rmse:5.58555                                                     
[5]	validation-rmse:5.55679                                                     
[6]	validation-rmse:5.46957                                                     
[7]	validation-rmse:5.43459                                                     
[8]	validation-rmse:5.38297                                                     
[9]	validation-rmse:5.36628                                                     
[10]	validation-rmse:5.35458                                                    
[11]	validation-rmse:5.29096                                                    
[12]	validation-rmse:5.28606





[0]	validation-rmse:8.62129                                                    
[1]	validation-rmse:7.80006                                                    
[2]	validation-rmse:7.46710                                                    
[3]	validation-rmse:7.15149                                                    
[4]	validation-rmse:7.03220                                                    
[5]	validation-rmse:6.74913                                                    
[6]	validation-rmse:6.65424                                                    
[7]	validation-rmse:6.60712                                                    
[8]	validation-rmse:6.38764                                                    
[9]	validation-rmse:6.35330                                                    
[10]	validation-rmse:6.23807                                                   
[11]	validation-rmse:6.20902                                                   
[12]	validation-rmse:6.18369            




# Logging a model using mlflow

The simplest way to do it is by using `mlflow.log_artifact()`

In [41]:
#with mlflow.start_run():
    # Log developer
    mlflow.set_tag("developer", "Emmanuel")
    # Set training and validation data pths 
    mlflow.log_param("train-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")
    
    alpha = 0.01
    # Log alpha parameters to evaluate model effectiveness
    mlflow.log_param("alpha", alpha)
    lasso = Lasso(alpha)
    lasso.fit(X_train, y_train)
    
    y_pred = lasso.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared = False)
    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path = "models/lin_reg.bin", artifact_path="models/model_pickle")



In [45]:
with mlflow.start_run():
    
    best_params = {
        'learning_rate': 0.9585355369315604,
        'max_depth':30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed':42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round = 100, 
        evals=[(valid,'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse=mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse) 

    mlflow.xgboost.log_model(booster, artifact_path = "models_mlflow")



[0]	validation-rmse:7.57263
[1]	validation-rmse:7.06574
[2]	validation-rmse:6.79753
[3]	validation-rmse:6.67683
[4]	validation-rmse:6.42471
[5]	validation-rmse:6.33258
[6]	validation-rmse:6.13333
[7]	validation-rmse:6.08795
[8]	validation-rmse:5.93047
[9]	validation-rmse:5.88699
[10]	validation-rmse:5.86652
[11]	validation-rmse:5.63215
[12]	validation-rmse:5.52192
[13]	validation-rmse:5.47711
[14]	validation-rmse:5.45489
[15]	validation-rmse:5.42772
[16]	validation-rmse:5.39945
[17]	validation-rmse:5.36669
[18]	validation-rmse:5.33672
[19]	validation-rmse:5.32648
[20]	validation-rmse:5.31131
[21]	validation-rmse:5.29392
[22]	validation-rmse:5.28537
[23]	validation-rmse:5.27372
[24]	validation-rmse:5.26809
[25]	validation-rmse:5.26172
[26]	validation-rmse:5.25741
[27]	validation-rmse:5.25256
[28]	validation-rmse:5.24923
[29]	validation-rmse:5.24542
[30]	validation-rmse:5.24307
[31]	validation-rmse:5.24038
[32]	validation-rmse:5.23939
[33]	validation-rmse:5.23800
[34]	validation-rmse:5.2



# Model Registry with MLFlow

In [32]:
from mlflow.tracking import MlflowClient

MLFLOWTrackinguri = "http://127.0.0.1:5000"

client = MlflowClient(tracking_uri=MLFLOWTrackinguri)

In [33]:
client.list_artifacts()

client.create_experiment(name="my-cool-experiment")


TypeError: MlflowClient.list_artifacts() missing 1 required positional argument: 'run_id'

In [34]:
from mlflow.entities import ViewType

runs= client.search_runs(
    experiment_ids = '1',
    filter_string = "",
    run_view_type = ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)