In [35]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import os

**Question 1. Select the Tool**<br>
You can use the same tool you used when completing the module, or choose a different one for your homework.

What's the name of the orchestrator you chose? <br>
**Answer: Prefect**

**Question 2. Version**<br>
What's the version of the orchestrator?

```python
import prefect
print(prefect.__version__)
```

**Answer: 3.4.3**

Question 3. Creating a pipeline <br>
Let's read the March 2023 Yellow taxi trips data.

How many records did we load?

In [20]:
import pandas as pd
def load_data(file_path: str):
    """Read the parquet file at ``file_path`` and return it as a DataFrame."""
    return pd.read_parquet(file_path)
# Raw (unfiltered) March 2023 yellow-taxi trips, loaded as-is for the record count.
file_path = './data/yellow_tripdata_2023-03.parquet'
df = load_data(file_path=file_path)

In [21]:
# Row count of the frame loaded above.
print("The number of records loaded is:", len(df))

The number of records loaded is: 3403766


Question 4. Data preparation <br> Let's continue with pipeline creation. We will use the same data-preparation logic as before, adjusted for the yellow taxi dataset (shown below). What's the size of the result after applying it?

In [22]:
def read_dataframe(filepath):
    """Load yellow-taxi trips from a parquet file and prepare them for training.

    Steps:
      1. Read the parquet file at ``filepath``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filepath : str
        Path to the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with the extra ``duration`` column.
    """
    df = pd.read_parquet(filepath)

    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not write into a slice of the unfiltered frame
    # (which raises pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df
# Re-read the March 2023 file through the preparation step; `df` now refers to
# the filtered frame rather than the raw load from the earlier cell.
filepath = "./data/yellow_tripdata_2023-03.parquet"
df = read_dataframe(filepath)
print("The size of the result is: ", len(df))

The size of the result is:  3316216


In [23]:
# Preview the first five prepared rows (5 is the default for head()).
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


Question 5. Train a model <br>
We will now train a linear regression model using the same code as in homework 1.

- Fit a dict vectorizer.
- Train a linear regression with default parameters.
- Use pickup and dropoff locations separately; don't create a combination feature.

Let's now use it in the pipeline. We will need to create another transformation block, and return both the dict vectorizer and the model.

What's the intercept of the model?

Hint: print the `intercept_` field in the code block.

In [31]:
# Feature columns: location IDs are treated as categorical, distance as numeric.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Location IDs are cast to strings before building one feature dict per trip
# for the DictVectorizer.
df[categorical] = df[categorical].astype(str)
train_dicts = df[[*categorical, *numerical]].to_dict(orient='records')

In [32]:
# Target: trip duration in minutes
target = 'duration'
y_train = df[target].values

# Turn the per-trip feature dicts into the training matrix
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Linear regression with default parameters (fit() returns the fitted estimator)
lr = LinearRegression().fit(X_train, y_train)

# The intercept is the value asked for in Question 5
print(f"Model intercept: {lr.intercept_}")

Model intercept: 23.84803295641953


Question 6. Register the model <br>
The model is trained, so let's save it with MLflow.

Find the logged model, and find the MLModel file. What's the size of the model? (`model_size_bytes` field):

In [36]:
# Local folder for the pickled model written below; exist_ok keeps re-runs safe.
os.makedirs("models", exist_ok=True)

In [None]:
# MLflow exposes model logging through flavor modules; there is no top-level
# `mlflow.log_model`, so the scikit-learn flavor is used for `lr`.
# The call runs inside its own run: logging outside a run would implicitly
# open one and leave it active, and a later plain `mlflow.start_run()` would
# then fail because a run is already active.
with mlflow.start_run():
    mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")

In [37]:
with mlflow.start_run():
    # Keep a plain pickle of the fitted model on disk and attach it to the run.
    pickle_path = "models/linear_regression.pkl"
    with open(pickle_path, "wb") as f_out:
        pickle.dump(lr, f_out)
    mlflow.log_artifact(pickle_path, artifact_path="models")

    # Also log the model through the scikit-learn flavor; the MLModel file
    # asked about in Question 6 is found with this logged model.
    mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")

print("Model registered successfully")




Model registered successfully
