In [7]:
import pandas as pd
from  sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [8]:
# Q1: check the installed MLflow version
!mlflow --version

mlflow, version 2.3.2


In [9]:
def getdata(color,year,month):
    """Read one monthly trip-data parquet file from the CloudFront bucket.

    Parameters
    ----------
    color : file-name prefix of the dataset (the notebook passes 'green').
    year : year component of the file name.
    month : month component, rendered with the ``02`` format spec
        (an integer 1 becomes "01").

    Returns
    -------
    The object produced by ``pd.read_parquet`` for the remote file.
    """
    parquet_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{color}_tripdata_{year}-{month:02}.parquet'
    return pd.read_parquet(parquet_url)

In [10]:
import os

# Download the first three months of 2022 and keep local copies under data/.
color = 'green'
year = '2022'
months = list(range(1, 4))  # 1, 2, 3

# The later preparation step reads from data/, and writing a parquet file
# into a missing folder fails, so make sure the folder exists first.
os.makedirs('data', exist_ok=True)

for month in months:
    df_taxi = getdata(color,year,month)
    df_taxi.to_parquet(f'data/{color}_tripdata_{year}-{month:02}.parquet',compression='gzip',index=False)


In [11]:
import os
import pickle
import click
import pandas as pd

from sklearn.feature_extraction import DictVectorizer


def dump_pickle(obj, filename: str):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    Returns ``None`` (the result of ``pickle.dump``).
    """
    with open(filename, "wb") as sink:
        pickle.dump(obj, sink)


def read_dataframe(filename: str):
    """Load a trip parquet file and keep trips lasting between 1 and 60 minutes.

    Adds a ``duration`` column (minutes between ``lpep_pickup_datetime`` and
    ``lpep_dropoff_datetime``), drops rows outside the inclusive 1-60 minute
    range, and converts ``PULocationID`` / ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : path (or anything ``pd.read_parquet`` accepts) of the parquet file.

    Returns
    -------
    pd.DataFrame
        An independent copy of the filtered rows, safe to modify further.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes instead of a Python call per row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the loaded frame; without it the
    # column assignment below (and any later one by callers) is made on a
    # slice and raises SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn a trip frame into a feature matrix using ``dv``.

    A combined ``PU_DO`` column ("<PULocationID>_<DOLocationID>") is written
    into ``df`` itself, then ``PU_DO`` and ``trip_distance`` are converted to
    a list of per-row dicts and handed to the vectorizer.

    Parameters
    ----------
    df : frame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.
    dv : vectorizer exposing ``fit_transform`` and ``transform``.
    fit_dv : when True the vectorizer is fitted on these rows; otherwise it
        is only applied.

    Returns
    -------
    tuple
        ``(X, dv)`` - the vectorizer output and the same vectorizer object.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    # Pick the vectorizer method once instead of branching around the call.
    vectorize = dv.fit_transform if fit_dv else dv.transform
    return vectorize(records), dv


def run_data_prep(dataset = "green", raw_data_path = "data", dest_path = "output"):
    """Build pickled train/val/test sets from the 2022-01/02/03 parquet files.

    For each split the parquet file is loaded with ``read_dataframe``, the
    ``tip_amount`` column is taken as the target, and features are produced
    by ``preprocess``. The DictVectorizer is fitted on the train split only
    and reused unchanged for val and test.

    Parameters
    ----------
    dataset : file-name prefix of the parquet files (default "green").
    raw_data_path : folder containing the ``<dataset>_tripdata_2022-MM.parquet``
        files (default "data").
    dest_path : folder that receives ``dv.pkl``, ``train.pkl``, ``val.pkl``
        and ``test.pkl``; created if it does not exist (default "output").

    Returns
    -------
    None. The results are written to ``dest_path``.
    """
    # Split name -> period suffix of the source file. "train" comes first so
    # the vectorizer is fitted before it is applied to the other splits.
    periods = {"train": "2022-01", "val": "2022-02", "test": "2022-03"}

    # Load every parquet file up front so a missing input fails before
    # anything is written.
    frames = {
        split: read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_{period}.parquet")
        )
        for split, period in periods.items()
    }

    target = 'tip_amount'
    dv = DictVectorizer()

    # Vectorize each split and keep (features, target) pairs.
    prepared = {}
    for split, frame in frames.items():
        y = frame[target].values
        X, dv = preprocess(frame, dv, fit_dv=(split == "train"))
        prepared[split] = (X, y)

    # Create dest_path folder unless it already exists
    os.makedirs(dest_path, exist_ok=True)

    # Save DictVectorizer and datasets
    dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
    for split, pair in prepared.items():
        dump_pickle(pair, os.path.join(dest_path, f"{split}.pkl"))
# Run the preparation step with its default arguments (dataset "green").
run_data_prep()


In [15]:
# Q2: size in bytes of the saved output/dv.pkl file
import os
os.path.getsize('output/dv.pkl')

153660

In [24]:
# Q3: print the max_depth parameter stored on disk for one MLflow run
import os
param_file = 'mlruns/0/e371845252da42a6bb23d7e6f81f0086/params/max_depth'
with open(param_file, 'r') as handle:
    max_depth_text = handle.read()
print(max_depth_text)

10


In [95]:
# Q4: lowest rmse among the runs of experiment 2
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri='sqlite:///mlflow.db')

# experiment_ids is documented as a list of string IDs, so pass it in that
# form rather than as a bare integer.
# Ascending order with max_results=1 returns only the run with the smallest rmse.
result = client.search_runs(
    experiment_ids=["2"],
    filter_string="",
    max_results=1,
    order_by=["metrics.rmse ASC"],
)

# Fail with a readable message instead of an IndexError when nothing matched.
if not result:
    raise RuntimeError("No runs found for experiment 2 in sqlite:///mlflow.db")

print(result[0].data.metrics['rmse'])

2.449827329704216


In [88]:
# Display the 'rmse' metric of the first run in `result`; this relies on
# `result` already holding the output of a search_runs call from another cell.
result[0].data.metrics['rmse']

2.449827329704216

In [126]:
# Q5: 'testrmse' tag recorded on the first registered model
from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri='sqlite:///mlflow.db')
result = client.search_registered_models(filter_string='')

# First registered model -> first entry of its latest_versions -> tag value.
first_model = result[0]
listed_version = first_model.latest_versions[0]
print(listed_version.tags['testrmse'])

2.2854691906481364


In [None]:
#Q5
# Version Number