In [1]:
import pandas as pd
import mlflow

In [2]:
data_path = "/workspaces/taxiML/data/green_tripdata_2021-01.parquet"

In [3]:
taxi_zones = pd.read_csv("../data/taxi_zone_lookup_coordinates.csv")

In [4]:
def preprocess(data):
    data['duration'] = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
    data['duration'] = data['duration'].apply(lambda x: x.total_seconds() / 60)
    data = data[(data['duration'] >= data['duration'].quantile(0.04)) & (data['duration'] <= data['duration'].quantile(0.98))]
    categorical = ['trip_type', 'PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    target = ['duration']
    data = data[numerical + categorical + target]
    data['trip_type'].fillna(data.trip_type.mode().item(), inplace=True)
    data = pd.merge(data, taxi_zones[['LocationID', 'latitude', 'longitude']], left_on='PULocationID', right_on='LocationID', how='left').rename(columns={"latitude": "PULat",
                                                                                                                                                   "longitude": "PULon"})
    data = pd.merge(data, taxi_zones[['LocationID', 'latitude', 'longitude']], left_on='DOLocationID', right_on='LocationID', how='left').rename(columns={"latitude": "DOLat",
                                                                                                                                                   "longitude": "DOLon"})
    data = data.drop(columns=["PULocationID", "DOLocationID", "LocationID_x", "LocationID_y"])
    data["PULat"].fillna(data["PULat"].mean(), inplace=True)
    data["PULon"].fillna(data["PULon"].mean(), inplace=True)
    data["DOLat"].fillna(data["DOLat"].mean(), inplace=True)
    data["DOLon"].fillna(data["DOLon"].mean(), inplace=True)
    onehot = pd.get_dummies(data['trip_type'], prefix="triptype").map(lambda x: 1 if x else 0)
    data = data.join(onehot).drop(columns=['trip_type'])
    data = data[(data['trip_distance'] >= data['trip_distance'].quantile(0.04)) & (data['trip_distance'] < data['trip_distance'].quantile(0.99))]
    
    for column in ['PULat', 'PULon', 'DOLat', "DOLon"]:
        data[column] = (data[column] - data[column].mean()) / data[column].std()

    for column in ['trip_distance', 'duration']:
        data[column] = (data[column] - data[column].min()) / (data[column].max() - data[column].min())
    
    return data[['PULat', 'PULon', 'DOLat', "DOLon", "trip_distance"]], data['duration']

In [5]:
mlflow.set_tracking_uri("sqlite:///../mlflow.db")
mlflow.set_experiment("taxi_experiment")

2025/01/16 23:31:39 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/01/16 23:31:39 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/workspaces/taxiML/02/mlruns/1', creation_time=1737070300194, experiment_id='1', last_update_time=1737070300194, lifecycle_stage='active', name='taxi_experiment', tags={}>

In [6]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error as rmse

In [8]:
with mlflow.start_run():
    mlflow.set_tag("developer", "Majid")
    
    mlflow.log_param("train_data_path", "/workspaces/taxiML/data/green_tripdata_2021-01.parquet")
    mlflow.log_param("val_data_path", "/workspaces/taxiML/data/green_tripdata_2021-02.parquet")
    train_data = pd.read_parquet("/workspaces/taxiML/data/green_tripdata_2021-01.parquet")
    val_data = pd.read_parquet("/workspaces/taxiML/data/green_tripdata_2021-02.parquet")
    x_train, y_train = preprocess(train_data)
    x_val, y_val = preprocess(val_data)

    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)
    mlflow.log_metric("rmse", rmse(y_val, y_pred))
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['trip_type'].fillna(data.trip_type.mode().item(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["PULat"].fillna(data["PULat"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec