In [None]:
import mlflow
import os
import pandas as pd
import haversine as hs

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [None]:
# setup mlflow
# NOTE(review): presumably selects the AWS credentials used to reach the artifact store — confirm
os.environ["AWS_PROFILE"] = "default"
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST")
if not TRACKING_SERVER_HOST:
    # without this guard the tracking URI silently becomes "http://None:5000"
    raise RuntimeError(
        "TRACKING_SERVER_HOST environment variable is not set; "
        "cannot configure the MLflow tracking URI"
    )
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [None]:
def read_data(path: str, date_columns: list[int], header_col:int = 0) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Args:
        path: location of the CSV file (anything ``pd.read_csv`` accepts).
        date_columns: columns handed to ``parse_dates``.
        header_col: row number handed to ``header``; defaults to the first row.

    Returns:
        The parsed DataFrame.
    """
    csv_options = {"header": header_col, "parse_dates": date_columns}
    return pd.read_csv(path, **csv_options)

In [None]:
def preprocess_data(ride_df: pd.DataFrame, station_df: pd.DataFrame)-> pd.DataFrame:
    """Join rides with station coordinates and derive the modelling columns.

    Args:
        ride_df: rides; must contain ``emplacement_pk_start``,
            ``emplacement_pk_end``, ``duration_sec`` and ``is_member``.
        station_df: stations; must contain ``pk``, ``latitude`` and ``longitude``.

    Returns:
        A new DataFrame with the columns ``ride_stations`` (start and end
        station keys joined by ``_``), ``distance_km`` (haversine distance
        between the two stations), ``is_member`` (as string) and
        ``duration_minute``. Rides whose start or end station is not found in
        ``station_df`` (inner joins) or that contain missing values are dropped.
        The inputs are not modified.
    """
    ride_columns = ["emplacement_pk_start", "emplacement_pk_end", "duration_sec", "is_member"]
    start_coords = ["st_latitude", "st_longitude"]
    end_coords = ["end_latitude", "end_longitude"]

    # merge to get the start station coordinates and keep only the needed columns
    with_start = (
        ride_df
        .merge(station_df, left_on="emplacement_pk_start", right_on="pk", how="inner")
        .rename(columns={"latitude": "st_latitude", "longitude": "st_longitude"})
        .loc[:, ride_columns + start_coords]
    )

    # merge to get the end station coordinates, then drop rows with missing values;
    # chaining (instead of inplace edits on a slice) avoids chained-assignment warnings
    trips = (
        with_start
        .merge(station_df, left_on="emplacement_pk_end", right_on="pk", how="inner")
        .rename(columns={"latitude": "end_latitude", "longitude": "end_longitude"})
        .loc[:, ride_columns + start_coords + end_coords]
        .dropna()
    )

    # calculate the distance of each trip; iterating the columns directly is
    # cheaper than a row-wise DataFrame.apply and also works when no rides remain
    distances = [
        hs.haversine((st_lat, st_lon), (end_lat, end_lon), unit="km")
        for st_lat, st_lon, end_lat, end_lon in zip(
            trips["st_latitude"], trips["st_longitude"],
            trips["end_latitude"], trips["end_longitude"],
        )
    ]

    processed_df = trips.assign(
        distance_km=pd.Series(distances, index=trips.index, dtype="float64"),
        # create a "<start>_<end>" key for the station pair
        ride_stations=(
            trips["emplacement_pk_start"].astype(str)
            + "_"
            + trips["emplacement_pk_end"].astype(str)
        ),
        # convert the duration to minutes
        duration_minute=trips["duration_sec"] / 60,
        # convert the categorical column to string
        is_member=trips["is_member"].astype(str),
    )

    # select the final columns
    return processed_df.loc[:, ["ride_stations", "distance_km", "is_member", "duration_minute"]]

In [None]:
def generate_features(input_df: pd.DataFrame, target_column: str, vectorizer: DictVectorizer, fit_vectorizer = True):
    """Turn a preprocessed frame into a feature matrix and a target array.

    Every column except ``target_column`` is treated as a feature. The feature
    rows are converted to dictionaries and passed through ``vectorizer``.

    Args:
        input_df: frame holding the feature columns and the target column.
        target_column: name of the column to predict.
        vectorizer: object providing ``fit_transform`` and ``transform``.
        fit_vectorizer: when true the vectorizer is fitted on this data
            (``fit_transform``); otherwise it is only applied (``transform``).

    Returns:
        A ``(vectorizer, X, y)`` tuple: the vectorizer that was passed in, the
        vectorized features and the target values.

    Raises:
        ValueError: if ``target_column`` is not a column of ``input_df``.
    """
    all_columns = input_df.columns.to_list()
    # locate the target (ValueError when absent) and keep everything around it
    target_position = all_columns.index(target_column)
    feature_names = all_columns[:target_position] + all_columns[target_position + 1:]

    # one dictionary per row, as expected by the vectorizer
    records = input_df[feature_names].to_dict(orient='records')

    encode = vectorizer.fit_transform if fit_vectorizer else vectorizer.transform
    X = encode(records)

    y = input_df[target_column].values

    return (vectorizer, X, y)

In [None]:
def build_model(X, y, alpha):
    """Train a Lasso regressor.

    Args:
        X: training feature matrix.
        y: training target values.
        alpha: regularisation strength passed to ``Lasso``.

    Returns:
        The fitted ``Lasso`` estimator.
    """
    # fit() returns the estimator itself, so construction and training chain
    return Lasso(alpha).fit(X, y)

In [None]:
def train_and_register_model(train_ride_path: str,
                             train_station_path: str,
                             valid_ride_path: str,
                             valid_station_path: str,
                             vectorizer: DictVectorizer,
                             exp_name: str="bixi_ride_duration_prediction",
                             developer_name: str="Mahmudul Hasan Bhuiyan",
                             alpha: float=0.1,
                             sample_frac: float=0.1):
    """Train a Lasso ride-duration model and log it to MLflow.

    Reads and samples the ride files, preprocesses them together with the
    station files, fits ``vectorizer`` on the training features, trains the
    model, evaluates it on the validation data and logs parameters, the RMSE
    metric and the model to an MLflow run under ``exp_name``.

    Args:
        train_ride_path: CSV file with the training rides.
        train_station_path: CSV file with the training stations.
        valid_ride_path: CSV file with the validation rides.
        valid_station_path: CSV file with the validation stations.
        vectorizer: unfitted vectorizer; it is fitted in place on the training data.
        exp_name: MLflow experiment the run is recorded under.
        developer_name: value of the "developer" tag set on the run.
        alpha: Lasso regularisation strength.
        sample_frac: fraction of rides sampled from each ride file to reduce runtime.

    Returns:
        A ``(model, dict_vectorizer)`` tuple with the fitted model and the
        fitted vectorizer, so later cells can reuse them.
    """

    # read train data
    print(f"Reading training rides data from: {train_ride_path}")
    train_ride_df = read_data(train_ride_path, [0, 2], 0)
    # sample a small portion of the rides to reduce runtime
    train_ride_df = train_ride_df.sample(frac=sample_frac, random_state=1, ignore_index=True)
    print(f"Length of training ride df: {len(train_ride_df)}")

    print(f"Reading training stations data from: {train_station_path}")
    train_stations_df = read_data(train_station_path, [], 0)
    print(f"Length of training stations df: {len(train_stations_df)}")

    # read validation data
    print(f"Reading validation rides data from: {valid_ride_path}")
    valid_ride_df = read_data(valid_ride_path, [0, 2], 0)
    # sample a small portion of the rides to reduce runtime
    valid_ride_df = valid_ride_df.sample(frac=sample_frac, random_state=1, ignore_index=True)
    print(f"Length of validation  ride df: {len(valid_ride_df)}")

    print(f"Reading validation stations data from: {valid_station_path}")
    valid_stations_df = read_data(valid_station_path, [], 0)
    print(f"Length of validation stations df: {len(valid_stations_df)}")


    # preprocess data
    print("Preprocessing data")
    train_preprocessed_df = preprocess_data(train_ride_df, train_stations_df)
    print(f"Length of training preprocessed df: {len(train_preprocessed_df)}")

    valid_preprocessed_df = preprocess_data(valid_ride_df, valid_stations_df)
    print(f"Length of validation preprocessed df: {len(valid_preprocessed_df)}")


    # generate features
    print("Generating features")

    # fit the vectorizer on the training data only ...
    dict_vectorizer, X_train, y_train = generate_features(input_df=train_preprocessed_df,
                                                          target_column="duration_minute",
                                                          vectorizer=vectorizer)
    # ... and only apply it to the validation data
    dict_vectorizer, X_val, y_val = generate_features(input_df=valid_preprocessed_df,
                                                      target_column="duration_minute",
                                                      vectorizer=dict_vectorizer,
                                                      fit_vectorizer = False)


    # the experiment must be selected BEFORE the run starts, otherwise the run
    # is created under whichever experiment was active at that moment
    mlflow.set_experiment(exp_name)

    with mlflow.start_run():

        mlflow.set_tag("developer", developer_name)

        # log the parameters in mlflow
        mlflow.log_param("train-ride-data-path", train_ride_path)
        mlflow.log_param("train-stations-data-path", train_station_path)
        mlflow.log_param("valid-ride-data-path", valid_ride_path)
        mlflow.log_param("valid-stations-data-path", valid_station_path)
        mlflow.log_param("alpha", alpha)

        print("Building the model")
        # build model
        model = build_model(X_train, y_train, alpha)

        # running prediction on the validation data
        print("Evaluating model on the validation data")
        y_pred = model.predict(X_val)

        # take the root explicitly: the `squared=False` keyword is not
        # available in recent scikit-learn releases
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)
        print(f"RMSE on the validation data: {rmse}")

        # log the model and the artifact
        mlflow.sklearn.log_model(model, artifact_path="models")
        print("Logged the model in artifacts")
        print(f"Artifacts URI: '{mlflow.get_artifact_uri()}'")

    return model, dict_vectorizer

# Perform the training
train_ride_path = "../data/2022-05-01/20220105_donnees_ouvertes.csv"
train_station_path = "../data/2022-05-01/20220105_stations.csv"
valid_ride_path = "../data/2022-06-01/20220106_donnees_ouvertes.csv"
valid_station_path = "../data/2022-06-01/20220106_stations.csv"
vectorizer = DictVectorizer()

# keep the fitted model and vectorizer at notebook scope for the validation cells
model, dict_vectorizer = train_and_register_model(train_ride_path, train_station_path, valid_ride_path, valid_station_path, vectorizer)

# Validation

In [None]:
# Load the ride and station files stored under ../data/2022-07-01
valid_ride_path = "../data/2022-07-01/20220107_donnees_ouvertes.csv"
valid_stations_path = "../data/2022-07-01/20220107_stations.csv"
# columns 0 and 2 of the ride file are parsed as dates; the station file has none
valid_ride_df = read_data(path=valid_ride_path, date_columns=[0, 2], header_col=0)
valid_stations_df = read_data(path=valid_stations_path, date_columns=[], header_col=0)
print(f"Length of ride df: {valid_ride_df.shape[0]}")
print(f"Length of stations df: {valid_stations_df.shape[0]}")

In [None]:
# Run the loaded rides and stations through the shared preprocessing step
valid_preprocessed_df = preprocess_data(ride_df=valid_ride_df, station_df=valid_stations_df)
print(f"Length of preprocessed  df: {valid_preprocessed_df.shape[0]}")
# show the first ten rows as the cell output
valid_preprocessed_df.head(n=10)

In [None]:
# generate features
valid_df = valid_preprocessed_df[['ride_stations', 'distance_km', 'is_member']]
valid_dicts = valid_df.to_dict(orient='records')
# `vectorizer` is the instance that the training call fitted in place; the name
# `dict_vectorizer` only exists inside train_and_register_model, so referencing
# it here fails on a fresh kernel
X_valid  = vectorizer.transform(valid_dicts)
y_valid = valid_preprocessed_df["duration_minute"].values

In [None]:
# evaluate the model logged by the most recent MLflow run on the validation data
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError("No MLflow run found in this session; run the training cell first")
# the training step logs the model under the "models" artifact path of its run
model = mlflow.sklearn.load_model(f"runs:/{last_run.info.run_id}/models")
y_pred = model.predict(X_valid)
# take the root explicitly: the `squared=False` keyword is not available in
# recent scikit-learn releases
valid_error = mean_squared_error(y_valid, y_pred) ** 0.5
print(f"Validation error: {valid_error}")