<a href="https://colab.research.google.com/github/Chukwuka1488/MLOps_DTC/blob/main/01-intro/taxi_trip_duration_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -V

Python 3.11.12


In [2]:
# !pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# !rm -rf /usr/local/lib/python3.8/dist-packages/cupy*
# !pip install cupy-cuda11x

In [3]:
# import cudf

In [4]:
!pip install pyarrow
!pip install pandas
!pip install matplotlib
!pip install seaborn



In [31]:
import pandas as pd
# for parquet files
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import numpy as np

### Training a model

In [7]:
def load_data(filepath: str) -> pd.DataFrame:
    """
    Read a Parquet file and hand back its contents as a DataFrame.

    Args:
        filepath: Location of the Parquet file; passed unchanged to
            ``pd.read_parquet``.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    return pd.read_parquet(filepath)



In [8]:
def compute_duration(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a 'duration' column holding each trip's length in minutes.

    The duration is 'lpep_dropoff_datetime' minus 'lpep_pickup_datetime',
    converted to fractional minutes.

    Args:
        df: DataFrame with datetime columns 'lpep_dropoff_datetime' and
            'lpep_pickup_datetime'. The frame is modified in place.

    Returns:
        The same DataFrame object, now carrying a float 'duration' column.
    """
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    # Vectorized conversion: avoids a Python-level lambda per row and yields a
    # float column even when the frame is empty.
    df['duration'] = elapsed.dt.total_seconds() / 60
    return df




In [9]:
def filter_duration_outliers(df: pd.DataFrame, min_duration: float, max_duration: float) -> pd.DataFrame:
    """
    Keep only the rows whose 'duration' lies inside a closed interval.

    Args:
        df: DataFrame containing a 'duration' column.
        min_duration: Lower bound; rows equal to it are kept.
        max_duration: Upper bound; rows equal to it are kept.

    Returns:
        An independent copy of the rows that fall within the bounds.
    """
    # Series.between is inclusive on both ends by default.
    within_bounds = df['duration'].between(min_duration, max_duration)
    return df[within_bounds].copy()



In [10]:
def calculate_fraction_remaining(original_df: pd.DataFrame, filtered_df: pd.DataFrame) -> float:
    """
    Calculates the fraction of records remaining after filtering.

    Args:
        original_df: The DataFrame before filtering.
        filtered_df: The DataFrame after filtering.

    Returns:
        ``len(filtered_df) / len(original_df)``, or 0.0 when the original
        DataFrame is empty (nothing could remain, and dividing by zero
        would otherwise raise ZeroDivisionError).
    """
    total_records_before = len(original_df)
    if total_records_before == 0:
        # Guard the division: an empty input has no records to keep.
        return 0.0
    return len(filtered_df) / total_records_before



In [11]:
def prepare_features_train(df: pd.DataFrame, categorical_cols: list, numerical_cols: list):
    """
    Prepares features for model training by converting categorical columns to string
    and applying DictVectorizer. It fits the DictVectorizer.

    The conversion is done on a copy of the selected columns, so the caller's
    DataFrame is left untouched.

    Args:
        df: The input DataFrame (training data). Not modified.
        categorical_cols: A list of column names to treat as categorical.
        numerical_cols: A list of column names to treat as numerical.

    Returns:
        A tuple containing:
            - X_train: The feature matrix returned by ``dv.fit_transform``.
            - dv: The fitted DictVectorizer object.
    """
    # Copy only the needed columns so the string cast below does not leak
    # back into the caller's frame.
    features = df[categorical_cols + numerical_cols].copy()
    features[categorical_cols] = features[categorical_cols].astype(str)
    train_dicts = features.to_dict(orient='records')
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    return X_train, dv

def prepare_features_val(df: pd.DataFrame, dv: DictVectorizer, categorical_cols: list, numerical_cols: list):
    """
    Prepares features for model validation by converting categorical columns to string
    and applying the already fitted DictVectorizer.

    The conversion is done on a copy of the selected columns, so the caller's
    DataFrame is left untouched.

    Args:
        df: The input DataFrame (validation data). Not modified.
        dv: The DictVectorizer fitted on the training data.
        categorical_cols: A list of column names to treat as categorical.
        numerical_cols: A list of column names to treat as numerical.

    Returns:
        X_val: The feature matrix returned by ``dv.transform``.
    """
    # Copy only the needed columns so the string cast below does not leak
    # back into the caller's frame.
    features = df[categorical_cols + numerical_cols].copy()
    features[categorical_cols] = features[categorical_cols].astype(str)
    val_dicts = features.to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    return X_val

In [12]:
def train_model(X_train, y_train: np.ndarray):
    """
    Fit an ordinary LinearRegression on the supplied training data.

    Args:
        X_train: Feature matrix used for fitting.
        y_train: Target values aligned with the rows of ``X_train``.

    Returns:
        The fitted LinearRegression instance.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return regressor

In [13]:
def evaluate_model(model, X: np.ndarray, y_true: np.ndarray) -> tuple:
    """
    Predict with a fitted model and score the predictions with RMSE.

    Args:
        model: A fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y_true: Ground-truth targets aligned with the rows of ``X``.

    Returns:
        ``(y_pred, rmse)``: the model's predictions and the square root of
        the mean squared error between ``y_true`` and those predictions.
    """
    predictions = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y_true, predictions))
    return predictions, rmse

In [14]:
def plot_duration_histogram(df: pd.DataFrame, column_name: str, title: str):
    """
    Draw and show a 30-bin histogram (with KDE overlay) of one column.

    Args:
        df: DataFrame holding the data.
        column_name: Column whose values are plotted; also used, prettified,
            as the x-axis label.
        title: Text placed above the plot.
    """
    plt.figure(figsize=(10, 6))
    values = df[column_name]
    sns.histplot(values, bins=30, kde=True)
    plt.title(title)
    # e.g. "trip_duration" -> "Trip Duration (minutes)"
    pretty_name = column_name.replace("_", " ").title()
    plt.xlabel(f'{pretty_name} (minutes)')
    plt.ylabel('Frequency')
    plt.show()

In [15]:
def plot_duration_boxplot(df: pd.DataFrame, column_name: str, title: str):
    """
    Generates and displays a boxplot for a given column in a DataFrame.

    Args:
        df: The input DataFrame.
        column_name: The name of the column to plot; also used, prettified,
            as the x-axis label.
        title: The title of the boxplot.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=df[column_name])
    plt.title(title)
    plt.xlabel(f'{column_name.replace("_", " ").title()} (minutes)')
    # Show once; the original called plt.show() a second time by mistake.
    plt.show()


In [16]:
def plot_predictions_vs_actual(y_pred: np.ndarray, y_actual: np.ndarray, title_suffix: str = ""):
    """
    Generates and displays a distribution plot comparing predictions and actual values.

    Args:
        y_pred: The predicted values.
        y_actual: The actual values.
        title_suffix: Optional text appended to the plot title, e.g.
            " (Lasso Regression Train)". Defaults to no suffix.
    """
    plt.figure(figsize=(10, 6))
    sns.histplot(y_pred, label='prediction', kde=True, color='skyblue')
    sns.histplot(y_actual, label='actual', kde=True, color='lightcoral')
    plt.title('Distribution of Predictions vs. Actual Values' + title_suffix)
    plt.xlabel('Duration (minutes)')
    plt.ylabel('Frequency')
    plt.legend()
    # Display the figure, as the other plotting helpers in this file do.
    plt.show()

In [33]:
# End-to-end example: load two months of green-taxi trips, train on January,
# validate on February, compare Linear/Lasso/Ridge regressors by RMSE, then
# pickle the Linear Regression model and its DictVectorizer and reload them.
if __name__ == '__main__':
    # Parquet files are read directly from these GitHub URLs (network access required)
    green_taxi_jan_2025 = "https://github.com/Chukwuka1488/MLOps_DTC/raw/main/01-intro/nyc_data/green_tripdata_2025-01.parquet"
    green_taxi_feb_2025 = "https://github.com/Chukwuka1488/MLOps_DTC/raw/main/01-intro/nyc_data/green_tripdata_2025-02.parquet"

    # Load data
    df_jan = load_data(green_taxi_jan_2025)
    df_feb = load_data(green_taxi_feb_2025)

    # --- Preprocessing for Training Data (January) ---
    # Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
    df_jan = compute_duration(df_jan)
    df_jan_filtered = filter_duration_outliers(df_jan, 1, 60)

    # Calculate fraction remaining for training data
    fraction_jan = calculate_fraction_remaining(df_jan, df_jan_filtered)
    print(f"January - Total records before filtering: {len(df_jan)}")
    print(f"January - Total records after filtering: {len(df_jan_filtered)}")
    print(f"January - Fraction of records remaining: {fraction_jan:.2f}")

    # --- Preprocessing for Validation Data (February) ---
    # Same 1-60 minute window as the training data
    df_feb = compute_duration(df_feb)
    df_feb_filtered = filter_duration_outliers(df_feb, 1, 60)

    # Calculate fraction remaining for validation data
    fraction_feb = calculate_fraction_remaining(df_feb, df_feb_filtered)
    print(f"\nFebruary - Total records before filtering: {len(df_feb)}")
    print(f"February - Total records after filtering: {len(df_feb_filtered)}")
    print(f"February - Fraction of records remaining: {fraction_feb:.2f}")

    # Define categorical and numerical features (common for both train and val)
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']

    # Prepare features for training data (fit_transform)
    X_train, dv = prepare_features_train(df_jan_filtered, categorical, numerical)
    # print("\nTraining Feature names:\n", dv.get_feature_names_out())
    # print("\nTraining Feature matrix (first 5 rows):\n", X_train[:5])
    print("\nDimensionality of the training feature matrix (number of columns):", X_train.shape[1])

    # Prepare features for validation data (transform using the fitted dv,
    # so the validation matrix has the same columns as the training matrix)
    X_val = prepare_features_val(df_feb_filtered, dv, categorical, numerical)
    # print("\nValidation Feature matrix (first 5 rows):\n", X_val[:5])
    print("\nDimensionality of the validation feature matrix (number of columns):", X_val.shape[1])


    # Extract target variable (trip duration in minutes)
    y_train = df_jan_filtered['duration'].values
    y_val = df_feb_filtered['duration'].values

    # Train the baseline Linear Regression model
    model = train_model(X_train, y_train)

    # Evaluate model on training data
    y_pred_train, rmse_train = evaluate_model(model, X_train, y_train)
    print(f"\nRMSE of the model on the training data: {rmse_train:.2f} minutes")

    # Evaluate model on validation data
    y_pred_val, rmse_val = evaluate_model(model, X_val, y_val)
    print(f"\nRMSE of the model on the validation data: {rmse_val:.2f} minutes")

    # Optional: plot distributions for training predictions vs actual
    # plot_predictions_vs_actual(y_pred_train, y_train)

    # Optional: plot distributions for validation predictions vs actual
    # plot_predictions_vs_actual(y_pred_val, y_val)

    # Optional: plot original and filtered duration distributions
    # plot_duration_histogram(df_jan, 'duration', 'Distribution of Duration (Original Jan)')
    # plot_duration_boxplot(df_jan, 'duration', 'Boxplot of Duration (Original Jan)')
    # plot_duration_histogram(df_jan_filtered, 'duration', 'Distribution of Duration (Filtered Jan)')
    # plot_duration_boxplot(df_jan_filtered, 'duration', 'Boxplot of Duration (Filtered Jan)')
    # plot_duration_histogram(df_feb, 'duration', 'Distribution of Duration (Original Feb)')
    # plot_duration_boxplot(df_feb, 'duration', 'Boxplot of Duration (Original Feb)')
    # plot_duration_histogram(df_feb_filtered, 'duration', 'Distribution of Duration (Filtered Feb)')
    # plot_duration_boxplot(df_feb_filtered, 'duration', 'Boxplot of Duration (Filtered Feb)')

    # --- Train and Evaluate Lasso Regression ---
    print("\n--- Lasso Regression Results ---")
    # You can experiment with different alpha values.
    # A larger alpha means stronger regularization (more coefficients forced to zero).
    # A common starting point is 1.0, or you might try smaller values like 0.1, 0.01.
    # The Lasso model is instantiated and fitted directly (train_model only builds LinearRegression)
    lasso_model = Lasso(alpha=0.1)
    lasso_model.fit(X_train, y_train)

    # Evaluate Lasso on training data
    y_pred_train_lasso, rmse_train_lasso = evaluate_model(lasso_model, X_train, y_train)
    print(f"RMSE of the Lasso model on the training data: {rmse_train_lasso:.2f} minutes")

    # Evaluate Lasso on validation data
    y_pred_val_lasso, rmse_val_lasso = evaluate_model(lasso_model, X_val, y_val)
    print(f"RMSE of the Lasso model on the validation data: {rmse_val_lasso:.2f} minutes")

    # Report how many coefficients Lasso kept non-zero, out of all features
    print(f"Number of non-zero coefficients for Lasso: {np.sum(lasso_model.coef_ != 0)}")
    print(f"Total number of features: {X_train.shape[1]}")

    # Optional: plot distributions for Lasso Regression (train, then validation)
    # plot_predictions_vs_actual(y_pred_train_lasso, y_train)
    # plot_predictions_vs_actual(y_pred_val_lasso, y_val)

    # --- Train and Evaluate Ridge Regression ---
    print("\n--- Ridge Regression Results ---")
    # You can experiment with different alpha values.
    # A larger alpha means stronger regularization (more coefficients are shrunk).
    # A common starting point is 1.0, or you might try smaller values like 0.1, 0.01, 10.
    # The Ridge model is instantiated and fitted directly (train_model only builds LinearRegression)
    ridge_model = Ridge(alpha=1) # Instantiate the Ridge model
    ridge_model.fit(X_train, y_train) # Fit the model with training data



    # Evaluate Ridge on training data
    y_pred_train_ridge, rmse_train_ridge = evaluate_model(ridge_model, X_train, y_train)
    print(f"RMSE of the Ridge model on the training data: {rmse_train_ridge:.2f} minutes")

    # Evaluate Ridge on validation data
    y_pred_val_ridge, rmse_val_ridge = evaluate_model(ridge_model, X_val, y_val)
    print(f"RMSE of the Ridge model on the validation data: {rmse_val_ridge:.2f} minutes")

    # Ridge typically does not set coefficients to exactly zero, but shrinks them.
    # We can still look at the magnitude of coefficients if desired.
    # print(f"Number of non-zero coefficients for Ridge: {np.sum(ridge_model.coef_ != 0)}") # This will likely be Total number of features

    # Optional: plot distributions for Ridge Regression (train, then validation)
    # plot_predictions_vs_actual(y_pred_train_ridge, y_train)
    # plot_predictions_vs_actual(y_pred_val_ridge, y_val)


    # --- Saving the Model and DictVectorizer ---
    # Only the Linear Regression model (`model`) and the DictVectorizer are saved here.
    # You might choose Lasso or Ridge after tuning, if they perform better.
    # Both files are written to the current working directory.
    output_file_model = 'linear_regression_model.bin'
    output_file_dv = 'dict_vectorizer.bin'

    print(f"\nSaving the Linear Regression model to {output_file_model}")
    with open(output_file_model, 'wb') as f_out:
        pickle.dump(model, f_out)
    print("Model saved successfully.")

    print(f"Saving the DictVectorizer to {output_file_dv}")
    with open(output_file_dv, 'wb') as f_out:
        pickle.dump(dv, f_out)
    print("DictVectorizer saved successfully.")

    # --- Example of Loading the Model and DictVectorizer ---
    # NOTE: pickle.load can execute arbitrary code, so only unpickle files you
    # trust. Here the files being read are the ones written just above.
    print("\n--- Testing Model Loading ---")
    try:
        with open(output_file_model, 'rb') as f_in:
            loaded_model = pickle.load(f_in)
        print("Model loaded successfully.")

        with open(output_file_dv, 'rb') as f_in:
            loaded_dv = pickle.load(f_in)
        print("DictVectorizer loaded successfully.")

        # Sanity check: predict with the reloaded model on the first 5 rows of the
        # already-vectorized validation matrix (loaded_dv itself is not used below)
        sample_X_val = X_val[:5]
        sample_y_pred = loaded_model.predict(sample_X_val)
        print(f"\nPredictions on first 5 validation samples using loaded model: {sample_y_pred.round(2)}")
        print(f"Actual values for first 5 validation samples: {y_val[:5].round(2)}")

    except FileNotFoundError:
        print("Saved model or DictVectorizer files not found. Please ensure they were saved correctly.")
    # Broad catch: any other failure is printed rather than raised
    except Exception as e:
        print(f"An error occurred during loading or prediction: {e}")


January - Total records before filtering: 48326
January - Total records after filtering: 46307
January - Fraction of records remaining: 0.96

February - Total records before filtering: 46621
February - Total records after filtering: 44218
February - Fraction of records remaining: 0.95

Dimensionality of the training feature matrix (number of columns): 449

Dimensionality of the validation feature matrix (number of columns): 449

RMSE of the model on the training data: 6.70 minutes

RMSE of the model on the validation data: 7.26 minutes

--- Lasso Regression Results ---
RMSE of the Lasso model on the training data: 8.21 minutes
RMSE of the Lasso model on the validation data: 8.69 minutes
Number of non-zero coefficients for Lasso: 12
Total number of features: 449

--- Ridge Regression Results ---
RMSE of the Ridge model on the training data: 6.68 minutes
RMSE of the Ridge model on the validation data: 7.26 minutes

Saving the Linear Regression model to linear_regression_model.bin
Model s

The validation RMSE (7.26 minutes) is slightly higher than the training RMSE (6.70 minutes). This is a common and generally expected outcome: it suggests that the model generalizes to new data reasonably well, without significant overfitting. If the validation RMSE were much higher than the training RMSE, it would be a strong indicator of overfitting — the model would have learned the training data too well, including its noise, and would struggle with new, slightly different data. A gap of about 0.56 minutes is quite acceptable.

### Evaluating the model