In [1]:
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

### FUNCTIONS 

In [2]:
def read_data(filename):

    """
    This function reads a dataset from a Parquet file and returns a DataFrame.

    Args:
    filename: The path to the Parquet file.

    Returns:
    A DataFrame containing the data from the Parquet file.
    """

    # Import the necessary modules.
    #import pandas as pd

    # Check if the filename is a valid path to a Parquet file.
    if not os.path.exists(filename):
        raise FileNotFoundError("The file {} does not exist.".format(filename))

    # Read the data from the Parquet file.
    try:
        data = pd.read_parquet(filename)
    except Exception as e:
        raise ValueError("An error occurred while reading the file {}: {}".format(filename, e))

    # Calculate the trip duration.
    data['trip_duration'] = data.tpep_dropoff_datetime - data.tpep_pickup_datetime

    # Convert the trip duration to minutes.
    data['trip_duration'] = data.trip_duration.apply(lambda x: x.total_seconds()/60)

    # Filter the data to include only trips that lasted between 1 and 60 minutes.
    data = data[(data['trip_duration'] >= 1) & (data['trip_duration'] <= 60)]

    # Convert the pickup and dropoff location IDs to strings.
    data[['PULocationID','DOLocationID']] = data[['PULocationID','DOLocationID']].astype(str)

    # Return the DataFrame.
    return data

In [3]:
def prep_data(data):

    """
    This function prepares the data for machine learning.

    Args:
    data: The DataFrame containing the data.

    Returns:
    A tuple of NumPy arrays containing the features and target values.
    """

    # Check if the data is a DataFrame.
    if not isinstance(data, pd.DataFrame):
        raise TypeError("The data must be a DataFrame.")

    # Create a list of the categorical columns.
    cat_cols = ['PU_DO']

    # Convert the categorical columns to strings.
    data[cat_cols] = data[cat_cols].astype(str)

    # Create a list of the numerical columns.
    num_cols = ['trip_distance']

    # Convert the numerical columns to NumPy arrays.
    num_data = data[num_cols].values

    # Create a list of all the columns.
    global cols
    cols = cat_cols + num_cols

    # Convert the DataFrame to a dictionary format.
    global train_dicts
    train_dicts = data[cat_cols].to_dict(orient='records')

    # Create a DictVectorizer object.
    dv = DictVectorizer()

    # Fit the DictVectorizer object to the data.
    global x
    x = dv.fit_transform(train_dicts)

    # Get the target values.
    global y 
    y = data['trip_distance'].values

    # Return the features and target values.
    return x, y

In [4]:
def train(x, y, model):

    """
    This function trains a machine learning model on the given data.

    Args:
    x: The training data.
    y: The target values for the training data.
    model: The machine learning model to train.

    Returns:
    The trained machine learning model.
    """

    # Import the necessary modules.
    from sklearn.model_selection import train_test_split

    # Split the data into a training set and a test set.
    global y_train
    X_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    # Choose the model to train.
    if model == 'LinearRegression':
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
    elif model == 'Lasso':
        from sklearn.linear_model import Lasso
        model = Lasso()
    else:
        from sklearn.linear_model import Ridge
        model = Ridge()

    # Fit the model to the training data.
    model.fit(X_train, y_train)

    # Predict the target values for the test data.
    global pred
    pred = model.predict(x_test)

    # Calculate the mean squared error and accuracy score of the predictions.
    mse = mean_squared_error(y_test, pred)
    score = model.score(x_test,pred)

    # Print the mean squared error.
    print(f'Mean squared error score: {mse}')
    print(f'accuracy score: {score}')

    # Return the trained model.
    return model

In [5]:
#reading in dataset
df_train = read_data('yellow_tripdata_2022-01.parquet')
df_val = read_data('yellow_tripdata_2022-02.parquet')

len(df_train), len(df_val)

(2421440, 2918187)

### EDA

### FEATURE ENGINEERING

In [6]:
categorical = ['PU_DO'] 
numerical = ['trip_distance']

### DATA PREP

In [7]:
prep_data(df_train)

KeyError: "None of [Index(['PU_DO'], dtype='object')] are in the [columns]"

In [None]:
checker = df_train[cols]

In [None]:
checker.info()

In [None]:
train_dicts

### MODELLING

In [None]:
# ols
train(x,y,'LinearRegression')

In [None]:
#
sb.histplot(y_train,label="actual_values",kde=True,stat="density",
            kde_kws=dict(cut=3), bins=50,alpha=.4, edgecolor=(1, 1, 1, 0.4))
sb.histplot(pred,label="predicted_values",kde=True,stat="density",
            kde_kws=dict(cut=3), bins=50,alpha=.4, edgecolor=(1, 1, 1, 0.4))
plt.legend()

In [None]:
# lasso regression
train(x,y,'Lasso')

In [None]:
#ridge regression
train(x,y,'Ridge')

In [None]:
with open('model.pkl','wb') as fb:
    joblib.dump(model,file)