## MLOps Zoomcamp: Homework for Module 1

By: Matheus Moreno

In [34]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [35]:
CATEGORICAL = ['PUlocationID', 'DOlocationID']
TARGET = 'duration'

In [36]:
df_jan = pd.read_parquet('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet')
df_feb = pd.read_parquet('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet')

In [45]:
def read_dataframe(path: str) -> pd.DataFrame:
    # Retrieve Parquet file
    dataframe = pd.read_parquet(path)
    print(f"Retrieved dataset {path} of size {dataframe.shape}.")

    # Generate duration column
    dataframe['duration'] = dataframe.dropOff_datetime - dataframe.pickup_datetime
    dataframe.duration = dataframe.duration.apply(lambda t: t.total_seconds() / 60)
    print(f"Mean trip duration time: {dataframe.duration.mean()}m")

    # Filter outliers
    dataframe = dataframe[(dataframe.duration >= 1) & (dataframe.duration <= 60)].copy()

    # Fill and convert categorical values
    dataframe[CATEGORICAL] = dataframe[CATEGORICAL].fillna(-1)
    dataframe[CATEGORICAL] = dataframe[CATEGORICAL].astype(str)
    print(f"Percentage of values: {(df_jan.PUlocationID.value_counts(normalize=True) * 100)}%")

    return dataframe

In [47]:
def train(x_train: pd.DataFrame, y_train: pd.DataFrame) -> LinearRegression:
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    return lr

In [48]:
df_jan = read_dataframe('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet')

Retrieved dataset https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet of size (1154112, 7).
Mean trip duration time: 19.167224093791006m
Percentage of values: 221.0    4.366208
206.0    3.639613
129.0    2.785366
92.0     2.667416
115.0    2.095535
           ...   
111.0    0.003064
34.0     0.002553
27.0     0.002042
2.0      0.001021
110.0    0.000511
Name: PUlocationID, Length: 261, dtype: float64%


In [49]:
df_feb = read_dataframe('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet')

Retrieved dataset https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet of size (1037692, 7).
Mean trip duration time: 20.706986225199763m
Percentage of values: -1.0     83.527328
221.0     0.750568
206.0     0.612438
129.0     0.484671
115.0     0.367805
           ...    
111.0     0.000451
27.0      0.000360
34.0      0.000270
2.0       0.000180
110.0     0.000090
Name: PUlocationID, Length: 262, dtype: float64%


In [53]:
train_dicts = df_jan[CATEGORICAL].to_dict(orient='records')
vectorizer = DictVectorizer()

x_train = vectorizer.fit_transform(train_dicts)
model = train(x_train, df_jan[TARGET].values)

y_pred = model.predict(x_train)

mean_squared_error(df_jan[TARGET].values, y_pred, squared=False)

10.528519395347283

In [54]:
train_dicts = df_feb[CATEGORICAL].to_dict(orient='records')
x_val = vectorizer.transform(train_dicts)

y_pred_val = model.predict(x_val)

mean_squared_error(df_feb[TARGET].values, y_pred_val, squared=False)

11.014287883358007