# LAB | Introduction to MLOps

Answer the questions below.

Data source: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [2]:
import pandas as pd

In [3]:
pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load the January and February 2021 FHV trip-record Parquet files.
jan_path = './data/fhv_tripdata_2021-01.parquet'
feb_path = './data/fhv_tripdata_2021-02.parquet'
df_jan = pd.read_parquet(jan_path)
df_feb = pd.read_parquet(feb_path)

**Q1: Read the data for January. How many records are there?**

In [5]:
# Number of records in January = first entry of the DataFrame's shape.
count_rows = df_jan.shape[0]
print("count rows:", count_rows)

count rows: 1154112


**Q2: What's the average trip duration in January?**

In [6]:
# Preview the first five rows to see which columns are available.
df_jan.head(5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [10]:
# Trip duration as a Timedelta: drop-off time minus pickup time.
df_jan["trip_duration"] = df_jan["dropOff_datetime"] - df_jan["pickup_datetime"]
# Convert to minutes with the vectorized .dt accessor instead of a per-row
# Python lambda (one Python call per record is slow on large frames).
df_jan['duration_mins'] = df_jan['trip_duration'].dt.total_seconds() / 60
# Mean trip duration in minutes over all January records.
average_duration = df_jan["duration_mins"].mean()
print(average_duration)

19.167224093791013


**Q3: How many records did you drop?**

In [None]:
# Answer: 0 — no records were dropped;
# df_jan still contains every row that was loaded.

NameError: name 'none' is not defined

**Q4: What's the fraction of missing values for the pickup location ID? I.e., the fraction of "-1"s after you filled the NAs.**

In [None]:
# Replace missing pickup location IDs with the sentinel -1 and write the
# filled column back into df_jan.
pickup_ids = df_jan['PUlocationID'].fillna(-1)
df_jan['PUlocationID'] = pickup_ids
# The mean of the boolean mask is the share of rows carrying the sentinel.
fraction_missing = pickup_ids.eq(-1).mean()
print("Fraction of missing (as -1):", fraction_missing)


Fraction of missing (as -1): 0.8303067639882438


In [None]:
# Keep only the rows whose pickup location ID is not the -1 sentinel.
has_pickup_id = df_jan['PUlocationID'].ne(-1)
df_jan_wo_missing_values = df_jan.loc[has_pickup_id]

**Q5: What's the dimensionality of this matrix? (The number of columns).**

In [None]:
# your code here
count_columns= len(df_jan.columns)
print("count columns:", count_columns)

count columns: 8


In [13]:
# Show the first five rows again, now including the derived duration columns.
df_jan.head(5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,trip_duration,duration_mins
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,0 days 00:17:00,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,0 days 00:17:00,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,0 days 01:50:00,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,0 days 00:08:17,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,0 days 00:15:13,15.216667


**Q6: What's the RMSE on train?**

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Feature and target: the drop-off timestamp converted to minutes since the
# Unix epoch (assumes nanosecond-resolution timestamps) and the trip
# duration in minutes.
dropoff_epoch_seconds = df_jan["dropOff_datetime"].astype('int64') // 10**9
X = (dropoff_epoch_seconds / 60).values.reshape(-1, 1)  # sklearn expects a 2D array
y = df_jan['duration_mins']

# 1. Train/test split: 80 % training, 20 % test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Train the model (fit() returns the fitted estimator itself)
lin_reg = LinearRegression().fit(X_train, y_train)

# 3. Predict on the held-out split and compute the RMSE
y_pred = lin_reg.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)

print(f"Test RMSE: {rmse:.4f}")


def preprocess_datetime(df, datetime_col='dropOff_datetime'):
    """
    Convert a datetime column into a single numeric feature: minutes since
    the Unix epoch (sub-second precision is discarded before the division).

    Parameters:
        df (pd.DataFrame): Input data
        datetime_col (str): Name of the datetime column

    Returns:
        np.ndarray: 2D array of shape (n_rows, 1) holding the feature
    """
    timestamps = df[datetime_col]
    # The integer view of a datetime column is expressed in the column's own
    # resolution. pandas >= 2 can carry non-nanosecond datetimes (e.g.
    # datetime64[us]), so normalise to nanoseconds before casting; otherwise
    # the 10**9 divisor below would yield the wrong scale.
    if pd.api.types.is_datetime64_any_dtype(timestamps) and hasattr(timestamps.dt, "as_unit"):
        timestamps = timestamps.dt.as_unit("ns")
    # Nanoseconds -> whole seconds -> minutes since the Unix epoch
    epoch_seconds = timestamps.astype('int64') // 10**9
    X = epoch_seconds / 60
    # sklearn estimators expect a 2D array (n_samples, n_features)
    X = X.values.reshape(-1, 1)
    return X



Test RMSE: 883.9988


Now, let's put the data preprocessing steps in a function so that we can process the validation set in the same way as well.

In [None]:
# your code here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Target variable
y = df_jan['duration_mins']

# Train/validation split on the full frame: 80 % train, 20 % validation,
# fixed seed for reproducibility
df_train, df_val, y_train, y_val = train_test_split(df_jan, y, test_size=0.2, random_state=42)

# Feature engineering: run both splits through the same preprocessing function
X_train, X_val = (
    preprocess_datetime(split, 'dropOff_datetime') for split in (df_train, df_val)
)

# Train the model (fit() returns the fitted estimator itself)
lin_reg = LinearRegression().fit(X_train, y_train)

# Predictions on the validation split
y_pred = lin_reg.predict(X_val)




Validation RMSE: 883.9988


**Q7: What's the RMSE on validation?**

In [24]:
# Compute the RMSE: square root of the mean squared error on the validation split
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 883.9988


## Why use MLOps? What did we learn from this exercise? ##

In [None]:
# your answer here

## BONUS: 

Now, try to run this notebook on an AWS instance.