In [3]:
import pandas as pd
 

In [3]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.0-cp39-cp39-manylinux_2_28_x86_64.whl (42.1 MB)
[K     |████████████████████████████████| 42.1 MB 33.7 MB/s eta 0:00:01
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.0


In [4]:
jan_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')
jan_df.head()
display(jan_df.info())

# Creating a duration column in minutes
jan_df["duration"] = (jan_df["tpep_dropoff_datetime"] - jan_df["tpep_pickup_datetime"]).dt.total_seconds() / 60

# Checking the column
print(jan_df["duration"].head())

# Calculating standard deviaton
std_duration = jan_df["duration"].std()

print(f"Standard Deviation: {std_duration:.2f} minutes")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964624 entries, 0 to 2964623
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

None

0    19.800000
1     6.600000
2    17.916667
3     8.300000
4     6.100000
Name: duration, dtype: float64
Standard Deviation: 34.85 minutes


In [5]:
# Removing outliers: Keeping only trips with duration between 1 and 60 minutes
filtered_jan_df = jan_df[(jan_df["duration"] >= 1) & (jan_df["duration"] <= 60)]

# Calculating the fraction of records left
fraction_left = len(filtered_jan_df) / len(jan_df)
print(f"Fraction of records left: {fraction_left:.2%}")

Fraction of records left: 97.78%


In [8]:
from sklearn.feature_extraction import DictVectorizer

# Converting pickup and dropoff location IDs to string
filtered_jan_df.loc[:, "PULocationID"] = filtered_jan_df["PULocationID"].astype(str)
filtered_jan_df.loc[:, "DOLocationID"] = filtered_jan_df["DOLocationID"].astype(str)

# Creating a list of dictionaries
data_dicts = filtered_jan_df[["PULocationID", "DOLocationID"]].to_dict(orient="records")

# Applying one-hot encoding using DictVectorizer
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(data_dicts)

# Checking the dimensionality of the matrix
num_columns = X_train.shape[1]
print(f"Number of columns: {num_columns}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_jan_df.loc[:, "PULocationID"] = filtered_jan_df["PULocationID"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_jan_df.loc[:, "DOLocationID"] = filtered_jan_df["DOLocationID"].astype(str)


Number of columns: 518


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Defining the target variable (duration)
y_train = filtered_jan_df["duration"].values

# Training a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Making predictions on the training set
y_train_pred = lr.predict(X_train)

# Calculating RMSE on training data
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"RMSE on training data: {rmse_train:.2f}")


RMSE on training data: 7.95


In [None]:
# Load the validation dataset (Feb 2023)
feb_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet')
feb_df.head()

# Create duration column in minutes
feb_df["duration"] = (feb_df["tpep_dropoff_datetime"] - feb_df["tpep_pickup_datetime"]).dt.total_seconds() / 60

# Remove outliers (duration between 1 and 60 minutes)
filtered_feb_df = feb_df[(feb_df["duration"] >= 1) & (feb_df["duration"] <= 60)]

# Convert pickup and dropoff IDs to string
filtered_feb_df["PULocationID"] = filtered_feb_df["PULocationID"].astype(str)
filtered_feb_df["DOLocationID"] = filtered_feb_df["DOLocationID"].astype(str)

# Transform validation data using the trained DictVectorizer
val_dicts = filtered_feb_df[["PULocationID", "DOLocationID"]].to_dict(orient="records")
X_val = dv.transform(val_dicts)

# Predicting on validation set
y_val = filtered_feb_df["duration"].values
y_val_pred = lr.predict(X_val)

# Calculating RMSE on validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"RMSE on validation data: {rmse_val:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_feb_df["PULocationID"] = filtered_feb_df["PULocationID"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_feb_df["DOLocationID"] = filtered_feb_df["DOLocationID"].astype(str)
