In [19]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Downloading the data 

In [2]:
# Read the January 2023 trip file from the working directory and preview the first rows.
jan_2023_yellow = pd.read_parquet("yellow_tripdata_2023-01.parquet")
jan_2023_yellow.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
# Number of columns in the raw January frame (a bare last expression is displayed by the notebook).
len(jan_2023_yellow.columns)

19


# Computing Duration

In [4]:
# jan_2023_yellow.dtypes

In [5]:
# Trip duration as a timedelta: drop-off timestamp minus pickup timestamp.
jan_2023_yellow['duration'] = jan_2023_yellow['tpep_dropoff_datetime'] - jan_2023_yellow['tpep_pickup_datetime']
jan_2023_yellow['duration'].head()

0   0 days 00:08:26
1   0 days 00:06:19
2   0 days 00:12:45
3   0 days 00:09:37
4   0 days 00:10:50
Name: duration, dtype: timedelta64[us]

In [6]:
# Standard deviation of the trip durations.
jan_2023_yellow['duration'].std()

Timedelta('0 days 00:42:35.661074')

In [7]:
# Express the trip duration in minutes.
# Recomputing from the two timestamp columns (instead of converting the existing
# 'duration' column) keeps this cell safe to re-run, and the vectorized
# .dt.total_seconds() avoids calling a Python lambda once per row.
jan_2023_yellow['duration'] = (
    jan_2023_yellow['tpep_dropoff_datetime'] - jan_2023_yellow['tpep_pickup_datetime']
).dt.total_seconds() / 60
jan_2023_yellow['duration'].std()

42.59435124195458

In [8]:
# Inspect the duration column (float minutes at this point in the notebook).
jan_2023_yellow['duration']

0           8.433333
1           6.316667
2          12.750000
3           9.616667
4          10.833333
             ...    
3066761    13.983333
3066762    19.450000
3066763    24.516667
3066764    13.000000
3066765    14.400000
Name: duration, Length: 3066766, dtype: float64

# Dropping Outliers

In [9]:
# Longest and shortest trip durations, to see how extreme the outliers are.
jan_2023_yellow['duration'].max(),jan_2023_yellow['duration'].min()

(10029.183333333332, -29.2)

In [10]:
# Keep only trips lasting strictly more than 1 and strictly less than 60 minutes,
# then report the fraction of rows that survived the filter.
original_rows = len(jan_2023_yellow)
jan_2023_yellow = jan_2023_yellow.loc[
    (jan_2023_yellow['duration'] > 1) & (jan_2023_yellow['duration'] < 60)
]
final_rows = len(jan_2023_yellow)
final_rows / original_rows

0.9811146334607858

# One-hot encoding

In [11]:
# Keep only the pickup/drop-off location IDs and cast them to strings so that
# they are handled as categories rather than numbers when vectorized.
required_columns = ['PULocationID', 'DOLocationID']
reduced_columns_df = jan_2023_yellow[required_columns].astype('str')
reduced_columns_df

Unnamed: 0,PULocationID,DOLocationID
0,161,141
1,43,237
2,48,238
3,138,7
4,107,79
...,...,...
3066761,107,48
3066762,112,75
3066763,114,239
3066764,230,79


In [12]:
# Turn each row into a {column: value} dict and fit the vectorizer on those records.
# Heads-up: this cell takes about half a minute to run.
dict_vec = DictVectorizer()
training_data_dict = reduced_columns_df.to_dict(orient='records')
X_train = dict_vec.fit_transform(training_data_dict)
X_train

<3008849x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6017698 stored elements in Compressed Sparse Row format>

# Train a model

In [13]:
# Regression target: the filtered trip durations (minutes) as an array.
y_train = jan_2023_yellow['duration'].values
y_train

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [None]:
# Fit a linear regression of trip duration on the vectorized location features.
model = LinearRegression()
model.fit(X_train, y_train)

In [20]:
# In-sample error: predict on the training matrix and compare with the training target.
train_preds = model.predict(X_train)
# The metric's signature is (y_true, y_pred); pass the ground truth first.
train_rmse = root_mean_squared_error(y_train, train_preds)
train_rmse

7.647512040017662

# Evaluate the model

In [None]:
def load_data(path):
    """Read a trip parquet file, add the trip duration, and drop outliers.

    Parameters
    ----------
    path : str or path-like
        Passed unchanged to ``pd.read_parquet``. The file must provide the
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns.

    Returns
    -------
    data : pandas.DataFrame
        Rows whose duration is strictly greater than 1 and strictly less than
        60 minutes, with an added float ``duration`` column (minutes).
    ground_truth : numpy.ndarray
        The ``duration`` values of the returned rows.

    Notes
    -----
    Progress messages are printed after each stage.
    """
    data = pd.read_parquet(path)
    print('data read in')

    # Add duration: vectorized timedelta -> minutes (no per-row Python lambda).
    trip_time = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
    data['duration'] = trip_time.dt.total_seconds() / 60
    print('duration of trip calculated (in minutes)')

    # Outliers: keep trips strictly between 1 and 60 minutes.
    original_rows = len(data)
    keep = (data['duration'] > 1) & (data['duration'] < 60)
    data = data[keep]
    final_rows = len(data)
    # An empty input file would otherwise raise ZeroDivisionError here;
    # report 0% retained in that case.
    retained_data = final_rows / original_rows if original_rows else 0.0
    print(f'Outliers removed. {100*retained_data:.2f}% data retained')

    # ground_truth
    ground_truth = data['duration'].values
    return data, ground_truth
    
def prep_as_model_input(data, dict_vec=None, to_fit=False):
    """Vectorize the pickup/drop-off location IDs of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``PULocationID`` and ``DOLocationID`` columns.
    dict_vec : DictVectorizer, optional
        An already fitted vectorizer. Required when ``to_fit`` is False;
        ignored (replaced by a fresh one) when ``to_fit`` is True.
    to_fit : bool, default False
        If True, create a new ``DictVectorizer`` and fit it on ``data``.
        If False, only transform ``data`` with the supplied ``dict_vec``.

    Returns
    -------
    X_ : feature matrix
        Whatever ``fit_transform`` / ``transform`` of the vectorizer returns.
    dict_vec : DictVectorizer
        The vectorizer that produced ``X_``.

    Raises
    ------
    ValueError
        If ``to_fit`` is False and no ``dict_vec`` is given.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if not to_fit and dict_vec is None:
        raise ValueError('dict_vec must be provided when to_fit is False')

    required_columns = ['PULocationID', 'DOLocationID']
    # IDs are cast to strings so the vectorizer treats them as categorical values.
    records = data[required_columns].astype('str').to_dict(orient='records')

    if to_fit:
        dict_vec = DictVectorizer()
        X_ = dict_vec.fit_transform(records)
    else:
        X_ = dict_vec.transform(records)
    return X_, dict_vec

In [26]:
# February 2023 trips are the validation set; they are only transformed with the
# existing dict_vec (no refit), so the feature columns match the trained model.
val_df, val_target = load_data(path='yellow_tripdata_2023-02.parquet')
X_val, _ = prep_as_model_input(val_df, dict_vec, to_fit=False)

data read in
duration of trip calculated (in minutes)
Outliers removed. 98.00% data retained


In [27]:
# Validation error: predict on the February features and compare with their durations.
val_preds = model.predict(X_val)
# The metric's signature is (y_true, y_pred); pass the ground truth first.
val_rmse = root_mean_squared_error(val_target, val_preds)
val_rmse

7.80839910042855