In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Source location and naming scheme of the monthly trip files.
base_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'
# The dataset prefix has its own name so that `data` only ever refers to the
# combined DataFrame built from `dfs`; re-running this cell therefore no
# longer overwrites that frame with a string.
dataset = 'yellow_tripdata'
year = 2023
months = [1,2]
extension = '.parquet'

# One DataFrame per requested month, in the order given by `months`.
dfs = []
for month in months:
    url = f'{base_url}{dataset}_{year}-{month:02d}{extension}'
    dfs.append(pd.read_parquet(url))


In [3]:
# Stack the monthly frames row-wise under a single fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

In [8]:
# Earliest and latest pick-up timestamps in the combined frame.
(data['tpep_pickup_datetime'].min(), data['tpep_pickup_datetime'].max())

(Timestamp('2008-12-31 23:01:42'), Timestamp('2023-03-07 13:01:28'))

In [7]:
# Earliest and latest drop-off timestamps in the combined frame.
(data['tpep_dropoff_datetime'].min(), data['tpep_dropoff_datetime'].max())

(Timestamp('2008-12-31 23:31:24'), Timestamp('2023-03-07 13:11:17'))

In [9]:
# Print the frame summary: index range, column names and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5980721 entries, 0 to 5980720
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [11]:
# Standard deviation of trip duration in minutes, computed directly from the
# drop-off and pick-up timestamp columns.
(
    (data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime'])
    .dt.total_seconds()
    / 60
).std()

In [None]:
# Fraction of trips whose duration lies between 1 and 60 minutes
# (both bounds inclusive).
(
    ((data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']).dt.total_seconds() / 60)
    .between(1, 60)
    .mean()
)

In [None]:
def load_data(month, year=2023):
    """Download one month of yellow-taxi trips and return the modelling columns.

    Reads the monthly parquet file, derives ``duration`` (minutes between
    drop-off and pick-up), keeps only trips lasting between 1 and 60 minutes
    inclusive, and casts the pick-up / drop-off location IDs to strings.

    Parameters
    ----------
    month : int
        Month number; zero-padded to two digits when building the file name.
    year : int, default 2023
        Year used in the file name.

    Returns
    -------
    pandas.DataFrame
        The columns ``PULocationID`` and ``DOLocationID`` (as strings) and
        ``duration``, restricted to the rows that pass the duration filter.
    """
    base_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    dataset = 'yellow_tripdata'
    extension = '.parquet'
    url = f'{base_url}{dataset}_{year}-{month:02d}{extension}'

    categorical = ['PULocationID', 'DOLocationID']
    return (
        pd.read_parquet(url)
        .assign(
            duration=lambda df: (
                df.tpep_dropoff_datetime - df.tpep_pickup_datetime
            ).dt.total_seconds() / 60,
        )
        .query('(duration>=1)&(duration<=60)')
        # Cast inside the chain: astype returns a new frame, so nothing is
        # assigned onto the filtered result (which can raise
        # SettingWithCopyWarning on pandas without Copy-on-Write).
        .astype({col: str for col in categorical})
        .filter(categorical + ['duration'])
    )


In [None]:
# Feature columns passed to the vectorizer, and the regression target.
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

df_train = load_data(month=1)

# Turn each row into a {column: value} dict and fit the vectorizer on them.
dv = DictVectorizer()
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
df_val = load_data(month=2)

# Encode the validation rows with the already-fitted vectorizer
# (transform only, no refit).
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_val = df_val[target].values
y_pred = lr.predict(X_val)

# RMSE on the validation data. The square root is taken explicitly because
# the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5