In [1]:
import os
import subprocess
from pathlib import Path
from urllib.parse import urlparse

In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
# Directory layout is anchored on the parent of the notebook's working
# directory (presumably the project root -- confirm for your checkout).
WORK_DIR = Path(os.getcwd()).parent

DATA_DIR = WORK_DIR / 'data'

MODEL_DIR = WORK_DIR / 'models'

DIR_LIST = [
    DATA_DIR,
    MODEL_DIR
]

# parents=True creates any missing intermediate directories; exist_ok=True
# makes the cell safe to re-run. The loop variable is not called `dir` so the
# builtin of that name stays usable in later cells.
for directory in DIR_LIST:
    directory.mkdir(parents=True, exist_ok=True)

In [7]:
# Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
data_uri_list = [
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
    ]

for data_uri in data_uri_list:
    # Local file name is the last path component of the URL.
    data_uri_name = os.path.basename(urlparse(data_uri).path)
    if not (DATA_DIR / data_uri_name).exists():
        # Pass wget an argument list (no shell involved) so URLs or paths that
        # contain spaces or shell metacharacters cannot be misinterpreted, and
        # use check=True so a failed download raises instead of being ignored.
        subprocess.run(['wget', data_uri, '-P', str(DATA_DIR)], check=True)
    else:
        print('file already exist, skipping download...')

file already exist, skipping download...
file already exist, skipping download...


In [8]:
# Load the January 2023 yellow-taxi trip records from the data directory.
df_2023 = pd.read_parquet(DATA_DIR.joinpath('yellow_tripdata_2023-01.parquet'))

In [9]:
# q1 (presumably question 1 of the exercise): (rows, columns) of the January
# 2023 frame as loaded, before any derived column or filtering is applied.
df_2023.shape

(3066766, 19)

In [10]:
# Trip duration in minutes (dropoff minus pickup). The vectorised .dt accessor
# replaces a per-row Python lambda, which is needlessly slow on millions of rows.
df_2023['duration'] = (
    df_2023.tpep_dropoff_datetime - df_2023.tpep_pickup_datetime
).dt.total_seconds() / 60

In [11]:
# q2: standard deviation of the trip duration (in minutes), computed on the
# full frame -- the 1..60 minute filter is only applied in a later cell.
df_2023['duration'].std()

42.59435124195458

In [12]:
# q3: percentage of trips whose duration lies in the closed range [1, 60]
# minutes. Counting the True entries of the mask avoids building the filtered
# frame just to take its length.
(int(df_2023.duration.between(1, 60).sum()) / len(df_2023)) * 100

98.1220282212598

In [13]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_2023 = df_2023[df_2023.duration.between(1, 60)]

In [14]:
# Feature groups: location IDs are treated as categories, distance as a number.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the ID columns to str so they are handled as categories rather than
# numbers. astype with a column->dtype mapping returns a new frame, so nothing
# is assigned into the filtered slice (which would raise SettingWithCopyWarning).
df_2023 = df_2023.astype({col: str for col in categorical})

In [15]:
# Target vector: trip duration in minutes.
target = 'duration'
y_train = df_2023[target].values

# Feature matrix: one dict per row holding the categorical columns, turned
# into a sparse matrix by a freshly fitted DictVectorizer.
dv = DictVectorizer()
train_dicts = df_2023[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [16]:
# q4: display the sparse feature matrix built by the DictVectorizer; its
# repr reports the matrix dimensions (rows x feature columns).
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [17]:
# q5: fit a plain linear regression and report its RMSE on the training data.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE is taken as the square root of the MSE explicitly. The `squared=False`
# keyword was deprecated and later removed from mean_squared_error, so this
# form runs on both old and current scikit-learn releases.
mean_squared_error(y_train, y_pred) ** 0.5



7.649261931816197

In [18]:
def read_dataframe(filename):
    """Load a trip-record file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to a ``.csv`` or ``.parquet`` file that contains at least the
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``duration`` (dropoff minus pickup, in minutes) lies in
        the closed range [1, 60], with both location-ID columns cast to ``str``.

    Raises
    ------
    ValueError
        If the file name ends in neither ``.csv`` nor ``.parquet``.
    """
    # Accept pathlib.Path objects as well as plain strings.
    path = os.fspath(filename)

    if path.endswith('.csv'):
        df = pd.read_csv(path)

        # CSV timestamps arrive as text and must be parsed; the parquet branch
        # is assumed to deliver datetime columns already.
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif path.endswith('.parquet'):
        df = pd.read_parquet(path)
    else:
        # Previously this fell through and failed later with an unbound `df`.
        raise ValueError(
            f'unsupported file type: {path!r} (expected .csv or .parquet)'
        )

    # Vectorised duration in minutes instead of a per-row Python lambda.
    df['duration'] = (
        df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    ).dt.total_seconds() / 60

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [21]:
# January 2023 is the training month, February 2023 the validation month.
df_train, df_val = (
    read_dataframe(str(DATA_DIR / f'yellow_tripdata_2023-{month}.parquet'))
    for month in ('01', '02')
)

In [22]:
# Row-wise dicts of the categorical features for both splits.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse that fitted
# mapping to encode the validation rows.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [24]:
# Target vectors (trip duration in minutes) for the two splits.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [25]:
# Fit on the training month and report the RMSE on the validation month.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE is taken as the square root of the MSE explicitly. The `squared=False`
# keyword was deprecated and later removed from mean_squared_error, so this
# form runs on both old and current scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5



7.8118186871593