In [1]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Q1: Number of columns in Jan 2022 Yellow Taxi Trip data

In [2]:
# Read the January 2022 yellow-taxi parquet file with pyarrow and convert the
# resulting table to a pandas DataFrame.
df = pq.read_table(
    source='yellow_tripdata_2022-01.parquet',
).to_pandas()
print(f'Number of columns: {df.shape[1]}')

Number of columns: 19


### Q2: Standard deviation of trip durations in Jan 2022 Yellow Taxi Trip data

In [13]:
# Trip duration as a timedelta (drop-off minus pick-up), then as fractional
# minutes in a separate column.
df['duration'] = (
    df['tpep_dropoff_datetime'].astype('datetime64[ns]')
    - df['tpep_pickup_datetime'].astype('datetime64[ns]')
)
df['duration_mins'] = df['duration'].dt.total_seconds().div(60)

print(f'Std: {df["duration_mins"].std():.2f}')

Std: 46.45


### Q3: Fraction of the records left after dropping the outliers

# Summary statistics of trip duration in minutes, including the 95th, 98th and
# 99th percentiles.
df['duration_mins'].describe(percentiles=[0.95, 0.98, 0.99])

In [4]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_ = df[df['duration_mins'].between(1, 60)]

In [5]:
# Share of rows that survive the 1-60 minute duration filter, as a percentage
# of the unfiltered frame. The filter removes duration outliers (nothing is
# de-duplicated), so the label says "outliers".
print(
    'Percentage remaining after removal of outliers: '
    f'{len(df_) / len(df) * 100: .2f} %'
)

Percentage remaining after removal of duplicates:  98.28 %


### Q4: Dimensionality after one-hot encoding (OHE)

In [6]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the location IDs to strings so DictVectorizer treats them as categories
# rather than numbers. `df_` is a boolean-filtered slice of `df`, so writing
# into it with `.loc[...] = ...` triggers SettingWithCopyWarning and depends on
# in-place dtype upcasting; `assign` builds a fresh frame instead and keeps the
# cell safe to re-run.
df_ = df_.assign(**{col: df_[col].astype(str) for col in categorical})
train_dicts = df_[categorical + numerical].to_dict(orient='records')

target = 'duration_mins'
# Only the width of the encoded matrix is needed here, so the fitted
# vectorizer itself is not kept.
X_train = DictVectorizer().fit_transform(train_dicts)
y_train = df_[target].values

print(f'Number of columns: {X_train.shape[1]}')

Number of columns: 516


### Q5: RMSE on train

In [7]:
# Fit an ordinary least-squares model on the training features and score it on
# the same data.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_train)

# RMSE = sqrt(MSE). The root is taken explicitly because the `squared=False`
# keyword of mean_squared_error is deprecated in recent scikit-learn releases
# and later removed; this form works on old and new versions alike. The label
# says RMSE because that is the quantity actually printed.
print(f'RMSE: {mean_squared_error(y_train, y_pred) ** 0.5:.2f}')

Mean square error: 6.99


### Q6: RMSE on validation

In [8]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file that contains at least the
        columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose trip duration lies between
        1 and 60 minutes (inclusive), with an added ``duration`` column
        (minutes, as float) and both location-ID columns cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries timestamps as text; parse them so they can be subtracted.
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound below and the caller would
        # get an opaque UnboundLocalError.
        raise ValueError(
            f'Unsupported file type for {filename!r}: expected .csv or .parquet'
        )

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [9]:
# January 2022 is loaded as the training frame and February 2022 as the
# validation frame, in that order.
df_train, df_val = map(
    read_dataframe,
    ('yellow_tripdata_2022-01.parquet', 'yellow_tripdata_2022-02.parquet'),
)

In [10]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Turn the selected feature columns of each split into lists of per-row dicts.
train_dicts = df_train[[*categorical, *numerical]].to_dict(orient='records')
val_dicts = df_val[[*categorical, *numerical]].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then apply that same fitted
# mapping to the validation rows.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Regression target: the `duration` column of each split, as NumPy arrays.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [14]:
# Fit on the training split and evaluate on the held-out validation split.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE = sqrt(MSE). The root is taken explicitly because the `squared=False`
# keyword of mean_squared_error is deprecated in recent scikit-learn releases
# and later removed; this form works on old and new versions alike. The label
# says RMSE because that is the quantity actually printed.
print(f'RMSE: {mean_squared_error(y_val, y_pred) ** 0.5:.2f}')

Mean square error: 7.79
