In [1]:
import pandas as pd
import numpy as np
from pandas.core.frame import DataFrame
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:

def data_preparation(path: str) -> DataFrame:
  """Load a trip parquet file, derive trip duration and drop outlier trips.

  A ``duration`` column (minutes) is computed as
  ``dropOff_datetime - pickup_datetime``. Only trips lasting at least 1 and
  less than 61 minutes are kept, and every remaining missing value is
  replaced with the sentinel ``-1``.

  Summary statistics are printed as a side effect: the row count of the
  unfiltered frame, the mean duration of the unfiltered frame, the number of
  rows removed by the filter, and the share of kept rows whose
  ``PUlocationID`` was missing.

  Args:
    path: Location of the parquet file, passed straight to ``pd.read_parquet``.

  Returns:
    The filtered frame with the ``duration`` column added and NaNs set to -1.
  """
  df = pd.read_parquet(path)
  # Vectorised timedelta -> minutes conversion (no Python call per row).
  df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

  in_range = (df['duration'] >= 1) & (df['duration'] < 61)
  # fillna returns a new frame, so the loaded frame is not modified here.
  df_prep = df.loc[in_range].fillna(-1)

  total_rows = df.shape[0]
  kept_rows = df_prep.shape[0]
  dropped_records = total_rows - kept_rows
  # After fillna, -1 marks the pickup locations that were originally missing.
  missing_pickups = int((df_prep['PUlocationID'] == -1).sum())
  # Guard against a file in which no trip passes the duration filter.
  mis_fr = missing_pickups / kept_rows if kept_rows else float('nan')
  avg_dur = df['duration'].mean()

  print(f'Shape of original df is {total_rows}')
  print(f'Average trip duration is {avg_dur}')
  print(f'Count of droped records is {dropped_records}')
  print(f'Fractions of missing values is {mis_fr}')
  return df_prep

def data_transformation(df: DataFrame, features: list, target: str, dv: DictVectorizer, flag: str = 'train') -> tuple:
  """Turn a prepared frame into a feature matrix and a target vector.

  The feature columns are cast to ``str`` and converted to a list of
  per-row dicts, which is then passed to the vectorizer. The cast is done
  on a column subset, so the caller's ``df`` is left unmodified.

  Args:
    df: Frame holding both the feature columns and the target column.
    features: Names of the columns to encode.
    target: Name of the column returned as ``y``.
    dv: Vectorizer exposing ``fit_transform`` and ``transform``.
    flag: ``'train'`` fits the vectorizer and transforms; any other value
      only transforms with the vectorizer's existing state.

  Returns:
    A ``(X, y)`` tuple: the vectorizer output and ``df[target].values``.
  """
  y = df[target].values

  # astype on the selected subset yields a new frame; no in-place mutation.
  records = df[features].astype(str).to_dict(orient='records')

  if flag == 'train':
    X = dv.fit_transform(records)
  else:
    X = dv.transform(records)

  return X, y

def fit_predict(X, y, model, flag = 'train'):
  """Optionally fit ``model`` on ``(X, y)`` and print its RMSE on the same data.

  Args:
    X: Feature matrix passed to ``model.fit`` / ``model.predict``.
    y: Target values compared against the predictions.
    model: Estimator exposing ``fit`` and ``predict``.
    flag: ``'train'`` fits the model before scoring; any other value scores
      the model as it is.

  Returns:
    None. The RMSE is printed.
  """
  if flag == 'train':
    # Fit on the argument itself rather than a notebook-level global, so the
    # function works regardless of what the calling cell named its matrix.
    model.fit(X, y)
  RMSE = np.sqrt(mean_squared_error(y, model.predict(X)))
  print(f'RMSE is {RMSE}')


In [3]:
# Input files: one month for fitting, the next month kept for evaluation.
train_path = '/content/fhv_tripdata_2021-01.parquet'
test_path = '/content/fhv_tripdata_2021-02.parquet'

# A single vectorizer/model pair is created here and reused by later cells.
dv, model = DictVectorizer(), LinearRegression()

# Prepare the training month, encode the location IDs, then fit and score.
train_df = data_preparation(path=train_path)
X_train, y_train = data_transformation(
  df=train_df,
  features=['PUlocationID', 'DOlocationID'],
  target='duration',
  dv=dv,
)
fit_predict(X=X_train, y=y_train, model=model)

Shape of original df is 1154112
Average trip duration is 19.1672240937939
Count of droped records is 43239
Fractions of missing values is 0.8351935819846193
RMSE is 10.596034496969224


In [4]:
# Evaluate on the held-out month: reuse the already-fitted vectorizer and
# model (flag='test' skips fitting in both helpers).
test_df = data_preparation(path=test_path)
X_test, y_test = data_transformation(
  df=test_df,
  features=['PUlocationID', 'DOlocationID'],
  target='duration',
  dv=dv,
  flag='test',
)
fit_predict(X=X_test, y=y_test, model=model, flag='test')

Shape of original df is 1037692
Average trip duration is 20.70698622520125
Count of droped records is 46371
Fractions of missing values is 0.8571058214241402
RMSE is 11.09634287004778
