## **MODELLING TABULAR DATA - + - TEST/TRAIN/SPLIT - + -**

In [1]:
# Mount Google Drive at /content/drive so the data files stored there can be
# read from this notebook. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd "/content/drive/My Drive/NYC Taxi"

/content/drive/My Drive/NYC Taxi


In [3]:
ls

'Abhijith_NYC_Taxi(3).ipynb'   tabular_data_jan.parquet          yellow_tripdata_2023-02.parquet
 Abhijith_NYC_Taxi.ipynb       tabular_data_jun.parquet          yellow_tripdata_2023-03.parquet
 NYC.ipynb                     tabular_data_mar.parquet          yellow_tripdata_2023-04.parquet
 tabular_data_apr.parquet      tabular_data_may.parquet          yellow_tripdata_2023-05.parquet
 tabular_data_feb.parquet      ts_data_final.parquet             yellow_tripdata_2023-06.parquet
 tabular_data_final.parquet    yellow_tripdata_2023-01.parquet


In [4]:
from datetime import datetime
from typing import Tuple
import numpy as np
import pandas as pd

In [5]:
# Load the feature table from the current working directory.
# Per the preview below, each row holds lagged hourly ride counts
# (rides_previous_648_hour ... rides_previous_1_hour), plus pickup_hour,
# pickup_location_id and the target column target_rides_next_hour.
df = pd.read_parquet('tabular_data_final.parquet')
df.head()

Unnamed: 0,rides_previous_648_hour,rides_previous_647_hour,rides_previous_646_hour,rides_previous_645_hour,rides_previous_644_hour,rides_previous_643_hour,rides_previous_642_hour,rides_previous_641_hour,rides_previous_640_hour,rides_previous_639_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,19.0,28.0,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,...,3.0,4.0,5.0,3.0,8.0,18.0,22.0,2023-01-28,4,58.0
1,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,0.0,...,5.0,5.0,7.0,5.0,8.0,15.0,26.0,2023-01-29,4,53.0
2,2.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,2.0,3.0,...,3.0,1.0,5.0,2.0,1.0,2.0,0.0,2023-01-30,4,2.0
3,3.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0,2.0,...,2.0,1.0,1.0,1.0,0.0,0.0,2.0,2023-01-31,4,0.0
4,3.0,16.0,28.0,21.0,12.0,5.0,4.0,2.0,2.0,3.0,...,4.0,2.0,3.0,0.0,5.0,1.0,2.0,2023-01-28,7,1.0


In [6]:
def train_test_split(
    df: pd.DataFrame,
    cutoff_date: datetime,
    target_column_name: str,
    time_column_name: str = 'pickup_hour',
    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Chronologically split a feature table into train and test sets.

    Rows whose timestamp is strictly before `cutoff_date` form the training
    set; rows at or after `cutoff_date` form the test set. Rows with a missing
    timestamp (NaT) satisfy neither comparison and are left out of both sets.
    Both sets get a fresh 0..n-1 index.

    Note: this is a time-based split, not the random split performed by
    `sklearn.model_selection.train_test_split`, despite the shared name.

    Args:
        df: table containing the features, the timestamp column and the target.
        cutoff_date: first timestamp that belongs to the test set.
        target_column_name: name of the column to predict; it is removed from
            the returned feature frames.
        time_column_name: name of the timestamp column compared against
            `cutoff_date`. Defaults to 'pickup_hour'.

    Returns:
        (X_train, y_train, X_test, y_test). The feature frames keep every
        column except the target, including the timestamp column.

    Raises:
        KeyError: if `time_column_name` or `target_column_name` is not a
            column of `df`.
    """
    timestamps = df[time_column_name]

    # Two explicit comparisons (rather than a mask and its negation) so that
    # NaT timestamps are excluded from both sides.
    train_data = df[timestamps < cutoff_date].reset_index(drop=True)
    test_data = df[timestamps >= cutoff_date].reset_index(drop=True)

    X_train = train_data.drop(columns=[target_column_name])
    y_train = train_data[target_column_name]
    X_test = test_data.drop(columns=[target_column_name])
    y_test = test_data[target_column_name]

    return X_train, y_train, X_test, y_test

In [7]:
# Chronological split: rows with pickup_hour before 2023-04-01 00:00:00 go to
# the training set, rows at or after it go to the test set.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2023, 4, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

# Sanity check on the resulting sizes (the `=` specifier prints "name=value").
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(2330, 650)
y_train.shape=(2330,)
X_test.shape=(2592, 650)
y_test.shape=(2592,)


In [8]:
# Baseline model (BaselineModelPreviousHour): predicts the next hour's rides for each row as the ride count observed in the most recent hour (rides_previous_1_hour)

class BaselineModelPreviousHour:
    """
    Naive baseline: prediction = actual demand observed in the last hour.

    Exposes `fit` / `predict` so it can be swapped with other models in the
    notebook, but it has no parameters and learns nothing from the data.
    """
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: the baseline has nothing to learn. Both arguments are ignored."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the `rides_previous_1_hour` column of `X_test` as the prediction.

        Args:
            X_test: feature frame that must contain a `rides_previous_1_hour`
                column.

        Returns:
            The `rides_previous_1_hour` column as a pandas Series (same index
            and name as in `X_test`), one value per row.

        Raises:
            KeyError: if `X_test` has no `rides_previous_1_hour` column.
        """
        return X_test['rides_previous_1_hour']

In [9]:
# The baseline's fit() is a no-op, so predict() can be called directly
# without fitting first.
model = BaselineModelPreviousHour()
prediction = model.predict(X_test)

In [10]:
prediction

0        7.0
1       16.0
2       19.0
3        4.0
4        3.0
        ... 
2587     0.0
2588     0.0
2589     0.0
2590     0.0
2591     0.0
Name: rides_previous_1_hour, Length: 2592, dtype: float32

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

In [14]:
# Mean absolute error of the previous-hour baseline on the test split.
test_score = mean_absolute_error(y_test, prediction)
print(test_score)

6.822145


In [15]:
# Use only the lagged ride-count columns, since linear regression doesn't
# ingest the date/time column (pickup_hour). Note that the 'rides_' prefix
# filter also leaves out pickup_location_id.

past_rides_columns = [c for c in X_train.columns if c.startswith('rides_')]
X_train_only_numeric = X_train[past_rides_columns]

In [16]:
# Model training: fit a linear regression on the lagged ride counts.
# NOTE: this rebinds `model`, which previously held the baseline model.

model = LinearRegression()
model.fit(X_train_only_numeric, y_train)

In [17]:
# Apply the same column selection to X_test so the test features match the
# columns the model was trained on, then predict.

X_test_only_numeric = X_test[past_rides_columns]
predictions = model.predict(X_test_only_numeric)
predictions

array([-1.8068826e-01,  3.9359085e+01,  3.1586409e+01, ...,
       -3.5371780e-02, -3.5371780e-02, -3.5371780e-02], dtype=float32)

In [18]:
# Mean absolute error of the linear regression on the test split.
# NOTE: this overwrites `test_score`, which held the baseline's MAE.
test_score = mean_absolute_error(y_test, predictions)
print('test_score of linear regression model:', test_score)

test_score of linear regression model: 5.9234376


- **This linear regression model has a test MAE of 5.92 (versus 6.82 for the previous-hour baseline); the models I plan to test next will be benchmarked against it.**