# Model Training

## 1.1 Import Data and Required Packages

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

### Import the sampled CSV file

In [6]:
# Load the sampled yellow-taxi trip records.
# Forward slashes keep this relative path portable: the earlier backslash-separated
# literal only resolved on Windows and relied on invalid string escapes
# (backslash followed by '.', 'd', 'Y'), which Python reports with a warning.
dataset = pd.read_csv("../../data/Yellow_Taxi_Trip_Sample.csv")

In [7]:
# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,07/27/2017 06:04:13 PM,07/27/2017 06:42:53 PM,1,10.8,1,N,143,138,1,36.0,1.0,0.5,5.0,5.76,0.3,48.56
1,2,07/27/2017 06:30:35 PM,07/27/2017 06:52:03 PM,6,6.53,1,N,231,162,1,21.0,1.0,0.5,3.42,0.0,0.3,26.22
2,2,07/27/2017 06:01:39 PM,07/27/2017 06:24:32 PM,1,3.11,1,N,100,238,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8
3,2,07/27/2017 06:31:48 PM,07/27/2017 06:43:38 PM,1,0.71,1,N,233,230,1,8.5,1.0,0.5,2.06,0.0,0.3,12.36
4,2,07/27/2017 06:10:56 PM,07/27/2017 06:19:01 PM,1,1.35,1,N,186,249,1,7.5,1.0,0.5,2.79,0.0,0.3,12.09


In [8]:
# Summarize each column's dtype and non-null count.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113118 entries, 0 to 113117
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               113118 non-null  int64  
 1   tpep_pickup_datetime   113118 non-null  object 
 2   tpep_dropoff_datetime  113118 non-null  object 
 3   passenger_count        113118 non-null  int64  
 4   trip_distance          113118 non-null  float64
 5   RatecodeID             113118 non-null  int64  
 6   store_and_fwd_flag     113118 non-null  object 
 7   PULocationID           113118 non-null  int64  
 8   DOLocationID           113118 non-null  int64  
 9   payment_type           113118 non-null  int64  
 10  fare_amount            113118 non-null  float64
 11  extra                  113118 non-null  float64
 12  mta_tax                113118 non-null  float64
 13  tip_amount             113118 non-null  float64
 14  tolls_amount           113118 non-nu

## Data Preparation

- This entails preparing the data before data transformation

This includes:
- Loading the Raw Data - sampled data
- Feature Creation: [Trip Duration, Hour, Day of Week No, Month, Year]
- Dropping the rows that are not usable for modelling (invalid trip duration, trip distance, or fare amount)
- Drop Features: ['extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount','tpep_pickup_datetime', 'tpep_dropoff_datetime']

In [9]:
# (rows, columns) of the loaded sample, before feature creation and filtering.
dataset.shape

(113118, 17)

**Feature Creation**

In [10]:
# Convert the pickup/dropoff timestamp strings to datetime64.
# The raw values look like "07/27/2017 06:04:13 PM". Passing the format explicitly
# avoids pandas' element-by-element dateutil fallback (slow, and it emits a
# "Could not infer format" UserWarning) and fixes the month/day order.
TPEP_DATETIME_FORMAT = "%m/%d/%Y %I:%M:%S %p"
dataset["tpep_pickup_datetime"] = pd.to_datetime(
    dataset["tpep_pickup_datetime"], format=TPEP_DATETIME_FORMAT
)
dataset["tpep_dropoff_datetime"] = pd.to_datetime(
    dataset["tpep_dropoff_datetime"], format=TPEP_DATETIME_FORMAT
)

  dataset["tpep_pickup_datetime"] = pd.to_datetime(dataset["tpep_pickup_datetime"])
  dataset["tpep_dropoff_datetime"] = pd.to_datetime(dataset["tpep_dropoff_datetime"])


In [11]:
# Trip duration in minutes: dropoff time minus pickup time, expressed as
# total seconds and divided by 60.
dataset["trip_duration"] = (
    dataset["tpep_dropoff_datetime"]
    .sub(dataset["tpep_pickup_datetime"])
    .dt.total_seconds()
    .div(60)
)

In [12]:
# Derive calendar features from both timestamps.
# Each (column suffix, .dt attribute) pair is applied to pickup first and then
# dropoff, so the new columns are appended in the order
# pickup_day_no, dropoff_day_no, pickup_hour, dropoff_hour, ... , dropoff_year.
for suffix, dt_attr in (
    ("day_no", "weekday"),
    ("hour", "hour"),
    ("month", "month"),
    ("year", "year"),
):
    for side in ("pickup", "dropoff"):
        dataset[f"{side}_{suffix}"] = getattr(dataset[f"tpep_{side}_datetime"].dt, dt_attr)

**Filtering the dataset**

In [17]:
# Keep only plausible trips: strictly positive duration, distance and fare.
# `> 0` is used for all three checks; the earlier `!= 0` test let negative
# durations (dropoff earlier than pickup) and negative distances through,
# which was inconsistent with the `fare_amount > 0` condition.
dataset_1 = dataset[
    (dataset['trip_duration'] > 0)
    & (dataset['trip_distance'] > 0)
    & (dataset['fare_amount'] > 0)
].reset_index(drop=True)  # renumber rows 0..n-1 after filtering

In [18]:
# Drop the columns that will NOT be used in the analysis or the model:
# the other charge/payment amounts and the raw timestamp columns.
# The result is reassigned instead of using inplace=True, so the cell reads as an
# explicit "new frame from old frame" step and stays chainable.
dataset_1 = dataset_1.drop(
    columns=[
        'extra',
        'mta_tax',
        'tip_amount',
        'tolls_amount',
        'improvement_surcharge',
        'total_amount',
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
    ]
)

In [19]:
# List the columns that remain after the drop.
dataset_1.columns

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type',
       'fare_amount', 'trip_duration', 'pickup_day_no', 'dropoff_day_no',
       'pickup_hour', 'dropoff_hour', 'pickup_month', 'dropoff_month',
       'pickup_year', 'dropoff_year'],
      dtype='object')

In [20]:
# Preview the filtered frame with the engineered time features.
dataset_1.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,trip_duration,pickup_day_no,dropoff_day_no,pickup_hour,dropoff_hour,pickup_month,dropoff_month,pickup_year,dropoff_year
0,2,1,10.8,1,N,143,138,1,36.0,38.666667,3,3,18,18,7,7,2017,2017
1,2,6,6.53,1,N,231,162,1,21.0,21.466667,3,3,18,18,7,7,2017,2017
2,2,1,3.11,1,N,100,238,2,16.0,22.883333,3,3,18,18,7,7,2017,2017
3,2,1,0.71,1,N,233,230,1,8.5,11.833333,3,3,18,18,7,7,2017,2017
4,2,1,1.35,1,N,186,249,1,7.5,8.083333,3,3,18,18,7,7,2017,2017


### Preparing X and Y variables

In [21]:
# Work on a copy so the modelling steps do not modify dataset_1.
df = dataset_1.copy()

In [22]:
# Feature matrix: every column except the target, fare_amount.
X = df.drop(columns="fare_amount")

In [23]:
# Preview the feature columns (fare_amount removed).
X.head()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,trip_duration,pickup_day_no,dropoff_day_no,pickup_hour,dropoff_hour,pickup_month,dropoff_month,pickup_year,dropoff_year
0,2,1,10.8,1,N,143,138,1,38.666667,3,3,18,18,7,7,2017,2017
1,2,6,6.53,1,N,231,162,1,21.466667,3,3,18,18,7,7,2017,2017
2,2,1,3.11,1,N,100,238,2,22.883333,3,3,18,18,7,7,2017,2017
3,2,1,0.71,1,N,233,230,1,11.833333,3,3,18,18,7,7,2017,2017
4,2,1,1.35,1,N,186,249,1,8.083333,3,3,18,18,7,7,2017,2017


In [26]:
# Check feature dtypes; these drive the numeric/categorical split used for preprocessing.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112250 entries, 0 to 112249
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   VendorID            112250 non-null  int64  
 1   passenger_count     112250 non-null  int64  
 2   trip_distance       112250 non-null  float64
 3   RatecodeID          112250 non-null  int64  
 4   store_and_fwd_flag  112250 non-null  object 
 5   PULocationID        112250 non-null  int64  
 6   DOLocationID        112250 non-null  int64  
 7   payment_type        112250 non-null  int64  
 8   trip_duration       112250 non-null  float64
 9   pickup_day_no       112250 non-null  int32  
 10  dropoff_day_no      112250 non-null  int32  
 11  pickup_hour         112250 non-null  int32  
 12  dropoff_hour        112250 non-null  int32  
 13  pickup_month        112250 non-null  int32  
 14  dropoff_month       112250 non-null  int32  
 15  pickup_year         112250 non-nul

In [24]:
# Target variable: the fare amount of each trip.
y = df['fare_amount']

In [25]:
# Display the target Series (pandas truncates the output to the head and tail).
y

0         36.0
1         21.0
2         16.0
3          8.5
4          7.5
          ... 
112245     9.0
112246     5.5
112247    26.0
112248     8.0
112249    18.5
Name: fare_amount, Length: 112250, dtype: float64

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the feature columns by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# Two transformers: one-hot encoding for the categorical columns and
# standard scaling for the numeric columns.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# The encoded categorical columns come first in the output, followed by the
# scaled numeric columns (the order of this list).
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [28]:
# Fit the preprocessor and encode/scale the feature matrix.
# The input is rebuilt from `df` rather than read from `X`, because this cell
# rebinds `X` to a NumPy array: re-running it on that array would fail, since the
# transformer selects columns by name and that requires a DataFrame.
# NOTE(review): the fit uses every row before the train/test split further down,
# so test-set statistics leak into the scaler — consider fitting on the training
# rows only.
X = preprocessor.fit_transform(df.drop(columns=['fare_amount']))

In [29]:
# Inspect the transformed feature array (NumPy abbreviates the printout).
X

array([[ 1.        ,  0.        ,  0.90579638, ...,  0.16838564,
         0.        , -0.00667424],
       [ 1.        ,  0.        ,  0.90579638, ...,  0.16838564,
         0.        , -0.00667424],
       [ 1.        ,  0.        ,  0.90579638, ...,  0.16838564,
         0.        , -0.00667424],
       ...,
       [ 1.        ,  0.        , -1.10400088, ...,  0.16838564,
         0.        , -0.00667424],
       [ 1.        ,  0.        ,  0.90579638, ...,  0.16838564,
         0.        , -0.00667424],
       [ 1.        ,  0.        , -1.10400088, ...,  0.16838564,
         0.        , -0.00667424]])

In [30]:
# (rows, columns) of the feature array after one-hot encoding and scaling.
X.shape

(112250, 18)

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the resulting train/test feature shapes.
X_train.shape, X_test.shape

((89800, 18), (22450, 18))