# New York City Taxi Trip Duration Prediction

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import drive
# Mount Google Drive at /content/Drive so files stored there can be read by path.
# NOTE(review): google.colab is only importable inside Colab — running this
# notebook elsewhere needs a different way to locate the data.
drive.mount('/content/Drive')

Mounted at /content/Drive


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training CSV from the mounted Drive folder.
# NOTE(review): the absolute Colab path is hard-coded; a configurable data
# directory would make the notebook portable.
df= pd.read_csv('/content/Drive/MyDrive/dibimbing/Special Class 6/HW_data/train.csv')
# Quick schema check: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [None]:
# Parse the pickup/dropoff timestamp columns into datetime values so that the
# `.dt` accessor can be used on them afterwards.
df = df.assign(
    pickup_datetime=pd.to_datetime(df['pickup_datetime']),
    dropoff_datetime=pd.to_datetime(df['dropoff_datetime']),
)

In [None]:
# Encode the Y/N flag as 1/0 so it can be used as a numeric column.
# NOTE(review): Series.map yields NaN for any value that is not a key of the
# dict, so re-running this cell on the already-encoded column blanks it out.
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'Y': 1, 'N': 0})

In [None]:
# Derive calendar features from the pickup timestamp. The new columns are
# appended in this order; `date` holds the day of the month.
df = df.assign(
    year=df['pickup_datetime'].dt.year,
    month=df['pickup_datetime'].dt.month,
    date=df['pickup_datetime'].dt.day,
    hour=df['pickup_datetime'].dt.hour,
)

In [None]:
# Drop every row that contains at least one missing value.
df = df.dropna()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [None]:
# Remove rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): `id` is still a column at this point; presumably ids are unique,
# in which case this removes nothing — verify.
df = df.drop_duplicates()

In [None]:
# Discard the identifier, the passenger count and the raw timestamp columns;
# every remaining column is carried into the following steps.
df = df.drop(columns=['id', 'passenger_count', 'pickup_datetime', 'dropoff_datetime'])

In [None]:
def remove_outlier(data, column, factor=1.5):
  """Drop rows that fall outside the IQR fences of the given columns.

  Columns are processed one after another; each column's quartiles are
  computed on the rows that survived the previous columns. For every column
  the lower/upper fence is printed.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to filter. It is not modified; a filtered frame is returned.
  column : str or iterable of str
      Column name, or names, to filter on.
  factor : float, default 1.5
      Multiple of the inter-quartile range used for the fences.

  Returns
  -------
  pandas.DataFrame
      Rows of `data` whose values lie within
      [q25 - factor * IQR, q75 + factor * IQR] for every listed column.
  """
  # A bare column name would otherwise be iterated character by character.
  names = [column] if isinstance(column, str) else column

  for name in names:
    q25, q75 = np.percentile(data[name], [25, 75])
    iqr = q75 - q25

    # Named lower/upper so the builtins min/max are not shadowed.
    lower = q25 - (factor * iqr)
    upper = q75 + (factor * iqr)

    print('min: ', lower, 'max: ', upper)
    data = data[(data[name] >= lower) & (data[name] <= upper)]

  return data
  

In [None]:
# Work on a copy so the unfiltered frame `df` stays available.
df_outlier = df.copy()

# All remaining columns are filtered, including the target `trip_duration`.
column = df_outlier.columns
# Two passes over every column: the quartiles are recomputed on the already
# filtered rows, so the second pass uses tighter fences than the first.
df_outlier = remove_outlier(df_outlier, column)
df_outlier = remove_outlier(df_outlier, column)
# Final pass on the target only.
df_outlier = remove_outlier(df_outlier, ['trip_duration'])
# NOTE(review): the recorded output shows degenerate fences (0.0/0.0 and
# 2016.0/2016.0) for two columns, i.e. those columns are constant afterwards.
# The filter also runs on the full data before any train/test split — confirm
# this is intended.

min:  -0.5 max:  3.5
min:  -74.02867126464847 max:  -73.93052673339841
min:  40.69478416442871 max:  40.81076622009277
min:  -74.031967163086 max:  -73.92424011230463
min:  40.69005203247069 max:  40.81709671020509
min:  0.0 max:  0.0
min:  -454.0 max:  1746.0
min:  2016.0 max:  2016.0
min:  -2.5 max:  9.5
min:  -14.5 max:  45.5
min:  -6.0 max:  34.0
min:  -0.5 max:  3.5
min:  -74.02462005615237 max:  -73.93886566162107
min:  40.69740104675293 max:  40.80920219421387
min:  -74.0266914367676 max:  -73.93425369262692
min:  40.69370079040526 max:  40.8142604827881
min:  0.0 max:  0.0
min:  -406.5 max:  1645.5
min:  2016.0 max:  2016.0
min:  -2.5 max:  9.5
min:  -14.5 max:  45.5
min:  -6.0 max:  34.0
min:  -395.0 max:  1621.0


In [None]:
X = df_outlier.drop(['trip_duration'], axis=1)
y = df_outlier[['trip_duration']]

# Split BEFORE scaling: fitting the scaler on all rows would leak the test
# set's mean/variance into the training features. The row partition is the
# same as before because it depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Learn the scaling statistics from the training rows only, then reuse them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for the cross-validation
# cell that uses it; it is now scaled with the train-only statistics.
X_scale_stand = scaler.transform(X)

# Estimators expect a 1-D target; y_test is left as the single-column frame.
y_train = y_train.to_numpy().ravel()

In [None]:
# Candidate regressors passed to the evaluation helper. The tree is seeded so
# repeated runs give the same metrics, matching the seed used by the tuned
# models further down.
list_model = [
    DecisionTreeRegressor(random_state=42)
    ]

In [None]:
def modeling_scaled(X_train, X_test, y_train, y_test, list_model):
  """Fit each model and print its train/test error metrics.

  Every estimator in `list_model` is fitted on the training split and then
  scored on both splits with RMSE, MAE and MAPE. The table has one row per
  (model, split) pair, with the 'Train' row before the 'Test' row.

  Parameters
  ----------
  X_train, X_test : array-like
      Feature matrices for the two splits.
  y_train, y_test : array-like
      Matching targets.
  list_model : iterable
      Estimators exposing `fit` and `predict`; they are fitted in place.

  Returns
  -------
  None
      The metrics table is printed, not returned.
  """
  columns = ['Dataset', 'Method', 'RMSE', 'MAE', 'MAPE']
  splits = (('Train', X_train, y_train), ('Test', X_test, y_test))

  # Rows are collected in a list and turned into a frame once:
  # DataFrame.append is deprecated and no longer exists in pandas 2.x.
  records = []

  for model in list_model:
      model.fit(X_train, y_train)
      method = type(model).__name__

      for dataset, features, target in splits:
          y_pred = model.predict(features)
          records.append({
              'Dataset': dataset,
              'Method': method,
              'RMSE': np.sqrt(mean_squared_error(target, y_pred)),
              'MAE': mean_absolute_error(target, y_pred),
              'MAPE': mean_absolute_percentage_error(target, y_pred),
          })

  # Passing `columns` keeps the header even when list_model is empty.
  result = pd.DataFrame(records, columns=columns)
  print(result)

In [None]:
# Fit the candidate models and print their train/test metrics.
# NOTE(review): in the recorded output the train RMSE (~1.1) is far below the
# test RMSE (~282), which suggests the unconstrained tree is overfitting.
modeling_scaled(X_train, X_test, y_train, y_test, list_model)

  result = result.append({'Dataset': 'Train', 'Method': method, 'RMSE' : RMSE, 'MAE': MAE, 'MAPE' : MAPE}, ignore_index=True)


  Dataset                 Method        RMSE         MAE      MAPE
0   Train  DecisionTreeRegressor    1.096226    0.005819  0.000015
1    Test  DecisionTreeRegressor  282.064256  205.586620  0.471421


  result = result.append({'Dataset': 'Test','Method': method, 'RMSE' : RMSE, 'MAE': MAE, 'MAPE' : MAPE}, ignore_index=True)


In [None]:
from sklearn.metrics import make_scorer

# Pass an unfitted, seeded estimator: cross_val_score fits a fresh clone on
# every fold, so fitting on X_train beforehand was wasted work.
model = DecisionTreeRegressor(random_state=42)

# 8-fold CV over the full scaled data; the scorer returns negated MSE.
mse_scores = cross_val_score(model, X_scale_stand, y, cv=8, scoring='neg_mean_squared_error')

# Flip the sign back and take the root to get one RMSE per fold.
rmse_scores = np.sqrt(-mse_scores)
print(f'Mean RMSE: {rmse_scores.mean():.3f}')
print(f'Standard deviation of RMSE: {rmse_scores.std():.3f}')

Mean RMSE: 280.804
Standard deviation of RMSE: 0.697


In [None]:
# max_features cannot usefully exceed the number of feature columns. Depending
# on the scikit-learn version, larger values are presumably either rejected
# (failed fits scored as NaN) or add nothing beyond "all features" — verify.
# Capping at n_features and de-duplicating avoids those wasted fits.
n_features = X_train.shape[1]

param_grid = [
    {
    'max_features': sorted({min(k, n_features) for k in [1, 5, 10, 15, 20, 30]}),
    'max_depth': [1, 5, 10, 15, 20, 40, None]
     }
]

model = DecisionTreeRegressor(random_state = 42)

# Exhaustive search with 10-fold CV on the training split only, ranked by
# negated MSE (higher is better).
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Last expression of the cell: displays the refitted best estimator.
grid_search.best_estimator_

In [None]:
# Refit a tree with fixed hyper-parameters.
# NOTE(review): max_features=10 / max_depth=15 are hard-coded, presumably copied
# from the grid-search output — confirm, or read grid_search.best_params_.
model = DecisionTreeRegressor(max_features=10, max_depth=15, random_state=42).fit(X_train,y_train)

y_pred = model.predict(X_train)

method = type(model).__name__

# Error metrics on the training split.
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
MAE = mean_absolute_error(y_train, y_pred)
MAPE = mean_absolute_percentage_error(y_train, y_pred)

# Build the table in one step: DataFrame.append is deprecated (it emitted the
# FutureWarning seen in the output) and no longer exists in pandas 2.x.
result = pd.DataFrame(
    [{'Dataset': 'Train', 'Method': method, 'RMSE' : RMSE, 'MAE': MAE, 'MAPE' : MAPE}],
    columns = ['Dataset', 'Method','RMSE', 'MAE', 'MAPE'],
)

  result = result.append({'Dataset': 'Train', 'Method': method, 'RMSE' : RMSE, 'MAE': MAE, 'MAPE' : MAPE}, ignore_index=True)
