In [1]:
from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np
import urllib.request
import zipfile
import random
import itertools
import math
import os
import datetime
import sys

In [2]:
import shapefile
from shapely.geometry import Polygon
from descartes.patch import PolygonPatch
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split, cross_val_score
import socket
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from bayes_opt import bayesian_optimization

import statsmodels.api as sm
import sklearn.model_selection as cv
from scipy import stats
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## NYC Taxi Data

#### Download the Trip Record Data

In [3]:
'''
for month in range(1,2):
    urllib.request.urlretrieve("https://s3.amazonaws.com/nyc-tlc/trip+data/"+ \
                               "yellow_tripdata_2018-{0:0=2d}.csv".format(month), 
                               "nyc.2018-{0:0=2d}.csv".format(month))
'''

'\nfor month in range(1,2):\n    urllib.request.urlretrieve("https://s3.amazonaws.com/nyc-tlc/trip+data/"+                                "yellow_tripdata_2018-{0:0=2d}.csv".format(month), \n                               "nyc.2018-{0:0=2d}.csv".format(month))\n'

In [4]:
n = 100
nyc_df = pd.read_csv("datasets/nyc.2018-01.csv", parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], nrows = n)
nyc_df = nyc_df.set_index('tpep_pickup_datetime')
nyc_df.describe()


Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,1.42,1.78,2.335,1.01,169.31,163.51,1.48,11.11,0.485,0.49,1.3331,0.0576,0.294,13.7697
std,0.496045,1.33772,2.209747,0.1,73.621156,74.873698,0.55922,8.320165,0.111351,0.1,1.687668,0.576,0.06,9.144411
min,1.0,1.0,0.0,1.0,4.0,4.0,1.0,-3.0,-0.5,-0.5,0.0,0.0,-0.3,-4.3
25%,1.0,1.0,0.7975,1.0,140.75,97.5,1.0,5.5,0.5,0.5,0.0,0.0,0.3,7.45
50%,1.0,1.0,1.65,1.0,164.0,164.0,1.0,8.75,0.5,0.5,1.0,0.0,0.3,11.23
75%,2.0,2.0,3.0,1.0,236.0,234.0,2.0,14.5,0.5,0.5,2.0875,0.0,0.3,17.1525
max,2.0,6.0,10.9,2.0,263.0,264.0,4.0,52.0,0.5,0.5,9.08,5.76,0.3,52.8


#### Remove the rows that don't belong to the choosen time frame

In [5]:
nyc_df = nyc_df.loc['2018-01']
nyc_df.head()


Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-01-01 00:21:05,1,2018-01-01 00:24:23,1,0.5,1,N,41,24,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
2018-01-01 00:44:55,1,2018-01-01 01:03:05,1,2.7,1,N,239,140,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3
2018-01-01 00:08:26,1,2018-01-01 00:14:21,2,0.8,1,N,262,141,1,6.0,0.5,0.5,1.0,0.0,0.3,8.3
2018-01-01 00:20:22,1,2018-01-01 00:52:51,1,10.2,1,N,140,257,2,33.5,0.5,0.5,0.0,0.0,0.3,34.8
2018-01-01 00:09:18,1,2018-01-01 00:27:06,2,2.5,1,N,246,239,1,12.5,0.5,0.5,2.75,0.0,0.3,16.55


#### Check for null values

In [6]:
nyc_df.isnull().sum()

VendorID                 0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
dtype: int64

#### Sort the data based on the date information and reindex it

In [7]:
nyc_df.sort_values(by=['tpep_pickup_datetime'])
nyc_df.reset_index(inplace=True)
nyc_df.head()

Unnamed: 0,tpep_pickup_datetime,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2018-01-01 00:21:05,1,2018-01-01 00:24:23,1,0.5,1,N,41,24,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
1,2018-01-01 00:44:55,1,2018-01-01 01:03:05,1,2.7,1,N,239,140,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3
2,2018-01-01 00:08:26,1,2018-01-01 00:14:21,2,0.8,1,N,262,141,1,6.0,0.5,0.5,1.0,0.0,0.3,8.3
3,2018-01-01 00:20:22,1,2018-01-01 00:52:51,1,10.2,1,N,140,257,2,33.5,0.5,0.5,0.0,0.0,0.3,34.8
4,2018-01-01 00:09:18,1,2018-01-01 00:27:06,2,2.5,1,N,246,239,1,12.5,0.5,0.5,2.75,0.0,0.3,16.55


#### Check for anomalies in the dataset and drop them

In [8]:
rows_to_drop = []

for index, row in nyc_df.iterrows():
    duration = nyc_df.tpep_dropoff_datetime[index] - nyc_df.tpep_pickup_datetime[index]

    if(duration > datetime.timedelta(hours=12)):
        rows_to_drop.append(index)
        
    elif(duration.seconds != 0):
        # Miles per hour, average speed
        speed = (nyc_df.trip_distance[index]/duration.seconds)*60
        
        # Check if a taxi is going faster than they can in real life
        if((speed > 90.0) or (speed < 1.0)):
            rows_to_drop.append(index)
            
nyc_df.drop(nyc_df.index[rows_to_drop])

nyc_df = nyc_df[nyc_df['total_amount'] >= 0]
nyc_df = nyc_df[nyc_df['extra'] >= 0]
nyc_df = nyc_df[nyc_df['mta_tax'] >= 0]
nyc_df = nyc_df[nyc_df['fare_amount'] >= 0]
nyc_df = nyc_df[nyc_df['tolls_amount'] >= 0]
nyc_df = nyc_df[nyc_df['improvement_surcharge'] >= 0]

nyc_df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,1.414141,1.777778,2.357677,1.010101,170.141414,164.282828,1.454545,11.252525,0.494949,0.5,1.346566,0.058182,0.3,13.952222
std,0.49508,1.344342,2.209267,0.100504,73.522454,74.852786,0.500464,8.238899,0.050252,0.0,1.690848,0.578902,5.021429e-16,9.006003
min,1.0,1.0,0.0,1.0,4.0,4.0,1.0,2.5,0.0,0.5,0.0,0.0,0.3,3.8
25%,1.0,1.0,0.8,1.0,141.0,106.5,1.0,5.5,0.5,0.5,0.0,0.0,0.3,7.53
50%,1.0,1.0,1.7,1.0,164.0,164.0,1.0,9.0,0.5,0.5,1.0,0.0,0.3,11.3
75%,2.0,2.0,3.0,1.0,236.0,234.0,2.0,14.5,0.5,0.5,2.125,0.0,0.3,17.155
max,2.0,6.0,10.9,2.0,263.0,264.0,2.0,52.0,0.5,0.5,9.08,5.76,0.3,52.8


## NYC Taxi Zones

In [9]:
'''
# Download the location Data
urllib.request.urlretrieve("https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip", "taxi_zones.zip")
with zipfile.ZipFile("taxi_zones.zip","r") as zip_ref:
    zip_ref.extractall("./datasets/taxi_zones/shape")
'''

'\n# Download the location Data\nurllib.request.urlretrieve("https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip", "taxi_zones.zip")\nwith zipfile.ZipFile("taxi_zones.zip","r") as zip_ref:\n    zip_ref.extractall("./datasets/taxi_zones/shape")\n'

In [10]:
def get_lat_lon(sf):
    content = []
    for sr in sf.shapeRecords():
        shape = sr.shape
        rec = sr.record
        loc_id = rec[shp_dic['LocationID']]
        
        x = (shape.bbox[0]+shape.bbox[2])/2
        y = (shape.bbox[1]+shape.bbox[3])/2
        
        content.append((loc_id, x, y))
    return pd.DataFrame(content, columns=["LocationID", "longitude", "latitude"])

#### Convert shape file to data frame

In [11]:
sf = shapefile.Reader("datasets/taxi_zones/shape/taxi_zones.shp")
fields_name = [field[0] for field in sf.fields[1:]]
shp_dic = dict(zip(fields_name, list(range(len(fields_name)))))
attributes = sf.records()
shp_attr = [dict(zip(fields_name, attr)) for attr in attributes]

loc_df = pd.DataFrame(shp_attr).join(get_lat_lon(sf).set_index("LocationID"), on="LocationID")
loc_df.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,longitude,latitude
0,1,0.116357,0.000782,Newark Airport,1,EWR,936681.7,190522.130278
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,1033536.0,161853.9823
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,1027136.0,254730.010849
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,990424.0,203100.040432
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,932133.2,139954.541936


#### Remove location id's that are outside of the Manhattan

In [12]:
loc_df = loc_df[loc_df.borough == "Manhattan"]
loc_df.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,longitude,latitude
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,990424.01957,203100.040432
11,12,0.036661,4.2e-05,Battery Park,12,Manhattan,979889.680922,195215.44749
12,13,0.050281,0.000149,Battery Park City,13,Manhattan,979844.241304,198508.159318
23,24,0.047,6.1e-05,Bloomingdale,24,Manhattan,993795.402611,231438.438453
40,41,0.052793,0.000143,Central Harlem,41,Manhattan,997895.956595,232264.009901


#### Convert location ids that belong to Manhattan into a list

In [13]:
manhattan_loc_id_list = loc_df["LocationID"].tolist()
print(manhattan_loc_id_list)

[4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 103, 103, 103, 103, 103, 103, 103, 103, 103, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263]


## Intersect NYC Taxi and NYC Taxi Zone Data

#### Remove taxi rides that didn't originate from Manhattan

In [14]:
nyc_df = nyc_df[nyc_df['PULocationID'].isin(manhattan_loc_id_list)]

nyc_df.head()

Unnamed: 0,tpep_pickup_datetime,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2018-01-01 00:21:05,1,2018-01-01 00:24:23,1,0.5,1,N,41,24,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
1,2018-01-01 00:44:55,1,2018-01-01 01:03:05,1,2.7,1,N,239,140,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3
2,2018-01-01 00:08:26,1,2018-01-01 00:14:21,2,0.8,1,N,262,141,1,6.0,0.5,0.5,1.0,0.0,0.3,8.3
3,2018-01-01 00:20:22,1,2018-01-01 00:52:51,1,10.2,1,N,140,257,2,33.5,0.5,0.5,0.0,0.0,0.3,34.8
4,2018-01-01 00:09:18,1,2018-01-01 00:27:06,2,2.5,1,N,246,239,1,12.5,0.5,0.5,2.75,0.0,0.3,16.55


## Weather Data

#### Import the weather dataset

In [15]:
weather_df = pd.read_csv("datasets/weather_data.csv")
weather_df.describe()

Unnamed: 0,year,month,day,temp_high,temp_avg,temp_low,dew_high,dew_avg,dew_low,humidity_high,...,sea_level_press_high,sea_level_press_avg,sea_level_press_low,visibility_high,visibility_avg,visibility_low,wind_high,wind_avg,wind_high.1,precipitation
count,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1089.0,1089.0,1089.0,1091.0,...,1084.0,1084.0,1084.0,1081.0,1081.0,1081.0,1003.0,1003.0,991.0,1001.0
mean,2016.9945,6.538038,15.713107,17.537122,13.804766,9.813016,9.214876,5.996327,2.374656,79.297892,...,1020.748155,1016.976015,1013.330258,15.923219,13.920444,10.00185,21.830508,8.1665,35.651867,3.842567
std,0.816103,3.450192,8.820285,10.044545,9.571911,9.312212,10.035128,10.606601,11.169859,16.005337,...,7.031669,7.425305,7.999756,0.477869,2.986258,6.094776,7.240465,4.012355,10.899861,9.428274
min,2016.0,1.0,1.0,-11.0,-14.0,-18.0,-24.0,-27.0,-28.0,31.0,...,998.0,994.0,979.0,8.0,1.0,0.0,8.0,0.0,11.0,0.0
25%,2016.0,4.0,8.0,9.0,6.0,3.0,2.0,-2.0,-7.0,66.5,...,1016.0,1012.0,1009.0,16.0,13.0,3.0,16.0,5.0,27.0,0.0
50%,2017.0,7.0,16.0,18.0,14.0,9.0,11.0,6.0,3.0,82.0,...,1020.0,1017.0,1013.5,16.0,16.0,13.0,21.0,8.0,34.0,0.0
75%,2018.0,10.0,23.0,27.0,23.0,18.0,18.0,15.0,12.0,93.0,...,1025.0,1022.0,1019.0,16.0,16.0,16.0,26.0,10.0,42.0,2.29
max,2018.0,12.0,31.0,36.0,32.0,27.0,26.0,24.0,22.0,100.0,...,1044.0,1041.0,1036.0,16.0,16.0,16.0,121.0,29.0,77.0,76.96


#### Check the types of the columns

In [16]:
weather_df.dtypes

year                      int64
month                     int64
day                       int64
temp_high                 int64
temp_avg                  int64
temp_low                  int64
dew_high                float64
dew_avg                 float64
dew_low                 float64
humidity_high             int64
humidity_avg              int64
humidity_low              int64
sea_level_press_high    float64
sea_level_press_avg     float64
sea_level_press_low     float64
visibility_high         float64
visibility_avg          float64
visibility_low          float64
wind_high               float64
wind_avg                float64
wind_high.1             float64
precipitation           float64
events                   object
dtype: object

#### Convert events column into multiple columns

In [17]:
weather_df['event_rain'] = 0
weather_df['event_fog'] = 0
weather_df['event_snow'] = 0

for index, row in weather_df.iterrows():
    if(isinstance(weather_df.events[index], str)):
        if("Rain" in weather_df.events[index]):
            weather_df.event_rain[index] = 1

        if("Fog" in weather_df.events[index]):
            weather_df.event_fog[index] = 1

        if("Snow" in weather_df.events[index]):
            weather_df.event_snow[index] = 1

weather_df = weather_df.drop("events", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Add primary key to be used in the NYC taxi data

In [18]:
weather_df.insert(0, "primary_key", "") 

for index, row in weather_df.iterrows():
    key = str(row['year'])
    
    if (row['month'] == 0) or (row['month'] == 1) or (row['month'] == 2) or (row['month'] == 3) or (row['month'] == 4) or (row['month'] == 5) or (row['month'] == 6) or (row['month'] == 7) or (row['month'] == 8) or (row['month'] == 9):
        key = key + "-0" + str(row['month'])
    else:
        key = key + "-" + str(row['month'])
        
    if (row['day'] == 0) or (row['day'] == 1) or (row['day'] == 2) or (row['day'] == 3) or (row['day'] == 4) or (row['day'] == 5) or (row['day'] == 6) or (row['day'] == 7) or (row['day'] == 8) or (row['day'] == 9):
        key = key + "-0" + str(row['day'])
    else:
        key = key + "-" + str(row['day'])
        
    weather_df.primary_key[index] = key


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


#### Sort the data based on the date information and reindex it

In [19]:
weather_df['primary_key'] = pd.to_datetime(weather_df['primary_key'])
weather_df = weather_df.set_index('primary_key')

#weather_df.reset_index(inplace=True)
weather_df.head()

Unnamed: 0_level_0,year,month,day,temp_high,temp_avg,temp_low,dew_high,dew_avg,dew_low,humidity_high,...,visibility_high,visibility_avg,visibility_low,wind_high,wind_avg,wind_high.1,precipitation,event_rain,event_fog,event_snow
primary_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01,2016,1,1,6,3,1,-3.0,-5.0,-9.0,59,...,16.0,16.0,16.0,27.0,12.0,42.0,0.0,0,0,0
2016-01-02,2016,1,2,4,2,0,-7.0,-8.0,-9.0,52,...,16.0,16.0,16.0,24.0,11.0,37.0,0.0,0,0,0
2016-01-03,2016,1,3,7,4,2,-5.0,-6.0,-7.0,56,...,16.0,16.0,16.0,27.0,13.0,42.0,0.0,0,0,0
2016-01-04,2016,1,4,2,-4,-10,-5.0,-13.0,-18.0,59,...,16.0,16.0,16.0,34.0,14.0,50.0,0.0,0,0,0
2016-01-05,2016,1,5,-2,-7,-12,-17.0,-19.0,-21.0,58,...,16.0,16.0,16.0,32.0,8.0,48.0,0.0,0,0,0


#### Check for missing values

In [20]:
weather_df.isnull().sum()

year                      0
month                     0
day                       0
temp_high                 0
temp_avg                  0
temp_low                  0
dew_high                  2
dew_avg                   2
dew_low                   2
humidity_high             0
humidity_avg              0
humidity_low              0
sea_level_press_high      7
sea_level_press_avg       7
sea_level_press_low       7
visibility_high          10
visibility_avg           10
visibility_low           10
wind_high                88
wind_avg                 88
wind_high.1             100
precipitation            90
event_rain                0
event_fog                 0
event_snow                0
dtype: int64

#### Fill rows with missing values using interpolation

In [21]:
weather_df = weather_df.interpolate(method="linear")

#weather_df = weather_df.dropna()
weather_df.isnull().sum()

year                    0
month                   0
day                     0
temp_high               0
temp_avg                0
temp_low                0
dew_high                0
dew_avg                 0
dew_low                 0
humidity_high           0
humidity_avg            0
humidity_low            0
sea_level_press_high    0
sea_level_press_avg     0
sea_level_press_low     0
visibility_high         0
visibility_avg          0
visibility_low          0
wind_high               0
wind_avg                0
wind_high.1             0
precipitation           0
event_rain              0
event_fog               0
event_snow              0
dtype: int64

## Add Weather Data to NYC Taxi Data

In [22]:
nyc_df['rain'] = 0
nyc_df['temperature_avg'] = 0
nyc_df['humidity_avg'] = 0
nyc_df.insert(0, "day", 0) 

for index, row in nyc_df.iterrows():
    string_key = nyc_df.tpep_pickup_datetime[index].strftime("%Y-%m-%d")
    
    row_array = weather_df.loc[string_key]
    
    nyc_df.rain[index] = row_array["precipitation"]
    nyc_df.temperature_avg[index] = row_array["temp_avg"]
    nyc_df.humidity_avg[index] = row_array["humidity_avg"]
    nyc_df.day[index] = int(nyc_df.tpep_pickup_datetime[index].strftime("%d"))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Check for missing values

In [23]:
nyc_df.isnull().sum()

day                      0
tpep_pickup_datetime     0
VendorID                 0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
rain                     0
temperature_avg          0
humidity_avg             0
dtype: int64

In [24]:
#nyc_df.tpep_pickup_datetime.count()
#nyc_df.tpep_pickup_datetime[0].strftime("%Y-%m-%d")
#weather_df.primary_key[0].strftime("%Y-%m-%d")
#weather_df.dtypes
#weather_df.loc["2016-01-10"]

nyc_df.head()

Unnamed: 0,day,tpep_pickup_datetime,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,rain,temperature_avg,humidity_avg
0,1,2018-01-01 00:21:05,1,2018-01-01 00:24:23,1,0.5,1,N,41,24,...,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0,-11,50
1,1,2018-01-01 00:44:55,1,2018-01-01 01:03:05,1,2.7,1,N,239,140,...,14.0,0.5,0.5,0.0,0.0,0.3,15.3,0,-11,50
2,1,2018-01-01 00:08:26,1,2018-01-01 00:14:21,2,0.8,1,N,262,141,...,6.0,0.5,0.5,1.0,0.0,0.3,8.3,0,-11,50
3,1,2018-01-01 00:20:22,1,2018-01-01 00:52:51,1,10.2,1,N,140,257,...,33.5,0.5,0.5,0.0,0.0,0.3,34.8,0,-11,50
4,1,2018-01-01 00:09:18,1,2018-01-01 00:27:06,2,2.5,1,N,246,239,...,12.5,0.5,0.5,2.75,0.0,0.3,16.55,0,-11,50


## Produce the target data for training

#### Calculate the frequency values of locations based on days

In [25]:
temp_df = nyc_df[['day', 'PULocationID']].copy()
temp_df = temp_df.drop_duplicates(subset=['day', 'PULocationID'], keep='first')
temp_df['freq'] = 0

# Iterate over the unique location and day information
for index, row in temp_df.iterrows():
    day = temp_df.day[index]
    location_id = temp_df.PULocationID[index]
    
    # Get rows from NYC taxi data with matching days
    day_temp_df = nyc_df.loc[nyc_df['day'] == day]
    
    # Count the number of rows with matching PULocationID within the matching days
    count = len(day_temp_df.loc[day_temp_df['PULocationID'] == location_id])

    temp_df.freq[index] = count

temp_df.head()

Unnamed: 0,day,PULocationID,freq
0,1,41,1
1,1,239,4
2,1,262,2
3,1,140,1
4,1,246,1


#### Map frequency data to NYC taxi data to match the number of rows

In [26]:
target_df = pd.DataFrame()
target_df['freq'] = 0

# Iterate over the NYC taxi data
for index, row in nyc_df.iterrows():
    day = nyc_df.day[index]
    location_id = nyc_df.PULocationID[index]
    
    day_temp_df = temp_df.loc[temp_df['day'] == day]
    location_temp = day_temp_df.loc[day_temp_df['PULocationID'] == location_id]
        
    target_df.loc[index] = location_temp.freq.values

target_df.head()

Unnamed: 0,freq
0,1
1,4
2,2
3,1
4,1


## Remove unused features before the training 

In [27]:
nyc_df = nyc_df.drop(columns=['tpep_pickup_datetime',
                              'tpep_dropoff_datetime',
                              'DOLocationID',
                              'VendorID',
                              'RatecodeID',
                              'store_and_fwd_flag',
                              'payment_type',
                              'passenger_count',
                              'fare_amount',
                              'total_amount',
                              'trip_distance',
                              'extra',
                              'mta_tax',
                              'tip_amount',
                              'tolls_amount',
                              'improvement_surcharge'])


In [28]:
nyc_df.dtypes

day                int64
PULocationID       int64
rain               int64
temperature_avg    int64
humidity_avg       int64
dtype: object

In [29]:
target_df.dtypes

freq    int64
dtype: object

In [30]:
#target=nycmodel[['count']]
#data=nycmodel[[col for col in nycmodel.columns if col not in ['count']]]

x_train, x_test, y_train, y_test = cv.train_test_split(nyc_df, target_df, test_size=2.0/10, random_state=5)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(75, 5)
(19, 5)
(75, 1)
(19, 1)


## Linear Regression

In [31]:
reg = LinearRegression().fit(x_train, y_train)

y_predictedValue = reg.predict(x_train)  
rmse = np.sqrt(mean_squared_error(y_train, y_predictedValue))
r2 = reg.score(x_train, y_train)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

The model performance for training set
--------------------------------------
RMSE is 1.5257040791399228
R2 score is 0.2433998167565512


