In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

# Seaborn is a library for making statistical graphics in Python.
import seaborn as sns
# A single call is enough: every sns.set() re-applies the whole theme, so the
# former sns.set(style="white") was immediately overridden by this one.
# NOTE(review): sns.set() also resets matplotlib font rc settings, so the
# earlier plt.rc("font", size=14) is probably overridden as well -- confirm.
sns.set(style="whitegrid", color_codes=True)

from datetime import timedelta
import datetime as dt
from datetime import datetime
import calendar

from collections import Counter

In [2]:
# Load only the first 2,000,000 rows of the training CSV.
train = pd.read_csv(
    'train.csv',
    sep=',',
    nrows=2000000,
)

In [3]:
# Show the column names of the freshly loaded training frame.
train.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [4]:
# Load the test CSV in full (no row limit).
test = pd.read_csv(
    'test.csv',
    sep=',',
)

In [5]:
# Summary statistics of the raw sample. The output below shows the problems the
# later cleaning steps address: coordinates in the thousands, negative fares,
# a passenger_count of 208, and fewer non-null dropoff coordinates (1,999,986)
# than rows (2,000,000).
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,2000000.0,2000000.0,2000000.0,1999986.0,1999986.0,2000000.0
mean,11.34779,-72.52321,39.92963,-72.52395,39.92808,1.684113
std,9.852883,12.86804,7.983352,12.77497,10.32382,1.314982
min,-62.0,-3377.681,-3458.665,-3383.297,-3461.541,0.0
25%,6.0,-73.99208,40.73491,-73.99141,40.734,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75312,1.0
75%,12.5,-73.96713,40.7671,-73.96369,40.76809,2.0
max,1273.31,2856.442,2621.628,3414.307,3345.917,208.0


In [6]:
# Rectangular region of interest on the longitude/latitude scale, stored as
# (min_longitude, max_longitude, min_latitude, max_latitude).
BB = (
    -74.5,  # minimum longitude
    -72.8,  # maximum longitude
    40.5,   # minimum latitude
    41.8,   # maximum latitude
)

In [7]:
def select_bdry(df, BB):
    """Return a boolean mask of the rows whose pickup and dropoff lie inside BB.

    Parameters
    ----------
    df : DataFrame with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.
    BB : sequence ``(min_lon, max_lon, min_lat, max_lat)``; all four bounds
        are inclusive.

    Returns
    -------
    Boolean Series aligned with ``df``: True where all four coordinates are
    inside the box. A missing coordinate compares as False.
    """
    lon_min, lon_max, lat_min, lat_max = BB[0], BB[1], BB[2], BB[3]

    # Series.between is inclusive on both ends, i.e. (s >= lo) & (s <= hi).
    pickup_inside = (df['pickup_longitude'].between(lon_min, lon_max)
                     & df['pickup_latitude'].between(lat_min, lat_max))
    dropoff_inside = (df['dropoff_longitude'].between(lon_min, lon_max)
                      & df['dropoff_latitude'].between(lat_min, lat_max))
    return pickup_inside & dropoff_inside

In [8]:
# Keep only the trips whose pickup and dropoff both lie inside the bounding box.
train = train.loc[select_bdry(train, BB)]

In [12]:
def remove_datapoints_from_water(df, BB=(-74.5, -72.8, 40.5, 41.8),
                                 mask_path='nyc_mask-74.5_-72.8_40.5_41.8.png',
                                 nyc_mask=None):
    """Keep only the rows whose pickup AND dropoff points fall on land.

    Each longitude/latitude pair is mapped linearly onto a pixel of a land
    mask covering the bounding box, and a row is kept only when both of its
    pixels are True in that mask.

    Parameters
    ----------
    df : DataFrame with ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns. Rows should
        already be restricted to ``BB`` and free of missing coordinates:
        missing values cannot be converted to integer pixel indices, and
        coordinates outside the box are clamped to the nearest edge pixel
        rather than rejected.
    BB : ``(min_lon, max_lon, min_lat, max_lat)`` region that the mask covers.
        It must match the mask in use.
    mask_path : image file read with ``plt.imread`` when ``nyc_mask`` is not
        given; pixels whose first channel is > 0.9 count as land.
    nyc_mask : optional ready-made 2-D boolean array (rows = latitude from
        max to min, columns = longitude from min to max). When given, no file
        is read.

    Returns
    -------
    DataFrame with the subset of rows of ``df`` located on land.
    """
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        # Column index grows with longitude; row index grows as latitude
        # decreases (row 0 is the maximum latitude).
        x = (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int')
        y = (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')
        # A point exactly on the max-longitude or min-latitude edge maps to
        # dx / dy, one past the last pixel, which used to raise IndexError.
        # Clamp so that edge points fall into the outermost pixel instead.
        return x.clip(0, dx - 1), y.clip(0, dy - 1)

    # read nyc mask and turn it into a boolean map (True = land)
    if nyc_mask is None:
        nyc_mask = plt.imread(mask_path)[:,:,0] > 0.9

    height, width = nyc_mask.shape[0], nyc_mask.shape[1]

    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude,
                                      width, height, BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude,
                                        width, height, BB)

    # boolean index: both endpoints of the trip must be on land
    idx = (nyc_mask[pickup_y.values, pickup_x.values]
           & nyc_mask[dropoff_y.values, dropoff_x.values])

    # return only datapoints on land
    return df[idx]

In [13]:
# Sanity check after the bounding-box filter: only rows were removed, so the
# column set is the same as right after loading.
train.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [14]:
# Drop the trips whose pickup or dropoff location falls on water in the NYC mask.
train = remove_datapoints_from_water(train)

In [15]:
def clean_data(df):
    """Return a new DataFrame with incomplete or implausible trips removed.

    A row is kept only when it
      * has no missing value in any column,
      * has pickup and dropoff inside the box (-74.5, -72.8, 40.5, 41.8),
      * passes ``remove_datapoints_from_water`` (both endpoints on land),
      * has ``0 < fare_amount <= 200``, and
      * has ``0 <= passenger_count <= 6``.
    """
    bounding_box = [-74.5, -72.8, 40.5, 41.8]

    # discard rows with a missing value in any column
    complete = df.dropna(axis='rows', how='any')

    # restrict to coordinates inside the bounding box
    in_area = complete[select_bdry(complete, bounding_box)]

    # discard trips that start or end on the sea
    on_land = remove_datapoints_from_water(in_area)

    # fare in (0, 200] and passenger count in [0, 6], applied as one mask
    fare = on_land['fare_amount']
    passengers = on_land['passenger_count']
    plausible = (fare > 0) & (fare <= 200) & (passengers >= 0) & (passengers <= 6)
    return on_land[plausible]

In [16]:
# Summary after the bounding-box and water filters. Coordinates are now inside
# the box, but the output below still shows a negative minimum fare (-62), a
# maximum fare of 500 and a passenger_count of 9 -- clean_data removes those.
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,1957600.0,1957600.0,1957600.0,1957600.0,1957600.0,1957600.0
mean,11.32839,-73.97516,40.75109,-73.97428,40.75143,1.684216
std,9.714251,0.03847423,0.02952658,0.03764636,0.03269997,1.306688
min,-62.0,-74.48963,40.50005,-74.49105,40.50005,0.0
25%,6.0,-73.99229,40.73655,-73.99159,40.73554,1.0
50%,8.5,-73.9821,40.75335,-73.98062,40.75384,1.0
75%,12.5,-73.96834,40.76752,-73.96537,40.76839,2.0
max,500.0,-72.81783,41.69685,-72.81783,41.71463,9.0


In [17]:
# NOTE(review): `test` was already loaded from the same file in cell In [4] and
# is not modified in the cells shown in between, so this reload looks
# redundant -- confirm and drop one of the two.
test = pd.read_csv('test.csv',sep =',')

In [18]:
# Apply the full cleaning pipeline to the training data.
# NOTE(review): `train` has already been passed through select_bdry (In [8])
# and remove_datapoints_from_water (In [14]); clean_data repeats both, so the
# new filtering here comes from its dropna, fare and passenger_count steps.
train = clean_data(train)