In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
sample_fraction = .01

In [3]:
df = pd.read_csv('train.csv',nrows=0)

In [4]:
def skip_row(row_indx):
    if row_indx==0:
        return False
    return random.random()> sample_fraction

random.seed(69)

In [5]:
df.columns
selected_cols = ['fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count']
dtypes ={'fare_amount': 'float32', 
         'pickup_datetime': 'float32', 
         'pickup_longitude': 'float32',
         'pickup_latitude': 'float32', 
         'dropoff_longitude': 'float32', 
         'dropoff_latitude': 'float32',
         'passenger_count': 'uint8'}

df = pd.read_csv('train.csv',usecols=selected_cols,dtype=dtypes,skiprows=skip_row,parse_dates=['pickup_datetime'])

In [6]:
test_df = pd.read_csv('test.csv',dtype=dtypes,parse_dates=['pickup_datetime'])

In [7]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,553990.0,553990.0,553990.0,553986.0,553986.0,553990.0
mean,11.352805,-72.513634,39.925739,-72.510345,39.925522,1.685391
std,9.812602,11.352263,8.006535,11.018735,8.404864,1.33739
min,-52.0,-2004.19043,-2478.886475,-863.976074,-3112.423828,0.0
25%,6.0,-73.992088,40.734928,-73.991402,40.734161,1.0
50%,8.5,-73.981796,40.752666,-73.980125,40.753231,1.0
75%,12.5,-73.967041,40.767208,-73.96357,40.768131,2.0
max,499.98999,2116.788574,2892.856445,2116.788574,2892.856445,208.0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_df , val_df = train_test_split(df,test_size=.2,random_state=69)

In [10]:
len(val_df)

110798

In [11]:
len(train_df)

443192

In [12]:
df.isna().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    4
dropoff_latitude     4
passenger_count      0
dtype: int64

In [13]:
train_df = train_df.dropna()
val_df = val_df.dropna()

In [14]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [15]:
input_cols =[ 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
target_cols = 'fare_amount'

In [16]:
train_input = train_df[input_cols]
train_target = train_df[target_cols]

test_inputs = test_df[input_cols]

val_input = val_df[input_cols]
val_target = val_df[target_cols]

In [17]:
class MeanRegressor:
    def fit(self,inputs,targets):
        self.mean = targets.mean()
    def predit(self,inputs):
        return np.full(inputs.shape[0],self.mean)

In [18]:
mean_model = MeanRegressor()

mean_model.fit(train_input,train_target)

mean_model.mean

np.float32(11.355893)

In [19]:
val_preds = mean_model.predit(val_input)
train_preds= mean_model.predit(train_input)

In [20]:
val_preds

array([11.355893, 11.355893, 11.355893, ..., 11.355893, 11.355893,
       11.355893], shape=(110798,), dtype=float32)

In [21]:
from sklearn.metrics import root_mean_squared_error

In [22]:
def rmse(targets,preds):
    return np.sqrt(root_mean_squared_error(targets,preds))

In [23]:
rmse(train_target,train_preds)

np.float64(3.1290909501899344)

In [24]:
rmse(val_target,val_preds)

np.float64(3.146079872693739)

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
model = LinearRegression()

In [27]:
model.fit(train_input,train_target)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [28]:
train_preds = model.predict(train_input)

In [29]:
rmse(train_target,train_preds)

np.float64(3.128789663821621)

In [30]:
def add_dataparts(df,col):
    df[col +'_year']=df[col].dt.year
    df[col+'_month']=df[col].dt.month
    df[col+'_day'] = df[col].dt.day
    df[col+'_weekday']=df[col].dt.weekday
    df[col+'_hour'] = df[col].dt.hour

In [31]:
train_df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [32]:
add_dataparts(train_df,'pickup_datetime')
add_dataparts(val_df,'pickup_datetime')
add_dataparts(test_df,'pickup_datetime')

In [33]:
def haversine_np(lon1,lat1,lon2,lat2):
    lon1,lat1,lon2,lat2 = map(np.radians,[lon1,lat1,lon2,lat2])

    dlon = lon2-lon1
    dlat = lat2-lat1

    a=np.sin(dlat/2.0)**2 +np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2*np.arcsin(np.sqrt(a))
    km=6367*c
    return km

In [34]:
def add_trip_distance(df):
    df['trip_distance'] = haversine_np(df['pickup_longitude'],df['pickup_latitude'],df['dropoff_longitude'],df['dropoff_latitude'])

In [35]:
add_trip_distance(train_df)
add_trip_distance(val_df)
add_trip_distance(test_df)

In [36]:
jfk_lonlat= -73.7781,40.6413
lga_lonlat = -73.8740,40.7769
ewr_lonlat=-74.1745,40.6895
met_lonlat=-73.9632,40.7794
wtc_lonlat=-74.0099,40.7126



In [37]:
def add_landmark_dropoff_distance(df,landmark_name,landmark_lonlat):
    lon,lat = landmark_lonlat
    df[landmark_name + '_drop_distance'] = haversine_np(lon,lat,df['dropoff_longitude'],df['dropoff_latitude']) 

In [38]:
def add_landmark(df):
    landmarks=[('jfk',jfk_lonlat),('lga',lga_lonlat),('ewr',ewr_lonlat),('met',met_lonlat),('wtc',wtc_lonlat)]
    for name ,lonlat in landmarks:
        add_landmark_dropoff_distance(df,name,lonlat)

In [39]:
add_landmark(train_df)
add_landmark(val_df)
add_landmark(test_df)

In [49]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,553990.0,553990.0,553990.0,553986.0,553986.0,553990.0
mean,11.352805,-72.513634,39.925739,-72.510345,39.925522,1.685391
std,9.812602,11.352263,8.006535,11.018735,8.404864,1.33739
min,-52.0,-2004.19043,-2478.886475,-863.976074,-3112.423828,0.0
25%,6.0,-73.992088,40.734928,-73.991402,40.734161,1.0
50%,8.5,-73.981796,40.752666,-73.980125,40.753231,1.0
75%,12.5,-73.967041,40.767208,-73.96357,40.768131,2.0
max,499.98999,2116.788574,2892.856445,2116.788574,2892.856445,208.0


In [50]:
def remove_outliners(df):
    return df[(df['fare_amount']>=1.) & (df['fare_amount'] <=500.) &
              (df['pickup_longitude'] >= -75) & (df['pickup_longitude'] <= -72) &
              (df['dropoff_longitude'] >=-75) & (df['dropoff_longitude'] <= -72) &
              (df['pickup_latitude'] >= 40) & (df['pickup_latitude'] >= 42 ) &
              (df['dropoff_latitude']>=40) & (df['dropoff_latitude'] >=42) &
              (df['passenger_count'] >=1) & (df['passenger_count'] <=6)
               ]

In [52]:

remove_outliners(train_df)
remove_outliners(val_df)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,trip_distance,jfk_drop_distance,lga_drop_distance,ewr_drop_distance,met_drop_distance,wtc_drop_distance


In [66]:
train_df.dtypes

fare_amount                            float32
pickup_datetime            datetime64[ns, UTC]
pickup_longitude                       float32
pickup_latitude                        float32
dropoff_longitude                      float32
dropoff_latitude                       float32
passenger_count                          uint8
pickup_datetime_year                     int32
pickup_datetime_month                    int32
pickup_datetime_day                      int32
pickup_datetime_weekday                  int32
pickup_datetime_hour                     int32
trip_distance                          float32
jfk_drop_distance                      float32
lga_drop_distance                      float32
ewr_drop_distance                      float32
met_drop_distance                      float32
wtc_drop_distance                      float32
dtype: object

In [70]:
train_df.to_parquet('trained_data/train.parquet',engine='fastparquet',index=False)
val_df.to_parquet('trained_data/val.parquet',engine='fastparquet',index=False)