In [None]:
import os

def scale_input_data(scale_factor):
  """Write a row-scaled copy of every input CSV next to the original file.

  For each base path in ``file_bases`` the file ``<base>.csv`` is read and
  ``<base>.scaled.csv`` is written with ``int(scale_factor * n_rows)`` rows:

  * ``scale_factor == 1.0``: the file is copied verbatim, without parsing it.
  * ``scale_factor < 1.0``: only the leading rows are kept.
  * ``scale_factor > 1.0``: the original rows are repeated cyclically until
    the target row count is reached.

  Args:
    scale_factor: Non-negative multiplier applied to the row count.

  Raises:
    ValueError: If ``scale_factor`` is negative. A negative factor used to be
      accepted and silently dropped rows from the end of the file, because the
      negative row count was used as an ``iloc`` stop.
  """
  # Imported locally (once, not once per file): this helper is called before
  # the notebook's main import cell has run.
  import shutil
  import pandas as pd

  if scale_factor < 0:
    raise ValueError('scale_factor must be non-negative, got {!r}'.format(scale_factor))

  file_bases = ['./input/test']
  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'
    if scale_factor == 1.0:
      shutil.copyfile(source_path, scaled_path)
      continue
    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Grow by appending a prefix of the frame to itself; each pass at most
      # doubles the row count and the last pass appends only the missing rows.
      while len(df_to_scale) < new_num_rows:
        missing_rows = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:missing_rows]],
                                ignore_index=True)
    df_to_scale.to_csv(scaled_path, index=False)

# Number of training rows to read when no scaling is requested.
default_nrows = 700000
if 'INPUT_SCALE_FACTOR' in os.environ:
  # An explicit scale factor was supplied: rescale the test CSV, scale the
  # training row budget by the same factor and persist it in data.txt so later
  # runs without the variable reuse the same value.
  scale_factor = float(os.environ['INPUT_SCALE_FACTOR'])
  scale_input_data(scale_factor)
  nrows = int(scale_factor * default_nrows)
  with open('./input/data.txt', 'w') as file:
    file.write(str(nrows))
elif os.path.exists('./input/data.txt'):
  # No scale factor given: reuse the row budget persisted by a previous run.
  with open('./input/data.txt', 'r') as file:
    try:
      nrows = int(file.read().strip())
    except ValueError:
      # Empty, non-numeric or undecodable content: fall back to the default.
      # Only ValueError is caught so that interrupts and genuine I/O failures
      # are no longer silently swallowed.
      nrows = default_nrows
else:
  nrows = default_nrows

In [1]:
# FIRST-AUTHOR: remove IPython commands
# %env JOBLIB_TEMP_FOLDER=/tmp

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# NOTE(review): exec() runs whatever Python source the IREWR_IMPORTS environment
# variable contains, so it must only be set by a trusted harness. It is
# presumably expected to bind `pd`, which later cells use -- confirm against the
# harness. When the variable is unset, fall back to the plain pandas import
# instead of failing with KeyError.
exec(os.environ.get('IREWR_IMPORTS', 'import pandas as pd'))

# FIRST-AUTHOR: remove ML code
# import xgboost as xgb # XGBoost package
# #from sklearn.model_selection import GridSearchCV

# import matplotlib.pyplot as plt # Matplotlib
# %matplotlib inline

# from datetime import datetime
from dateutil import tz

# FIRST-AUTHOR: remove extra dependency
# from geopy import distance

# Load the datasets

In [2]:
# Read the first `nrows` training rows, leaving out the first column, and the
# scaled copy of the test set; pickup timestamps are parsed as dates in both.
train = pd.read_csv(
    "./input/train.csv",
    usecols=range(1, 8),
    nrows=nrows,
    parse_dates=['pickup_datetime'],
)
test = pd.read_csv(
    "./input/test.scaled.csv",
    parse_dates=['pickup_datetime'],
)

# Data exploration

In [3]:
# Report the dimensions of the training frame, then display its summary statistics.
shape_report = "Train shape: {}".format(train.shape)
print(shape_report)
train.describe()

Train shape: (700000, 7)


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,700000.0,700000.0,700000.0,699992.0,699992.0,700000.0
mean,11.348549,-72.532739,39.931738,-72.531962,39.915799,1.684223
std,9.856093,12.616534,8.178329,11.583072,8.730926,1.307666
min,-44.9,-3377.680935,-3116.285383,-3383.296608,-3114.338567,0.0
25%,6.0,-73.99204,40.734991,-73.991387,40.734059,1.0
50%,8.5,-73.981793,40.75269,-73.980142,40.753152,1.0
75%,12.5,-73.967112,40.767114,-73.963647,40.768122,2.0
max,500.0,2522.271325,2621.62843,40.851027,405.65,6.0


In [4]:
# Discard every row that contains at least one missing value
train = train.dropna(axis=0, how='any')

# Keep only rides with a strictly positive fare (no free rides or negative
# fares) and at most 6 passengers
train = train.loc[train.eval('(fare_amount > 0) & (passenger_count <= 6)')]

### Coordinates filtering

In [5]:
# FIRST-AUTHOR: remove plotting
# train.iloc[:100000].plot.scatter('pickup_longitude', 'pickup_latitude')
# train.iloc[:100000].plot.scatter('dropoff_longitude', 'dropoff_latitude')
# Each removed scatter plot is replaced by a bare positional slice of the first
# 100000 rows. The first expression is evaluated and discarded; only the last
# expression of the cell is rendered as its output.
train.iloc[:100000]
train.iloc[:100000]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.841610,40.712278,1
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.761270,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42+00:00,-73.987130,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...
100007,6.1,2009-12-08 11:41:20+00:00,-73.969406,40.790576,-73.951168,40.785899,1
100008,11.7,2011-03-27 20:54:00+00:00,-73.790987,40.643098,-73.788300,40.641910,1
100009,8.1,2009-06-24 19:27:51+00:00,-73.963237,40.756846,-73.986191,40.746680,2
100010,9.7,2010-10-09 17:17:00+00:00,-73.981470,40.746298,-74.008160,40.738742,2


Pickup and dropoff locations should be within the limits of NYC

In [6]:
# Coordinates filtering: keep rides whose pickup AND dropoff both lie inside the
# bounding box longitude [-77, -70], latitude [35, 45] (bounds inclusive).
train = train[(train.pickup_longitude >= -77) &
              (train.pickup_longitude <= -70) &
              (train.dropoff_longitude >= -77) &
              # Upper bound is -70, same as for the pickup; the former bound of
              # +70 let positive (eastern-hemisphere) dropoff longitudes through.
              (train.dropoff_longitude <= -70) &
              (train.pickup_latitude >= 35) &
              (train.pickup_latitude <= 45) &
              (train.dropoff_latitude >= 35) &
              (train.dropoff_latitude <= 45)
             ]

### Dates conversion and engineering

Fares change according to the date and the hour of the day

In [7]:
# train.pickup_datetime = train.pickup_datetime.dt.tz_localize('UTC')
# Express the timezone-aware pickup timestamps in New York local time
train['pickup_datetime'] = train['pickup_datetime'].dt.tz_convert(tz.gettz('America/New_York'))

# Calendar features taken from the local pickup time:
#   year      - fares may change every year
#   dayofweek - different fares during weekdays and weekends
#   dayofyear - different fares during public holidays
#   hourofday - different fares in peak and off-peak periods
train['year'] = train['pickup_datetime'].dt.year
train['dayofweek'] = train['pickup_datetime'].dt.dayofweek
train['dayofyear'] = train['pickup_datetime'].dt.dayofyear
train['hourofday'] = train['pickup_datetime'].dt.hour

# The raw timestamp is no longer needed once the features are extracted
train = train.drop(columns='pickup_datetime')

### Distances engineering

In [8]:
# FIRST-AUTHOR: make notebook run
def distance(*args):
    """Placeholder for the removed geopy distance call.

    Accepts any positional arguments, ignores them, and always returns 100.
    """
    placeholder_distance = 100
    return placeholder_distance

# Distance between the pickup and the dropoff location of every ride
# (in miles in the original geopy-based version)
# FIRST-AUTHOR: make notebook run
#     lambda x: distance.distance((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).miles,
train['distance'] = train.apply(
    lambda ride: distance(
        (ride.pickup_latitude, ride.pickup_longitude),
        (ride.dropoff_latitude, ride.dropoff_longitude),
    ),
    axis=1,
)

# Keep rides whose distance lies strictly between 0 and 150
train = train.loc[train.eval('(distance > 0) & (distance < 150)')]

# Fare per unit of distance, used to spot incoherent fares
fare_distance_ratio = train['fare_amount'] / train['distance']
fare_distance_ratio.describe()

# FIRST-AUTHOR: remove plotting
# (fare_distance_ratio[fare_distance_ratio < 45]).hist()
fare_distance_ratio[fare_distance_ratio < 45]

# Drop incoherent fares
train = train.loc[fare_distance_ratio < 45]
del fare_distance_ratio

Let's try to see how far from the NYC airports the pickups and the dropoffs are

In [9]:
# Coordinates (latitude, longitude) of the 3 airports of NYC
airports = {
    'jfk': [40.6441666, -73.7822222],
    'laguardia': [40.7747222, -73.8719444],
    'newark': [40.6897222, -74.175],
}

# FIRST-AUTHOR: make notebook run
# The original computed each distance with geopy:
#   distance.distance((lat, lon), (airports.get(<airport>))).miles
# For every airport, compute the distance from the pickup location and from the
# dropoff location to the airport, and keep the shorter of the two per ride.
for airport, column in (('jfk', 'to_jfk'),
                        ('laguardia', 'to_laguardia'),
                        ('newark', 'to_newark')):
    pickup = train.apply(
        lambda x: distance((x.pickup_latitude, x.pickup_longitude), (airports.get(airport))),
        axis=1)
    dropoff = train.apply(
        lambda x: distance((x.dropoff_latitude, x.dropoff_longitude), (airports.get(airport))),
        axis=1)
    # Selects the shortest distance
    train[column] = pd.concat((pickup, dropoff), axis=1).min(axis=1)

del pickup, dropoff, airport, column

In [10]:
# Separate the target (the fare) from the feature columns
y = train['fare_amount']
train = train.drop(columns='fare_amount')
#train = train.drop('passenger_count', axis=1)

# Train

In [11]:
# Grid of parameters for XGBoost training
#param_grid = {'n_estimators': [3000],
#              'max_depth': [7, 8, 9],
#              'learning_rate': [0.01, 0.1],
#              'subsample': [0.8, 0.9, 1],
#              'colsample_bytree': [0.8, 0.9, 1],
#              'gamma': [0, 1e-5, 1e-4, 1e-3],
#              'reg_alpha': [1e-4]
#              }


#xgb_grid_search = GridSearchCV(xgb.XGBRegressor(eval_metric='rmse'),
#                               param_grid=param_grid,
#                               cv=3,
#                               n_jobs=-1,
#                               verbose=0)

#xgb_grid_search.fit(train, y)
#print("Best estimator: {}".format(xgb_grid_search.best_params_))
#print("Best score: {}".format(xgb_grid_search.best_score_))

# The best parameters given by the grid search
# FIRST-AUTHOR: remove ML code
# xgb_param = {'eval_metric': 'rmse',
#             'n_estimators': 3000,
#             'max_depth': 9,
#             'learning_rate': 0.1,
#             'subsample': 0.9,
#             'colsample_bytree': 0.8,
#             'gamma': 1e-4,
#             'reg_alpha': 1e-4,
#             'verbose': 0,
#             'n_jobs': -1
#             }

# xgb_model = xgb.XGBRegressor(**xgb_param)
# xgb_model.fit(train, y)
# xgb.plot_importance(xgb_model)

# Test data

In [12]:
# Processing: set the identifier column aside for the submission file and
# remove it from the feature columns
test_key = test['key']
test = test.drop(columns='key')
#test = test.drop('passenger_count', axis=1)

In [13]:
# FIRST-AUTHOR: make notebook run
# test.pickup_datetime = test.pickup_datetime.dt.tz_localize('UTC')
# Express the timezone-aware pickup timestamps in New York local time
test['pickup_datetime'] = test['pickup_datetime'].dt.tz_convert(tz.gettz('America/New_York'))

# Same calendar features as for the training set
test['year'] = test['pickup_datetime'].dt.year
test['dayofweek'] = test['pickup_datetime'].dt.dayofweek
test['dayofyear'] = test['pickup_datetime'].dt.dayofyear
test['hourofday'] = test['pickup_datetime'].dt.hour
test = test.drop(columns='pickup_datetime')


# FIRST-AUTHOR: make notebook run
# The original computed every distance below with geopy:
#   distance.distance((lat, lon), (lat, lon)).miles
# Distance between the pickup and the dropoff location of every ride
test['distance'] = test.apply(
    lambda x: distance(
        (x.pickup_latitude, x.pickup_longitude),
        (x.dropoff_latitude, x.dropoff_longitude),
    ),
    axis=1,
)

# For every airport, keep the shorter of the pickup-to-airport and
# dropoff-to-airport distances
for airport, column in (('jfk', 'to_jfk'),
                        ('laguardia', 'to_laguardia'),
                        ('newark', 'to_newark')):
    pickup = test.apply(
        lambda x: distance((x.pickup_latitude, x.pickup_longitude), (airports.get(airport))),
        axis=1)
    dropoff = test.apply(
        lambda x: distance((x.dropoff_latitude, x.dropoff_longitude), (airports.get(airport))),
        axis=1)
    test[column] = pd.concat((pickup, dropoff), axis=1).min(axis=1)
del pickup, dropoff, airport, column

In [14]:
# FIRST-AUTHOR: remove ML code
# xgb_predict = xgb_model.predict(test)

In [15]:
# FIRST-AUTHOR: remove ML code
# xgb_submission = pd.DataFrame({ 'key': test_key,
#                                'fare_amount': xgb_predict })
# With the model removed, the key column is written in place of the predictions
xgb_submission = pd.DataFrame({
    'key': test_key,
    'fare_amount': test_key,
})
xgb_submission.to_csv("xgb_submission.csv", index=False)