In [None]:
# Author: Chongzheng Zhao
# Kaggle Competition - New York City Taxi Fare Prediction
# Final result: we beat 1454 teams!
# Our team, NYCTAXI, placed 30th out of 1484 teams (top 2%).
# Final Score 2.86770
# Competition Official Website: https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
# To see the leaderboard(competition result): https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/leaderboard
# Welcome to my Github profile: https://github.com/ChongzhengZhao/
# Welcome to my Kaggle Profile: https://www.kaggle.com/chongzhengzhao
# Welcome to my Linkedin Profile: https://www.linkedin.com/in/chongzhengzhao/
# Last updated: 30/11/2018

In [None]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc

In [None]:
# Reading Data
# Load the complete training file; a single year is selected from it in the next cell.
train_df_raw =  pd.read_csv('train.csv')

# NOTE: change the year on the next line (any year from 2009 to 2015).

In [None]:
# NOTE: change TRAIN_YEAR to train on a different year (2009 to 2015).
TRAIN_YEAR = 2009

# A 'year' column is only created later by add_datetime_info, so the frame read
# from disk may not have one yet. In that case take the year from the leading
# "YYYY" of pickup_datetime (strings look like "2009-06-15 17:26:21 UTC").
if 'year' in train_df_raw.columns:
    pickup_year = train_df_raw['year']
else:
    # errors='coerce' turns missing/malformed timestamps into NaN, which never
    # equals TRAIN_YEAR, so such rows are simply left out of the selection.
    pickup_year = pd.to_numeric(train_df_raw['pickup_datetime'].str[:4], errors='coerce')

# .copy() gives train_df its own data, so the column assignments made by later
# cells do not write into a slice of train_df_raw.
train_df = train_df_raw[pickup_year == TRAIN_YEAR].copy()

In [None]:
# Discard every row that has at least one missing value
train_df = train_df.dropna(axis='index', how='any')

In [None]:
# (rows, columns) left after dropping rows with missing values
train_df.shape

In [None]:
def clean_df(df):
    """
    Return the rows of df that have a plausible fare and non-zero coordinates.

    A row is kept when 0 < fare_amount <= 500 and none of pickup_longitude,
    pickup_latitude, dropoff_longitude, dropoff_latitude equals 0.
    passenger_count is not filtered. The input frame is not modified.
    """
    fare_ok = (df.fare_amount > 0) & (df.fare_amount <= 500)

    coord_cols = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude']
    coords_ok = (df[coord_cols] != 0).all(axis=1)

    return df[fare_ok & coords_ok]


In [None]:
# Remove rows with implausible fares or zero coordinates (see clean_df)
train_df = clean_df(train_df)

In [None]:
# (rows, columns) left after clean_df
train_df.shape

In [None]:
# Haversine (great-circle) distance helper
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle distance in kilometres between pickup and dropoff.

    All four arguments are in degrees. Only NumPy ufuncs and arithmetic are
    applied, so scalars and array-likes (e.g. pandas Series) can be mixed.
    """
    # Earth radius in km
    earth_radius_km = 6371

    # Degrees -> radians
    lat_from = np.radians(pickup_lat)
    lon_from = np.radians(pickup_lon)
    lat_to = np.radians(dropoff_lat)
    lon_to = np.radians(dropoff_lon)

    # Differences along each dimension
    delta_lat = lat_to - lat_from
    delta_lon = lon_to - lon_from

    # Haversine formula
    hav = np.sin(delta_lat/2.0)**2 + np.cos(lat_from) * np.cos(lat_to) * np.sin(delta_lon/2.0)**2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

In [None]:
def sphere_dist_bear(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the initial bearing (forward azimuth) from pickup to dropoff.

    All four arguments are in degrees; scalars and array-likes (e.g. pandas
    Series) can be mixed. The result is in radians as produced by np.arctan2,
    i.e. within [-pi, pi]: 0 is due north, +pi/2 due east, -pi/2 due west.
    """
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    # Longitude difference measured from pickup to dropoff, so that eastbound
    # trips get a positive bearing.
    dlon = dropoff_lon - pickup_lon

    # Standard forward-azimuth formula. cos(dropoff_lat) multiplies sin(dlon);
    # it does not belong inside the sine's argument.
    east_component = np.sin(dlon) * np.cos(dropoff_lat)
    north_component = (np.cos(pickup_lat) * np.sin(dropoff_lat)
                       - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return np.arctan2(east_component, north_component)

In [None]:
def radian_conv(degree):
    """
    Convert an angle, or an array-like of angles, from degrees to radians.
    """
    in_radians = np.radians(degree)
    return in_radians

In [None]:
def add_datetime_info(dataset):
    """
    Parse pickup_datetime and append calendar features to dataset, in place.

    pickup_datetime must hold strings shaped like "2009-06-15 17:26:21 UTC".
    The column is replaced by parsed datetimes, and the columns hour, day,
    month, weekday and year are added in that order. The same (mutated)
    DataFrame is returned.
    """
    # Parse the timestamp strings once and reuse the result below
    parsed = pd.to_datetime(dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = parsed

    # One new column per calendar component of the pickup time
    for component in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[component] = getattr(parsed.dt, component)

    return dataset

In [None]:
def add_airport_dist(dataset):
    """
    Add one landmark-distance column per landmark to dataset, in place.

    For every landmark the new column holds the sum of two sphere_dist legs:
    pickup -> landmark plus landmark -> dropoff. Columns are added in this order:
      jfk_dist: John F. Kennedy International Airport
      ewr_dist: Newark Liberty International Airport
      lga_dist: LaGuardia Airport
      sol_dist: Statue of Liberty
      nyc_dist: New York centre
    dataset must provide pickup/dropoff latitude and longitude columns in
    degrees. The same (mutated) DataFrame is returned.
    """
    # (column prefix, (latitude, longitude)) in degrees; the tuple order fixes the column order
    landmarks = (
        ('jfk', (40.639722, -73.778889)),
        ('ewr', (40.6925, -74.168611)),
        ('lga', (40.77725, -73.872611)),
        ('sol', (40.6892, -74.0445)),
        ('nyc', (40.7141667, -74.0063889)),
    )

    pickup_lat = dataset['pickup_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lat = dataset['dropoff_latitude']
    dropoff_lon = dataset['dropoff_longitude']

    for prefix, (landmark_lat, landmark_lon) in landmarks:
        leg_in = sphere_dist(pickup_lat, pickup_lon, landmark_lat, landmark_lon)
        leg_out = sphere_dist(landmark_lat, landmark_lon, dropoff_lat, dropoff_lon)
        dataset[prefix + '_dist'] = leg_in + leg_out

    return dataset

In [None]:
# Calendar features first, then landmark distances
train_df = add_airport_dist(add_datetime_info(train_df))

# Trip length and direction; both helpers expect degrees, so they must run
# before the coordinates are converted below.
train_df['distance'] = sphere_dist(train_df['pickup_latitude'], train_df['pickup_longitude'],
                                   train_df['dropoff_latitude'], train_df['dropoff_longitude'])
train_df['bearing'] = sphere_dist_bear(train_df['pickup_latitude'], train_df['pickup_longitude'],
                                       train_df['dropoff_latitude'], train_df['dropoff_longitude'])

# Overwrite the raw coordinates with their radian equivalents
for coord_col in ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'):
    train_df[coord_col] = radian_conv(train_df[coord_col])

In [None]:
# Preview the engineered columns (coordinates are now in radians)
train_df.head()

In [None]:
# 'key' and 'pickup_datetime' are not used as model features
train_df = train_df.drop(columns=['key', 'pickup_datetime'])

In [None]:
# Preview the frame after dropping 'key' and 'pickup_datetime'
train_df.head()

# NOTE: when running another year, remember to rename the file exported on the next line so that the previous export is not overwritten.

In [None]:
# Export the engineered training rows (features plus fare_amount) without the index column
train_df.to_csv('train_22m_python_2009.csv', index=False)

In [None]:
# Target vector, then the feature matrix without the target column
y = train_df['fare_amount']
train_df = train_df.drop(columns=['fare_amount'])

In [None]:
# Preview the feature matrix (fare_amount has been removed)
train_df.head()

In [None]:
# Hold out 10% of the rows for validation; random_state fixes the split
x_train,x_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)

# NOTE: when running another year, remember to rename the file exported in the next cell so that the previous export is not overwritten.

In [None]:
# Row labels (index values) of the training and validation splits
x_train_list = x_train.index.tolist()
x_test_list = x_test.index.tolist()

# One-column table with the training row labels, written without the pandas index
temp_train = pd.DataFrame({'x_train_index': x_train_list})
temp_train.to_csv('x_train_index_2009.csv', index = False)

# NOTE: when running another year, remember to rename the file exported in the next cell so that the previous export is not overwritten.

In [None]:
# One-column table with the validation row labels, written without the pandas index
temp_test = pd.DataFrame(columns=['x_test_index'],data=x_test_list)
temp_test.to_csv("x_test_index_2009.csv", index = False)

In [None]:
# The unsplit frame and target are no longer needed; release them
del train_df, y
gc.collect()

In [None]:
# Wrap both target vectors in one-column frames (fresh 0..n-1 index) for CSV export
y_train1 = pd.DataFrame({'y_train': np.array(y_train)})
y_valid1 = pd.DataFrame({'y_valid': np.array(y_test)})

# NOTE: when running another year, remember to rename the files exported in the next cell so that the previous exports are not overwritten.

In [None]:
# Export the training and validation targets without the index column
y_train1.to_csv('y_train_2009.csv',index=False)
y_valid1.to_csv('y_valid_2009.csv', index=False)

In [None]:
# LightGBM hyper-parameters for the fare regressor (RMSE objective and metric).
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': 4,
    'num_leaves': 31,
    'learning_rate': 0.029,
    'max_depth': -1,
    # NOTE(review): per the LightGBM parameter docs, row subsampling is only
    # active when bagging_freq / subsample_freq is > 0, which is not set here.
    # Confirm whether bagging was intended.
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'metric': 'rmse',
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    # NOTE(review): documented as used only by binary / multiclassova
    # objectives; presumably has no effect for 'regression'.
    'scale_pos_weight': 1,
    'zero_as_missing': True,
    'seed': 0,
}

# Upper bound on boosting rounds. This cell used to put 'num_rounds': 50000 into
# params AND pass num_boost_round=10000 to train(); LightGBM gives the params
# alias precedence (with a warning), so 50000 was the effective cap. It is now
# stated once. Early stopping normally ends training well before the cap.
NUM_BOOST_ROUND = 50000

# Calendar columns are handled as categorical features by LightGBM
CATEGORICAL_FEATURES = ['year', 'month', 'day', 'weekday']

train_set = lgbm.Dataset(x_train, y_train, silent=False, categorical_feature=CATEGORICAL_FEATURES)
valid_set = lgbm.Dataset(x_test, y_test, silent=False, categorical_feature=CATEGORICAL_FEATURES)

# Stop when the validation RMSE has not improved for 500 rounds; log every 500 rounds
model = lgbm.train(params, train_set=train_set, num_boost_round=NUM_BOOST_ROUND,
                   early_stopping_rounds=500, verbose_eval=500, valid_sets=[valid_set])

In [None]:
# Predict on both splits using the iteration selected by early stopping
y_train_pred=model.predict(x_train, num_iteration = model.best_iteration)  
y_valid_pred=model.predict(x_test, num_iteration = model.best_iteration)  

In [None]:
# Wrap both prediction vectors in one-column frames for CSV export
y_train_pred1 = pd.DataFrame({'y_train_pred': np.array(y_train_pred)})
y_valid_pred1 = pd.DataFrame({'y_valid_pred': np.array(y_valid_pred)})

# NOTE: when running another year, remember to rename the files exported in the next cell so that the previous exports are not overwritten.

In [None]:
# Export the training and validation predictions without the index column
y_train_pred1.to_csv('y_train_pred_2009.csv', index=False)
y_valid_pred1.to_csv('y_valid_pred_2009.csv', index=False)

In [None]:
# Release the splits, targets and prediction frames
del x_train, y_train, x_test, y_test
del y_train1, y_valid1
del y_train_pred, y_valid_pred
del y_train_pred1, y_valid_pred1
gc.collect()

In [None]:
# Load the competition test set
test_df =  pd.read_csv('test.csv')

In [None]:
# Same feature pipeline as for the training frame: calendar features, then landmark distances
test_df = add_airport_dist(add_datetime_info(test_df))

# Trip length and direction; both helpers expect degrees, so they must run
# before the coordinates are converted below.
test_df['distance'] = sphere_dist(test_df['pickup_latitude'], test_df['pickup_longitude'],
                                  test_df['dropoff_latitude'], test_df['dropoff_longitude'])
test_df['bearing'] = sphere_dist_bear(test_df['pickup_latitude'], test_df['pickup_longitude'],
                                      test_df['dropoff_latitude'], test_df['dropoff_longitude'])

# Overwrite the raw coordinates with their radian equivalents
for coord_col in ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'):
    test_df[coord_col] = radian_conv(test_df[coord_col])

# NOTE: when running another year, remember to rename the file exported in the next cell so that the previous export is not overwritten.

In [None]:
# Keep the ride keys for the submission file, then drop the non-feature columns
test_key = test_df['key']
test_df = test_df.drop(['key', 'pickup_datetime'],axis=1)

# The year suffix previously came after the extension ('test_22m_python.csv_2009'),
# producing a file without a .csv extension; it now follows the naming of the
# other exports.
test_df.to_csv('test_22m_python_2009.csv', index=False)

# NOTE: when running another year, remember to rename the file exported in the next cell so that the previous export is not overwritten.

In [None]:
# Score the test set with the iteration selected by early stopping.
# NOTE(review): the model was fitted on a single year but scores every test
# row here; presumably the per-year submissions are combined elsewhere. Confirm.
prediction = model.predict(test_df, num_iteration=model.best_iteration)

# Submission layout: ride key followed by the predicted fare
submission = pd.DataFrame({"key": test_key, "fare_amount": prediction})
submission.to_csv('submission_22m_python_2009.csv', index=False)

In [None]:
# Gain-based importance of every feature the booster was trained on.
# The names are read from the model itself: x_train is deleted by an earlier
# clean-up cell, so x_train.columns raises NameError on a top-to-bottom run.
importance_df = pd.DataFrame({
    "feature": model.feature_name(),
    "importance": model.feature_importance(importance_type='gain'),
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('seaborn-whitegrid')
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Horizontal bar chart of gain importance, most important feature first
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=importance_df.sort_values(by="importance", ascending=False))
# NOTE(review): the title mentions folds, but this notebook trains a single model
plt.title('LightGBM Features (avg over folds)')
# tight_layout has to be called; the bare attribute reference did nothing
plt.tight_layout()
plt.show()