### W23P1 STAT 857 - Data Exploration

In [14]:
import pandas as pd
import numpy as np
from datetime import datetime
import calendar

# Read the three competition CSVs from the working directory, in order:
# the training rides (with fare_amount), the test rides (without it),
# and the sample submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'W23P1_train.csv',
        'W23P1_test.csv',
        'W23P1_sample_submission.csv',
    )
)

### Checking for missing values

In [2]:
# Count missing values per column of the training data.
# `isna()` yields a boolean frame and summing booleans counts the True cells.
# (Indexing the frame with that mask instead, i.e. `train[pd.isnull(train)]`,
# keeps only the NaN cells, so its sum is always 0 and missing data goes unnoticed.)
train.isna().sum()

uid                  0.0
fare_amount          0.0
pickup_datetime        0
pickup_longitude     0.0
pickup_latitude      0.0
dropoff_longitude    0.0
dropoff_latitude     0.0
passenger_count      0.0
dtype: object

In [3]:
# Count missing values per column of the test data.
# `isna()` yields a boolean frame and summing booleans counts the True cells.
# (Indexing the frame with that mask instead, i.e. `test[pd.isnull(test)]`,
# keeps only the NaN cells, so its sum is always 0 and missing data goes unnoticed.)
test.isna().sum()

uid                  0.0
pickup_datetime        0
pickup_longitude     0.0
pickup_latitude      0.0
dropoff_longitude    0.0
dropoff_latitude     0.0
passenger_count      0.0
dtype: object

### EDA

In [4]:
# Preview the first five training rows to check column names and raw value formats.
train.head()

Unnamed: 0,uid,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,31722,9.0,2013-01-07 01:50:51 UTC,-73.991421,40.75016,-73.98949,40.726085,2
1,14674,14.0,2013-01-15 20:08:00 UTC,-73.997945,40.741057,-73.956223,40.767312,6
2,37571,19.5,2013-01-20 00:25:55 UTC,-73.999161,40.688531,-74.026611,40.616634,1
3,47583,6.0,2013-01-01 02:30:00 UTC,-73.99149,40.744257,-73.980912,40.748492,1
4,29473,33.5,2013-01-02 10:45:00 UTC,-73.972773,40.677702,-73.862242,40.768117,1


In [5]:
# Dimensions of the training frame as (rows, columns).
print(train.shape)

(35000, 8)


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# training columns; a quick check on the ranges of fares, coordinates and passenger counts.
train.describe()

Unnamed: 0,uid,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0
mean,34898.261057,11.838253,-73.975153,40.75112,-73.974668,40.751746,1.709371
std,20219.52804,9.971856,0.034798,0.026839,0.033688,0.03028,1.372106
min,1.0,2.5,-74.277592,40.467615,-74.533332,40.438022,1.0
25%,17282.75,6.5,-73.992154,40.737171,-73.99143,40.736281,1.0
50%,34827.5,9.0,-73.981943,40.754077,-73.980365,40.754365,1.0
75%,52415.25,13.0,-73.967864,40.767686,-73.965351,40.768525,2.0
max,69999.0,160.0,-73.423008,40.888921,-73.35,40.9904,6.0


In [6]:
# Preview the first five test rows; same columns as train except there is no fare_amount.
test.head()

Unnamed: 0,uid,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,3,2013-01-29 09:59:26 UTC,-73.987309,40.729064,-73.99377,40.721086,1
1,10,2013-01-05 14:56:36 UTC,-73.978188,40.754557,-73.973773,40.76033,2
2,15,2013-01-22 14:48:00 UTC,-73.97331,40.78997,-73.94801,40.778685,5
3,16,2013-01-11 23:14:10 UTC,-73.984608,40.72892,-73.991357,40.688171,2
4,17,2013-01-31 02:59:44 UTC,-73.990671,40.724603,-73.983896,40.665502,2


In [7]:
# Dimensions of the test frame as (rows, columns).
print(test.shape)

(35000, 7)


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# test columns, for comparison against the training distribution.
test.describe()

Unnamed: 0,uid,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0
mean,35102.738943,-73.975468,40.750781,-73.975066,40.751674,1.698
std,20195.043455,0.034432,0.026907,0.032658,0.030258,1.362558
min,3.0,-74.341667,40.492783,-74.420223,40.498304,1.0
25%,17719.75,-73.992267,40.736221,-73.991293,40.736181,1.0
50%,35200.5,-73.98213,40.753391,-73.980507,40.754455,1.0
75%,52581.25,-73.968182,40.767652,-73.965399,40.768595,2.0
max,70000.0,-73.725777,40.905944,-73.05,40.970338,6.0


### Feature Engineering

In [15]:
# Parse the raw pickup timestamp strings. The trailing "UTC" is matched as literal
# text by the format string, so the resulting datetimes are timezone-naive.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Derive calendar features with the vectorized `.dt` accessor instead of a per-row
# `apply`: it avoids a Python call per row, and a missing timestamp (NaT) becomes a
# missing feature value rather than raising inside a lambda.
train['pickup_date'] = train['pickup_datetime'].dt.date
train['pickup_day'] = train['pickup_datetime'].dt.day            # day of month
train['pickup_hour'] = train['pickup_datetime'].dt.hour          # 0-23
train['pickup_day_of_week'] = train['pickup_datetime'].dt.day_name()  # English name, e.g. "Monday"
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_year'] = train['pickup_datetime'].dt.year

In [16]:
# Re-inspect the first five rows to confirm the parsed timestamp and the derived pickup_* columns.
train.head()

Unnamed: 0,uid,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_date,pickup_day,pickup_hour,pickup_day_of_week,pickup_month,pickup_year
0,31722,9.0,2013-01-07 01:50:51,-73.991421,40.75016,-73.98949,40.726085,2,2013-01-07,7,1,Monday,1,2013
1,14674,14.0,2013-01-15 20:08:00,-73.997945,40.741057,-73.956223,40.767312,6,2013-01-15,15,20,Tuesday,1,2013
2,37571,19.5,2013-01-20 00:25:55,-73.999161,40.688531,-74.026611,40.616634,1,2013-01-20,20,0,Sunday,1,2013
3,47583,6.0,2013-01-01 02:30:00,-73.99149,40.744257,-73.980912,40.748492,1,2013-01-01,1,2,Tuesday,1,2013
4,29473,33.5,2013-01-02 10:45:00,-73.972773,40.677702,-73.862242,40.768117,1,2013-01-02,2,10,Wednesday,1,2013


In [17]:
# Parse the raw pickup timestamp strings. The trailing "UTC" is matched as literal
# text by the format string, so the resulting datetimes are timezone-naive.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Derive the same calendar features as for the training data, using the vectorized
# `.dt` accessor instead of a per-row `apply`: it avoids a Python call per row, and a
# missing timestamp (NaT) becomes a missing feature value rather than raising.
test['pickup_date'] = test['pickup_datetime'].dt.date
test['pickup_day'] = test['pickup_datetime'].dt.day            # day of month
test['pickup_hour'] = test['pickup_datetime'].dt.hour          # 0-23
test['pickup_day_of_week'] = test['pickup_datetime'].dt.day_name()  # English name, e.g. "Monday"
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_year'] = test['pickup_datetime'].dt.year