In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt

# alt.renderers.enable('notebook')

## import nyc taxi data

In [2]:
# Load only the first 5000 trips. `nrows` stops the parser after 5000 data rows
# instead of parsing the whole file and discarding the rest with a slice.
# NOTE(review): 5000 presumably keeps the frame within Altair's default
# row limit for embedded data -- confirm before raising it.
df = pd.read_csv(
    filepath_or_buffer="sampled_taxi.csv",
    sep=',',
    header=0,
    index_col=None,
    lineterminator='\n',
    nrows=5000,
)
print(df.shape)

(5000, 18)


In [3]:
# Convert both timestamp columns to pandas datetimes.
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"])

# Day-of-month and hour-of-day of each pickup; later cells filter and group on these.
df["day"] = df["pickup_datetime"].dt.day
df["hour"] = df["pickup_datetime"].dt.hour

df.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_distance,Fare_amount,Extra,MTA_tax,Tip_amount,Tolls_amount,improvement_surcharge,Total_amount,Payment_type,Trip_type,day,hour
0,2,2016-06-16 10:18:09,2016-06-16 10:32:32,-73.978424,40.666683,-73.950439,40.68103,1,2.58,11.5,0.0,0.5,0.0,0.0,0.3,12.3,2,1.0,16,10
1,2,2016-06-19 16:09:59,2016-06-19 16:24:11,-73.991203,40.685532,-73.985603,40.668816,1,1.87,11.0,0.0,0.5,2.95,0.0,0.3,14.75,1,1.0,19,16
2,1,2016-06-15 11:22:07,2016-06-15 11:34:03,-73.954796,40.789211,-73.941444,40.788609,1,1.0,9.0,0.0,0.5,1.95,0.0,0.3,11.75,1,1.0,15,11
3,2,2016-06-18 06:30:07,2016-06-18 06:37:29,-73.951561,40.811962,-73.967468,40.792873,1,1.78,8.0,0.0,0.5,1.76,0.0,0.3,10.56,1,1.0,18,6
4,2,2016-06-17 20:53:31,2016-06-17 21:05:34,-73.986336,40.703938,-73.997955,40.682999,1,2.29,10.0,0.5,0.5,3.39,0.0,0.3,14.69,1,1.0,17,20


## Line Chart

In [4]:
# Line chart for trips picked up on day 17: mean trip distance per
# (vendor, pickup timestamp) group, one coloured line per vendor.
(
    alt.Chart(df.loc[df["day"] == 17])
    .transform_aggregate(
        mean_dist="mean(Trip_distance)",
        groupby=["VendorID", "pickup_datetime"],
    )
    .mark_line()
    .encode(
        x="pickup_datetime:T",
        y="mean_dist:Q",
        color="VendorID:N",
    )
)

### Practice
How can you show the two line charts in different rows? Try changing the aspect ratio and pick the one you think works best.

*Hint:* `aspect_ratio = width / height` (so `width = height * aspect_ratio`)

## Table

In [5]:

# Mean trip distance per vendor for each pickup hour of day 17.
# `hour` is encoded as nominal, so every hour becomes a discrete x position.
(
    alt.Chart(df.loc[df["day"] == 17])
    .mark_line()
    .transform_aggregate(
        groupby=["VendorID", "hour"],
        mean_dist="mean(Trip_distance)",
    )
    .encode(
        alt.X("hour:N"),
        alt.Y("mean_dist:Q"),
        alt.Color("VendorID:N"),
    )
)

### For mark_circle, mark_square, mark_point, we can use their size (area) to encode a numeric value

In [6]:
# Table-style view of day 17: one circle per (hour, vendor) cell,
# with the circle's size encoding the mean trip distance of that cell.
(
    alt.Chart(df.loc[df["day"] == 17])
    .transform_aggregate(
        mean_dist="mean(Trip_distance)",
        groupby=["VendorID", "hour"],
    )
    .mark_circle()
    .encode(
        x="hour:N",
        y="VendorID:N",
        size=alt.Size("mean_dist:Q"),
    )
)

In [7]:
# Same (hour, vendor) table as the circle version, drawn with square marks;
# mark size again encodes the mean trip distance.
day17_squares = None  # placeholder removed below; kept out of the namespace
del day17_squares
(
    alt.Chart(df.loc[df["day"] == 17])
    .mark_square()
    .encode(
        alt.X("hour:N"),
        alt.Y("VendorID:N"),
        alt.Size("mean_dist:Q"),
    )
    .transform_aggregate(
        groupby=["VendorID", "hour"],
        mean_dist="mean(Trip_distance)",
    )
)

### Practice: Try to answer the following questions.
- Are trips with more passengers usually longer than trips with only one passenger?
- At what time, and with how many passengers, is the average NYC taxi trip on June 17th the longest?

Please use both color and size to encode the average taxi trip distance in a table. For the color scheme, could you please specify it as 'goldorange'.

- What's the difference between mark_rect() and mark_square()?

## Gantt Chart

#### Now we try to explore the trips on June 17th that started between 6pm and 7pm. 
How do the long trips (>30min) compare with each other in terms of pickup time, dropoff time and duration?

In [8]:
def transformation(x):
    """Return the length of a single Timedelta `x` in minutes (float).

    Uses total_seconds() so the `days` part and the sign of the interval are
    respected; summing only the hours/minutes/seconds components would drop
    whole days and misreport negative intervals.
    """
    return x.total_seconds() / 60.0


# Trip duration in minutes. The vectorised .dt accessor replaces a per-row
# Python apply and gives the same numbers as `transformation` for every row.
df['duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60.0

In [9]:
import datetime

# Trips picked up on day 17 during hour 18 whose duration exceeds 30 minutes.
# .copy() detaches the subset from `df`, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
event_df = df.loc[
    (df["day"] == 17) & (df["hour"] == 18) & (df["duration"] > 30)
].copy()

# Reuse each row's original index label as its trip identifier.
event_df["trip_id"] = event_df.index.values
event_df

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,Passenger_count,Trip_distance,Fare_amount,...,Tip_amount,Tolls_amount,improvement_surcharge,Total_amount,Payment_type,Trip_type,day,hour,duration,trip_id
371,2,2016-06-17 18:54:23,2016-06-17 19:44:56,-73.951279,40.802753,-73.783005,40.648827,1,18.29,52.0,...,12.57,5.54,0.3,75.41,1,1.0,17,18,50.55,371
1183,2,2016-06-17 18:05:11,2016-06-17 18:37:11,-73.953018,40.809536,-73.905098,40.8325,1,3.76,21.0,...,2.0,0.0,0.3,24.8,1,1.0,17,18,32.0,1183
2709,2,2016-06-17 18:32:48,2016-06-17 19:50:22,-73.88607,40.681618,-73.884819,40.850246,2,19.45,66.5,...,0.0,11.75,0.3,83.95,2,1.0,17,18,77.566667,2709
3010,2,2016-06-17 18:33:21,2016-06-17 19:10:19,-73.981102,40.689308,-73.896301,40.664528,1,6.14,26.5,...,0.0,0.0,0.3,28.3,1,1.0,17,18,36.966667,3010
4137,2,2016-06-17 18:43:33,2016-06-17 19:15:40,-73.993164,40.692841,-74.006081,40.735737,1,4.1,21.5,...,5.82,0.0,0.3,29.12,1,1.0,17,18,32.116667,4137
4219,2,2016-06-17 18:19:07,2016-06-17 18:51:29,-73.949387,40.636108,-73.951195,40.697311,1,4.77,22.5,...,0.0,0.0,0.3,26.25,1,1.0,17,18,32.366667,4219


In [10]:
# Show the column labels of df, including the derived day/hour/duration columns.
# Note: in the recorded output the label 'Trip_type ' carries a trailing space.
df.columns

Index(['VendorID', 'pickup_datetime', 'dropoff_datetime', 'Pickup_longitude',
       'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude',
       'Passenger_count', 'Trip_distance', 'Fare_amount', 'Extra', 'MTA_tax',
       'Tip_amount', 'Tolls_amount', 'improvement_surcharge', 'Total_amount',
       'Payment_type', 'Trip_type ', 'day', 'hour', 'duration'],
      dtype='object')

In [11]:
# Gantt-style chart: one horizontal bar per trip, running from its pickup
# time (x) to its dropoff time (x2), with trips stacked by trip_id.
(
    alt.Chart(event_df)
    .mark_bar()
    .encode(
        alt.X("pickup_datetime:T"),
        alt.X2("dropoff_datetime:T"),
        alt.Y("trip_id:N"),
    )
)

### Practice: Try to plot to answer the following question:
How are the long trips (duration > 30min) on June 18th from vendor 1 distributed in terms of trip pickup time, dropoff time and duration? And how are payment types (1: Credit card, 2: Cash, 3: No charge) distributed among them?

### Practice:
How are the long trips(>30min) on June 18th distributed in terms of pickup time, dropoff time and fare amount?

## Dimensionality Reduction & Projection

### Principal Component Analysis

In [12]:
from sklearn.decomposition import PCA

In [13]:
# Initialise PCA with two components. random_state pins the result in case
# scikit-learn's svd_solver='auto' selects the randomized solver (the choice
# depends on the library version and the data shape).
pca = PCA(n_components=2, random_state=0)

# Prepare the features to be analysed, as a plain 2-D NumPy array.
# NOTE(review): the columns are on different scales and are not standardised
# here; PCA is scale-sensitive, so large-variance columns will dominate the
# components -- consider scaling first if that is not intended.
X = df[['Passenger_count', 'Trip_distance', 'Fare_amount', 'Tip_amount', 'day', 'hour', 'duration']].values

# Fit PCA and project every trip onto the first two principal components.
reduced_data = pca.fit_transform(X)


### K-Means

In [14]:
from sklearn.cluster import KMeans

In [15]:
# Cluster the 2-D PCA projection into two groups.
# random_state makes the centroid initialisation (and hence which cluster gets
# label 0 or 1) reproducible across runs; n_init is set explicitly because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)

# fit_predict fits the model and returns the cluster index of every sample in
# one call, replacing fit() followed by predict() on the same data.
labels = kmeans.fit_predict(reduced_data)

### Practice: 
Try to play with PCA and K-Means, and then plot how the trips are distributed in a 2D space.