# Imports and data

In [1]:
import pandas as pd
import numpy as np
import swifter

These data (~71 million rows) were taken from https://www.kaggle.com/benhamner/sf-bay-area-bike-share/data 

In [2]:
# Load the bike-share status table from a Feather file into `data`.
# NOTE(review): the path is absolute and machine-specific; adjust it for your environment.
data = pd.read_feather('/home/ec2-user/data/status')

In [3]:
# Print the frame's (rows, columns) shape, then display its first rows.
print(data.shape)
data.head()

(71984434, 4)


Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013/08/29 12:06:01
1,2,2,25,2013/08/29 12:07:01
2,2,2,25,2013/08/29 12:08:01
3,2,2,25,2013/08/29 12:09:01
4,2,2,25,2013/08/29 12:10:01


# Apply any function in the fastest available manner

## When possible, the vectorized form of the function is used, for 100x the speed of pandas

In [4]:
def bikes_proportion(x, max_x):
    """Return ``x`` expressed as a fraction of ``max_x``.

    ``x`` is multiplied by 1.0 before the division, so integer inputs are
    promoted to floating point first. Only ``*`` and ``/`` are used, so the
    function accepts a scalar as well as anything that supports those
    operators element-wise.

    Parameters
    ----------
    x : value (or array-like) to scale.
    max_x : divisor that ``x`` is measured against.

    Returns
    -------
    ``x * 1.0 / max_x``.
    """
    as_float = x * 1.0
    return as_float / max_x

In [5]:
# Scale every bikes_available value by the column-wide maximum and store it as
# 'bike_prop'; max_x is handed to swifter's apply as a keyword argument for
# bikes_proportion. %time reports how long the statement takes.
%time data['bike_prop'] = data['bikes_available'].swifter.apply(bikes_proportion, max_x=np.max(data['bikes_available']))

CPU times: user 832 ms, sys: 976 ms, total: 1.81 s
Wall time: 1.81 s


## When a vectorized form is not available, dask parallel processing is used for 10x the speed of pandas

In [6]:
def gt_5_bikes(x):
    """Tell whether a single bike count ``x`` is strictly greater than 5.

    The comparison result is reduced to one Python ``bool``, so this works
    on scalars only; a multi-element array has no single truth value and
    raises ``ValueError``.

    Returns
    -------
    bool
        ``True`` when ``x > 5``, otherwise ``False``.
    """
    return bool(x > 5)

In [7]:
# Flag rows with more than 5 bikes available using the scalar gt_5_bikes
# function, and time the statement.
%time data['gt_5_bikes'] = data['bikes_available'].swifter.apply(gt_5_bikes)

CPU times: user 34.1 s, sys: 1.63 s, total: 35.7 s
Wall time: 35.2 s


### But when possible, you should still write code in a vectorized format

In [8]:
def gt_5_bikes_vectorized(x):
    """Element-wise test of whether each value in ``x`` exceeds 5.

    Parameters
    ----------
    x : array-like supporting ``>`` comparison against an integer.

    Returns
    -------
    numpy.ndarray
        Boolean array: ``True`` where the value is greater than 5,
        ``False`` elsewhere.
    """
    more_than_five = x > 5
    return np.where(more_than_five, True, False)

In [9]:
# Compute the same "more than 5 bikes" flag with the np.where-based
# gt_5_bikes_vectorized function, and time the statement.
%time data['gt_5_bikes_vec'] = data['bikes_available'].swifter.apply(gt_5_bikes_vectorized)

CPU times: user 144 ms, sys: 88 ms, total: 232 ms
Wall time: 233 ms


In [10]:
# Preview the first rows, now including the columns assigned in the cells above.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False


## When you can't write code in a vectorized format, swifter still makes parallel processing easy 

In [11]:
# Parse the 'time' column with pd.to_datetime (through swifter's apply) and
# store the result in a new 'date' column; %time reports the duration.
%time data['date'] = data['time'].swifter.apply(pd.to_datetime)

CPU times: user 17.6 s, sys: 708 ms, total: 18.4 s
Wall time: 18.3 s


In [12]:
def convert_to_human(datetime):
    """Format a timestamp as e.g. ``'Thursday, the 29th day of August, 2013'``.

    Parameters
    ----------
    datetime : datetime.datetime or pandas.Timestamp
        Any single timestamp object exposing ``day``, ``year`` and
        ``strftime``. (The parameter name shadows the ``datetime`` module
        inside this function; it is kept for backward compatibility.)

    Returns
    -------
    str
        ``"<weekday>, the <day><ordinal suffix> day of <month>, <year>"``.
        Weekday and month names come from ``strftime`` and therefore follow
        the process locale.
    """
    day = datetime.day
    # English ordinals: 11, 12 and 13 take 'th' even though they end in 1, 2, 3;
    # otherwise the last digit decides (1st, 2nd, 3rd, everything else 'th').
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    # strftime("%A") is used instead of Timestamp.weekday_name, which was
    # deprecated in pandas 0.23 and removed in 1.0; it also works on plain
    # datetime.datetime objects.
    weekday = datetime.strftime("%A")
    month = datetime.strftime("%B")
    return weekday + ', the ' + str(day) + suffix + ' day of ' + month + ', ' + str(datetime.year)

In [13]:
# Build a human-readable string for every value in 'date' via convert_to_human
# and store it as 'humanreadable_date'; %time reports the duration.
%time data['humanreadable_date'] = data['date'].swifter.apply(convert_to_human)

  


CPU times: user 25min 6s, sys: 14.6 s, total: 25min 21s
Wall time: 25min 1s


In [14]:
# Preview the first rows, now including 'date' and 'humanreadable_date'.
data.head()

Unnamed: 0,station_id,bikes_available,docks_available,time,bike_prop,gt_5_bikes,gt_5_bikes_vec,date,humanreadable_date
0,2,2,25,2013/08/29 12:06:01,0.074074,False,False,2013-08-29 12:06:01,"Thursday, the 29th day of August, 2013"
1,2,2,25,2013/08/29 12:07:01,0.074074,False,False,2013-08-29 12:07:01,"Thursday, the 29th day of August, 2013"
2,2,2,25,2013/08/29 12:08:01,0.074074,False,False,2013-08-29 12:08:01,"Thursday, the 29th day of August, 2013"
3,2,2,25,2013/08/29 12:09:01,0.074074,False,False,2013-08-29 12:09:01,"Thursday, the 29th day of August, 2013"
4,2,2,25,2013/08/29 12:10:01,0.074074,False,False,2013-08-29 12:10:01,"Thursday, the 29th day of August, 2013"


# Groupby-apply any function in the fastest available manner

In [3]:
def bikes_proportion(x, max_x):
    """Return ``x`` as a fraction of ``max_x``.

    ``x`` is first multiplied by 1.0 (promoting integers to floating point)
    and the product is then divided by ``max_x``. Works for scalars and for
    any object that supports ``*`` and ``/`` element-wise.

    Parameters
    ----------
    x : value (or array-like) to scale.
    max_x : divisor that ``x`` is measured against.

    Returns
    -------
    ``x * 1.0 / max_x``.
    """
    numerator = x * 1.0
    return numerator / max_x

In [4]:
# Take only the station_id and bikes_available columns and call swifter's
# groupby_apply on 'station_id' with bikes_proportion; max_x is the maximum of
# bikes_available over the whole frame (not per station). %time reports the duration.
%time bikes_available_by_station = data.loc[:, ['station_id', 'bikes_available']].swifter.\
    groupby_apply('station_id', bikes_proportion, max_x=np.max(data['bikes_available']))

CPU times: user 25.3 s, sys: 34.4 s, total: 59.7 s
Wall time: 52.4 s


In [5]:
# Display the result of the groupby_apply call.
bikes_available_by_station

Unnamed: 0_level_0,bikes_available
station_id,Unnamed: 1_level_1
2,0.074074
2,0.333333
2,0.333333
2,0.333333
2,0.333333
2,0.333333
2,0.333333
2,0.333333
2,0.333333
2,0.333333
