In [105]:
# matplotlib is the basic plotting (graphics) library; it is very old and powerful,
# but sometimes also too complicated.
%matplotlib inline 
### This is a Jupyter "magic" command that lets matplotlib add plots (graphics) to the notebook.
### If you want, try `%matplotlib notebook` instead; it adds some interactivity to the plot.

In [106]:
import pylab as plt
### pylab is actually part of matplotlib as well, but this time we import it as an object to work with.

In [107]:
import pandas as pd

## Read Data

In [108]:
# Download any monthly trip file from https://s3.amazonaws.com/tripdata/index.html
# Documentation of the dataset: https://www.citibikenyc.com/system-data

# Reading the file might take a while, especially in the HSU.
trips_csv_path = 'data/201607-citibike-tripdata.csv'
df = pd.read_csv(trips_csv_path)

## Shape

In [10]:
## Shape of the dataset: a (rows, columns) tuple
shape = df.shape
n_rows, n_cols = shape

print(f'Dataset contains {n_rows} rows and {n_cols} columns')

Dataset contains 1380110 rows and 15 columns


In [12]:
## We can't look at hundreds of thousands of rows at once, but let's check the first few
df.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,704,7/1/2016 00:00:02,7/1/2016 00:11:47,459,W 20 St & 11 Ave,40.746745,-74.007756,347,Greenwich St & W Houston St,40.728846,-74.008591,17431,Customer,,0
1,492,7/1/2016 00:00:18,7/1/2016 00:08:31,293,Lafayette St & E 8 St,40.730287,-73.990765,466,W 25 St & 6 Ave,40.743954,-73.991449,24159,Subscriber,1984.0,1
2,191,7/1/2016 00:00:19,7/1/2016 00:03:31,3090,N 8 St & Driggs Ave,40.717746,-73.956001,3107,Bedford Ave & Nassau Ave,40.723117,-73.952123,16345,Subscriber,1986.0,2


In [15]:
## Let us take a look at all columns, each printed next to its positional index
for position in range(len(df.columns)):
    print(position, df.columns[position])

0 tripduration
1 starttime
2 stoptime
3 start station id
4 start station name
5 start station latitude
6 start station longitude
7 end station id
8 end station name
9 end station latitude
10 end station longitude
11 bikeid
12 usertype
13 birth year
14 gender


## Selection

In [62]:
# get one column by its name (single brackets, one label)
df['tripduration'].head(3)

0    704
1    492
2    191
Name: tripduration, dtype: int64

In [65]:
# get a subset of columns
df[['tripduration','birth year']].head(3) # note the double brackets: a list of column names goes inside the indexer

Unnamed: 0,tripduration,birth year
0,704,
1,492,1984.0
2,191,1986.0


In [80]:
# Mask with `loc`: keep the rows whose tripduration exceeds 2,500,000 and return only 'gender'.
very_long_trip = df['tripduration'] > 2_500_000
df.loc[very_long_trip, 'gender']
# You can also mask with simple brackets, but then you get the whole set of columns:
# df[df['tripduration']>2500000]

29751      None
83383      None
304864     None
412429     male
564328     None
717065     None
1083620    None
Name: gender, dtype: object

In [79]:
# Same row mask, this time selecting two columns at once by passing a list of labels.
selected_columns = ['birth year', 'usertype']
df.loc[df['tripduration'] > 2_500_000, selected_columns]

Unnamed: 0,birth year,usertype
29751,,Customer
83383,,Customer
304864,,Customer
412429,1977.0,Subscriber
564328,,Customer
717065,,Customer
1083620,,Customer


In [84]:
# another indexer, iloc, subsets by the position of rows/columns, not by their names
df.iloc[-1].head() # if a single element is passed, it gives one row (the last one, in this case)

tripduration                              330
starttime                  7/31/2016 23:59:59
stoptime                    8/1/2016 00:05:29
start station id                          507
start station name            E 25 St & 2 Ave
start station latitude                 40.739
start station longitude               -73.980
end station id                            433
end station name           E 13 St & Avenue A
end station latitude                   40.730
end station longitude                 -73.981
bikeid                                  24263
usertype                           Subscriber
birth year                           1995.000
gender                                   male
Name: 1380109, dtype: object

In [86]:
# NOTE: a cell displays only the value of its last expression, so the first line below is
# evaluated but its result is not shown.
df.iloc[:, -1] # last column
df.iloc[:4, :2] # first 4 rows and first 2 columns

Unnamed: 0,tripduration,starttime
0,704,7/1/2016 00:00:02
1,492,7/1/2016 00:00:18
2,191,7/1/2016 00:00:19
3,687,7/1/2016 00:00:24


## Replace/clean

In [17]:
# First, let's see a simple distribution of trips by gender.
# It is really annoying to decode 1 and 2 by hand, so let's replace the codes with labels
# (0 becomes None, i.e. a missing value).
#
# The result is assigned back to the column instead of calling `replace(..., inplace=...)`
# on `df['gender']`:
#   * an in-place call on a selected column is not guaranteed to update `df` itself
#     (pandas warns about this pattern and ignores it under Copy-on-Write);
#   * `inplace` must be a real bool -- pandas' argument validation rejects `inplace=1`.
# Without `inplace`, `replace` leaves the original object untouched and returns a new one,
# which is exactly what we store here. Re-running the cell is safe: values that are already
# labels are not in the mapping and are left as they are.
df['gender'] = df['gender'].replace({1: 'male', 2: 'female', 0: None})

## Add new columns

In [104]:
# It is easy to add a new column: here the same string is assigned to every row.
df['new_col'] = 'New_column'
## However, it is very inefficient to add new rows!

## Count

In [18]:
# now, let's count how many trips there are for each gender value
df['gender'].value_counts()

male      876368
female    297078
Name: gender, dtype: int64

In [23]:
df['gender'].value_counts(normalize=True) # normalize=True gives fractions of the total instead of raw counts

male      0.746833
female    0.253167
Name: gender, dtype: float64

So, more than 74% of all trips with a recorded gender in July 2016 were made (or at least paid for) by men.

## Stats

how can we get some statistics on the column?


In [41]:
# 1) overall description
# Show floats with three decimals, which suppresses scientific notation in printed output.
pd.set_option('display.float_format', '{:.3f}'.format)
duration_summary = df['tripduration'].describe()
print(duration_summary)

count   1380110.000
mean       1000.328
std       13734.618
min          61.000
25%         403.000
50%         672.000
75%        1143.000
max     6707533.000
Name: tripduration, dtype: float64


In [45]:
# Note that the result of `df['tripduration'].describe()` is itself a pandas.Series, so you can store it!
# But what if we want one particular value? Each statistic has its own method.
durations = df['tripduration']

count = durations.count()             # like len(df), but ignores Nones (empty cells)
mean = durations.mean()               # average trip duration
median = durations.median()
min_duration = durations.min()
max_duration = durations.max()
std = durations.std()                 # standard deviation
percentile = durations.quantile(.25)  # quantile for the given fraction (the first quartile here)

## Powerful Groupby

In [24]:
# But sometimes we want to count combinations of two (or more) parameters.
# In this case, we can use groupby.

In [26]:
df.groupby(['usertype', 'gender']).agg({'bikeid':'count'}) 
# It is a little bit confusing, but 'bikeid' is only a placeholder here: 'count' tallies the
# non-empty cells of the chosen column within each group. Any other column that has no empty
# cells (and is not one of the 2 used to group) gives the same numbers; a column with
# missing values, such as 'birth year', would give smaller counts.

Unnamed: 0_level_0,Unnamed: 1_level_0,bikeid
usertype,gender,Unnamed: 2_level_1
Customer,female,1080
Customer,male,1533
Subscriber,female,295998
Subscriber,male,874835


In [27]:
# As you can see, the distribution here is very different for the 2 user types.
# We can guess that many more men were PLANNING to use Citi Bike, or were using it frequently enough to buy a subscription.

In [28]:
# How can we calculate the ratios manually?
# Divide each of the 4 group counts by the total number of records:
group_counts = df.groupby(['usertype', 'gender']).agg({'bikeid':'count'})
group_counts / len(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,bikeid
usertype,gender,Unnamed: 2_level_1
Customer,female,0.000783
Customer,male,0.001111
Subscriber,female,0.214474
Subscriber,male,0.633888


In [29]:
# Or normalize by the sum of the aggregated values themselves; this does not rely on
# len(df), so it also works when the aggregate is something other than a count.
x = df.groupby(['usertype', 'gender']).agg({'bikeid':'count'})
x/x.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,bikeid
usertype,gender,Unnamed: 2_level_1
Customer,female,0.00092
Customer,male,0.001306
Subscriber,female,0.252247
Subscriber,male,0.745526


In [30]:
# Why is it different? Because rows with a None gender are not counted in the last one!

In [34]:
# The same approach can be used to create a network of stations:
# count the trips for every (start station, end station) pair.
# `sort_index()` returns the sorted frame, so it is chained and the result assigned,
# rather than passing `inplace=1` (`inplace` has to be a real bool; pandas validates it).
network = (
    df.groupby(['start station name', 'end station name'])
      .agg({'bikeid': 'count'})
      .sort_index()
)

In [35]:
# preview the first 10 (start station, end station) pairs
network.head(10) # we will work on that later on!

Unnamed: 0_level_0,Unnamed: 1_level_0,bikeid
start station name,end station name,Unnamed: 2_level_1
1 Ave & E 16 St,1 Ave & E 16 St,73
1 Ave & E 16 St,1 Ave & E 18 St,103
1 Ave & E 16 St,1 Ave & E 30 St,183
1 Ave & E 16 St,1 Ave & E 44 St,51
1 Ave & E 16 St,1 Ave & E 62 St,19
1 Ave & E 16 St,1 Ave & E 68 St,48
1 Ave & E 16 St,1 Ave & E 78 St,13
1 Ave & E 16 St,11 Ave & W 27 St,10
1 Ave & E 16 St,11 Ave & W 41 St,2
1 Ave & E 16 St,11 Ave & W 59 St,1


In [83]:
# By the way, I am really annoyed by this column name. Let's change it.
network.columns = ['trips'] # a DataFrame may have multiple columns, thus we have to pass a list even for a single name
network.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,trips
start station name,end station name,Unnamed: 2_level_1
1 Ave & E 16 St,1 Ave & E 16 St,73
1 Ave & E 16 St,1 Ave & E 18 St,103
1 Ave & E 16 St,1 Ave & E 30 St,183
1 Ave & E 16 St,1 Ave & E 44 St,51
1 Ave & E 16 St,1 Ave & E 62 St,19
1 Ave & E 16 St,1 Ave & E 68 St,48
1 Ave & E 16 St,1 Ave & E 78 St,13
1 Ave & E 16 St,11 Ave & W 27 St,10
1 Ave & E 16 St,11 Ave & W 41 St,2
1 Ave & E 16 St,11 Ave & W 59 St,1


In [49]:
# Now, groupby does not limit us to boring counts!
# Let's get some stats for the subgroups.

In [55]:
# Trip-duration statistics for every (usertype, gender) subgroup.
# Each string in the list must name an aggregation pandas knows; an unknown name makes
# `agg` raise instead of producing the table.
df.groupby(['usertype', 'gender']).agg({'tripduration': ['mean', 'median', 'min', 'max', 'std']})

Unnamed: 0_level_0,Unnamed: 1_level_0,tripduration,tripduration,tripduration,tripduration,tripduration
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,min,max,std
usertype,gender,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Customer,female,1915.26,1334,64,196570,7086.336
Customer,male,1464.199,1229,65,42945,1895.461
Subscriber,female,909.563,683,61,1030505,4290.698
Subscriber,male,799.261,582,61,5700729,8038.132


In [56]:
## Note that Subscribers usually ride for a much shorter time, but the longest trip (Max) also belongs to them

In [61]:
## we can also add "custom" functions - for example, wrapped around the percentile
def last_quartile(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` must provide a ``quantile`` method. The function is meant to be
    passed to ``agg`` as a custom aggregation.
    """
    upper_quartile_fraction = .75
    return x.quantile(upper_quartile_fraction)

# The custom function is passed as an object (not as a string) next to the built-in names;
# its result appears in a column labelled with the function's name, 'last_quartile'.
df.groupby(['usertype', 'gender']).agg({'tripduration':['mean','median','min','max', 'std', last_quartile]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tripduration,tripduration,tripduration,tripduration,tripduration,tripduration
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,min,max,std,last_quartile
usertype,gender,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Customer,female,1915.26,1334,64,196570,7086.336,1720.25
Customer,male,1464.199,1229,65,42945,1895.461,1650.0
Subscriber,female,909.563,683,61,1030505,4290.698,1102.0
Subscriber,male,799.261,582,61,5700729,8038.132,959.0


Note that women ride for a LONGER period of time than men. Does it mean they ride slower, or over longer distances?

## Unique

In [109]:
# Let's now get a clean set of all stations and their lat/lon pairs.

In [110]:
# Columns that describe the start station of a trip: its name and coordinates.
cols = ['start station name', 'start station latitude', 'start station longitude']
# One row per trip for now; only these three columns are kept.
stations = df[cols]

In [112]:
# Drop the repeated first part of each column name: keep only the text after the last space.
stations.columns = [label.rsplit(' ', 1)[-1] for label in stations.columns]

In [113]:
# Now, let's dedupe our table: rows that are exact duplicates of an earlier row are dropped.
stations = stations.drop_duplicates()

In [114]:
# (rows, columns) of the table that `stations` currently holds
stations.shape

(483, 3)