
# Data Preparation

In [1]:
def download_logfiles():
    url = 'http://web.mta.info/developers/'
    import requests
    from bs4 import BeautifulSoup
    from bs4 import SoupStrainer
    r = requests.get(url+'turnstile.html')
    soup = BeautifulSoup(r.text, parse_only=SoupStrainer('a', href=True))
    log_links = [ url + link['href'] for link in soup.find_all('a')]
    log_links = [log_link for log_link in log_links if log_link.find('.txt') >=0 or log_link.find('.xls') >= 0]
    for log_link in log_links:
        download_file(log_link)

In [2]:
def download_file(url):
    import urllib
    filename = url.split('/')[-1]
    urllib.urlretrieve (url, filename)
    print 'Successful downloaded ', filename

In [3]:
#download_logfiles()

In [None]:
'''
def download_file(url):
    import urllib
    filename = url.split('/')[-1]
    urllib.urlretrieve (url, filename)
    print 'Successful downloaded ', filename

def download_logfiles():
    url = 'http://web.mta.info/developers/'
    import requests
    from bs4 import BeautifulSoup
    from bs4 import SoupStrainer
    r = requests.get(url+'turnstile.html')
    soup = BeautifulSoup(r.text, parse_only=SoupStrainer('a', href=True))
    log_links = [ url + link['href'] for link in soup.find_all('a')]
    log_links = [log_link for log_link in log_links if log_link.find('.txt') >=0 or log_link.find('.xls') >= 0]
    for log_link in log_links:
        download_file(log_link)
# download_logfiles()

def split_log_lines(input_filename, output_filename):
    fp = open(input_filename, 'r')
    fp_tar = open(output_filename, 'a')
    for line in fp:
        dat = line.strip().split(',')
        for i in range(3,len(dat),5):
            fp_tar.write(','.join(dat[0:3]+dat[i:i+2]+dat[i+3:i+5]) + '\n')
    fp.close()
    fp_tar.close()
'''  

In [4]:
import os

def split_log_lines(input_filename, raw_data_list):
    fp = open(input_filename, 'r')
    for line in fp:
        dat = line.strip().split(',')
        for i in range(3, len(dat), 5):
            date_temp = dat[i].split('-')
            dat[i] = '20' + '-'.join(date_temp[2: 3] + date_temp[0: 2])
            raw_data_list.append(','.join(dat[0:3] + dat[i:i + 2] + dat[i + 3:i + 5]))
    fp.close()
raw_data_list = []
logs_y13 = [filename for filename in os.listdir(
    './') if filename.find('turnstile_13') >= 0 and filename.find('.txt') >= 0]
for filename in logs_y13:
    split_log_lines(filename, raw_data_list)
split_log_lines('turnstile_140104.txt', raw_data_list)
raw_data_list.sort()

In [5]:
import numpy as np
import bisect
import datetime
station_name = {}
fp = open("Remote-Booth-Station.csv", 'r')
for line in fp:
    line = line.strip().split(',')
    station_name['%s_%s' % (line[1], line[0])] = line[2]
fp.close()


class turnstile:
    days_per_year = 365
    blocks_per_day = 6
    nan = False
    __overall_nanPercentage__ = 0.5  # less than 50% valid log, this turnstile will be ignored in further calculation
    __oneblock_nanPercentage__ = 0.2  # less than 20% valid log in a time block, the NaN days will be given value 0
    __max_possible__ = 50000  # max possible passengers to exit/enter this turnstile
    __max_downtime__ = 86400  # missing value longer then two days, won't do interpolations
    __data_exception_times__ = 10.0  # if one value is 10 times larger than its average value, abandon this data
    def __init__(self, line_list, year):
        __temp_line1__ = line_list[0].split(',')
        self.year = year
        self.days_per_year = (datetime.datetime(year+1, 1, 1)-datetime.datetime(year,1,1)).days
        self.CA = __temp_line1__[0]
        self.UNIT = __temp_line1__[1]
        self.SCP = __temp_line1__[2]
        self.ENTRY = self.get_raw_entry(line_list)
        self.EXIT = self.get_raw_exit(line_list)
        self.entry_sum = np.sum(self.ENTRY)
        self.exit_sum = np.sum(self.EXIT)
        self.busyness = self.entry_sum + self.exit_sum
        self.station_name = self.get_station_name()

    def get_station_name(self):
        station_code = self.CA + '_' + self.UNIT
        if station_code in station_name:
            return station_name[station_code]
        else:
            return "Unknown"

    def get_cumu_list(self, raw_list):
        list_cum = np.zeros(self.days_per_year * self.blocks_per_day + 1, dtype=float)
        list_cum.fill(np.NaN)
        init_time = '%d-01-01 00:00:00' % self.year
        end_time = '%d-01-01 00:00:00' % (self.year + 1)
        temp_time = datetime.datetime.strptime(init_time, '%Y-%m-%d %H:%M:%S')
        count = 0
        while str(temp_time) <= end_time:
            neigh = bisect.bisect(raw_list, str(temp_time))
            inter_val = self.GetInterpT_V(raw_list[neigh - 1: neigh + 1], temp_time)
            list_cum[count] = inter_val
            count += 1
            temp_time += datetime.timedelta(hours=4)
        return self.get_delta_list(list_cum)

    def GetInterpT_V(self, timeab, temp_time):
        # timeab = ["time1val1", "time2val2"]
        if len(timeab) < 2:
            return np.NaN
        time2, value2 = timeab[1].split(',')
        if str(temp_time) == time2:
            return int(value2)
        time1, value1 = timeab[0].split(',')
        if str(temp_time) == time1:
            return int(value1)
        if value1 == value2:
            return int(value1)
        value1, value2 = int(value1), int(value2)
        if value2 < value1:
            return np.NaN
        time1 = datetime.datetime.strptime(time1, '%Y-%m-%d %H:%M:%S')
        time2 = datetime.datetime.strptime(time2, '%Y-%m-%d %H:%M:%S')
        time1 = (temp_time - time1).total_seconds()
        time2 = (time2 - temp_time).total_seconds()
        if time1 > self.__max_downtime__ or time2 > self.__max_downtime__:
            return np.NaN
        inter_value = value1 + (value2-value1) * float(time1) / (time1+time2)
        # two-point linear interpolation
        return int(inter_value)

    def get_delta_list(self, cum_list):
        delta_list = np.ndarray((self.days_per_year, self.blocks_per_day), dtype=float)
        delta_list.fill(np.NaN)
        nan_count = 0
        for i in xrange(self.days_per_year):
            for j in xrange(self.blocks_per_day):
                index_in_cum = i*self.blocks_per_day+j
                if np.isnan(cum_list[index_in_cum]) or np.isnan(cum_list[index_in_cum+1]):
                    delta_list[i, j] = np.NaN
                    nan_count += 1
                else:
                    delta = cum_list[index_in_cum+1] - cum_list[index_in_cum]
                    if delta < 0 or delta > self.__max_possible__:
                        delta_list[i, j] = np.NaN
                        nan_count += 1
                    else:
                        delta_list[i, j] = delta
        self.down_blocks = nan_count
        if float(nan_count) / (self.days_per_year * self.blocks_per_day) > self.__overall_nanPercentage__:
            self.nan = True
        for j in xrange(self.blocks_per_day):
            valid_values = [x for x in delta_list[:, j] if not np.isnan(x)]
            if len(valid_values) == 0:
                    delta_list[:, j] = 0
                    continue
            col_mean = np.mean(valid_values)
            for i in xrange(self.days_per_year):
                if np.isnan(delta_list[i, j]):
                    if float(len(valid_values)) / self.days_per_year < self.__oneblock_nanPercentage__:
                        delta_list[i, j] = 0
                    else:
                        delta_list[i, j] = col_mean
                if delta_list[i, j] > col_mean * self.__data_exception_times__:
                    valid_len = len(valid_values) - 1
                    delta_list[i, j] = (col_mean*(valid_len+1) - delta_list[i, j])/valid_len
        delta_list_int = np.ndarray((self.days_per_year, self.blocks_per_day), dtype=int)
        for i in xrange(self.days_per_year):
            for j in xrange(self.blocks_per_day):
                delta_list_int[i][j] = int(delta_list[i][j])
        return delta_list_int

    def get_raw_entry(self, line_list):
        raw_entry_list = []
        for logs in line_list:
            log = logs.split(',')
            raw_entry_list.append('%s %s,%s' % (log[3], log[4], log[5]))
        return self.get_cumu_list(raw_entry_list)

    def get_raw_exit(self, line_list):
        raw_exit_list = []
        for logs in line_list:
            log = logs.split(',')
            raw_exit_list.append('%s %s,%s' % (log[3], log[4], log[6]))
        return self.get_cumu_list(raw_exit_list)
    
    def __str__(self):
        return '%s_%s_%s %d %s' % (self.CA, self.UNIT, self.SCP, self.business, self.station_name)

In [6]:
turns = {}
curr_turn = raw_data_list[0].split(',')
turn_start_index = 0
for i in xrange(len(raw_data_list)):
    line = raw_data_list[i].split(',')
    if line[0] != curr_turn[0] or line[1] != curr_turn[1] or line[2] != curr_turn[2]:
        turns['_'.join(curr_turn[0:3])] = turnstile(raw_data_list[turn_start_index: i], 2013)
        turn_start_index = i
        #if line[0] != curr_turn[0]:
        #    print '{percent:.2%}'.format(percent=float(i)/len(raw_data_list)) + ' logs finished'
        curr_turn = line

In [7]:
print len(turns)

4618


The dictionary 'turns' includes the turnstile objects for all the recorded turntiles in 2013. The turnstile(.) is the constructor for a turnstile object. Each turnstile object contains the cleaned exits/entries 4 hour counts. The EXIT/ENTRY are the 365*6 2D array, for the 0am, 4am, 8am, 12pm, 16pm, 20pm points of each day. (e.g. a value in 0am gives the counts from 0am to 4am.) Before finding the counts in each time interval, linear interpolation is applied to impute the missing data of the data stream.

# Data Analysis

## 1.What is the total number of entries & exits across the subway system for August 1, 2013?

In [8]:
days0801 = (datetime.datetime.strptime("2013-08-01 12:00:00", '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime("2013-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')).days
print sum([sum(turns[i].ENTRY[days0801]) for i in turns])
print sum([sum(turns[i].EXIT[days0801]) for i in turns])


5595593
4437761


The total number of entries across the subway system for August 1, 2013 is 5595593, whereas the total number of exits across the subway system for August 1, 2013 is 4437761.

## 2.Let’s define the busy-ness as sum of entry & exit count. What station was the busiest on August 1, 2013? What turnstile was the busiest on that date?

In [9]:
a = [(sum(turns[i].ENTRY[days0801])+sum(turns[i].EXIT[days0801]), turns[i].station_name, i) for i in turns]
a.sort()
a[-5:] 

[(10493, 'W 4 ST-WASH SQ', 'N083_R138_01-00-00'),
 (10566, '42 ST-GRD CNTRL', 'R238_R046_00-00-01'),
 (11078, '42 ST-GRD CNTRL', 'R240_R047_00-00-00'),
 (11523, '86 ST', 'R249_R179_01-00-09'),
 (11845, '42 ST-PA BUS TE', 'N063A_R011_00-00-00')]

The busiest turnstile on August 1, 2013 is '42 ST-PA BUS TE', 'N063A_R011_00-00-00'.

In [10]:
import operator
b = {}
for x in turns:
    if turns[x].station_name in b:
        b[turns[x].station_name] += turns[x].busyness
    else:
        b[turns[x].station_name] = turns[x].busyness
sorted(b.items(), key=operator.itemgetter(1))[-5:]

[('86 ST', 64744051),
 ('14 ST-UNION SQ', 68162945),
 ('34 ST-HERALD SQ', 73900262),
 ('42 ST-GRD CNTRL', 87805999),
 ('34 ST-PENN STA', 101597301)]

The busiest station on August 1, 2013 is '34 ST-PENN STA'.

## 3. What were the busiest and least-busy stations in the system over all of July 2013?

In [11]:
days0701 = (datetime.datetime.strptime("2013-07-01 12:00:00", '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime("2013-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')).days
days0801 = (datetime.datetime.strptime("2013-08-01 12:00:00", '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime("2013-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')).days
import operator
c = {}
for x in turns:
    summation = np.sum(turns[x].ENTRY[days0701:days0801]) + np.sum(turns[x].EXIT[days0701:days0801])
    if turns[x].station_name in c:
        c[turns[x].station_name] += summation
    else:
        c[turns[x].station_name] = summation
SortResult=sorted(c.items(), key=operator.itemgetter(1))
SortResult[-5:] 

[('14 ST-UNION SQ', 5555751),
 ('42 ST-TIMES SQ', 5591576),
 ('34 ST-HERALD SQ', 6198209),
 ('42 ST-GRD CNTRL', 7456885),
 ('34 ST-PENN STA', 8596114)]

Above are the busiest stations in the system over all of July 2013 the corresponding entry/exit total counts. Below are the five least-busy stations in the system over all of July 2013 the corresponding entry/exit total counts.

In [12]:
SortResult[:5] 

[('LGA AIRPORT CTB', 0),
 ('AQUEDUCT TRACK', 265),
 ('BROAD CHANNEL', 8391),
 ('ORCHARD BEACH', 16763),
 ('TOMPKINSVILLE', 18357)]

The following shows the first 10 'C/A'-'UNIT' pairs (appeared in the raw_data_list) that are not found in the Remote-Booth-Station.csv.  Remove '[:10]' part for the whole list.

In [13]:
[turns[x].UNIT+'_'+turns[x].CA for x in turns if turns[x].station_name=='Unknown'][:10]

['R028_A082',
 'R028_A082',
 'R202_N330',
 'R202_N330',
 'R168_R169',
 'R202_N330',
 'R202_N330',
 'R202_N330',
 'R001_R101',
 'R001_R101']

## 4. Which station had the highest average number of entries between midnight & 4am on Fridays in July 2013?

In [14]:
fridays = [i for i in range(1,32) if datetime.datetime(2013,7,i).isoweekday()==5]
fridays = [(datetime.datetime(2013,7,i)-datetime.datetime(2013,1,1)).days for i in fridays]
fridays

[185, 192, 199, 206]

In [15]:
import operator
d = {}
for x in turns:
    summation = np.sum([np.sum(turns[x].ENTRY[i,0]) for i in fridays])
    if turns[x].station_name in d:
        d[turns[x].station_name] += summation
    else:
        d[turns[x].station_name] = summation
sorted(d.items(), key=operator.itemgetter(1))[-5:]

[('W 4 ST-WASH SQ', 13250),
 ('34 ST-PENN STA', 19356),
 ('42 ST-PA BUS TE', 21553),
 ('14 ST-UNION SQ', 23022),
 ('42 ST-TIMES SQ', 23885)]

The '42 ST-TIMES SQ' station had the highest average number of entries between midnight & 4am on Fridays in July 2013, which is 23885.

## 5. What stations have seen the most usage growth/decline in the last year?

In [16]:
def split_log_lines2(input_filename, raw_data_list):
    fp = open(input_filename, 'r')
    for line in fp:
        dat = line.strip().split(',')
        if len(dat)==11:
            date_temp = dat[6].split('/')
            dat[6] = '-'.join(date_temp[2:] + date_temp[0: 2])
            if dat[10] != 'EXITS':
                raw_data_list.append(','.join(dat[0:3] + dat[6:8] + dat[9:11]))
        else:
            for i in range(3, len(dat), 5):
                date_temp = dat[i].split('-')
                dat[i] = '20' + '-'.join(date_temp[2: 3] + date_temp[0: 2])
                raw_data_list.append(','.join(dat[0:3] + dat[i:i + 2] + dat[i + 3:i + 5]))
    fp.close()
def raw_data_list_f(file_head,addi_file_name):
    raw_data_list = []
    logs_y = [filename for filename in os.listdir(
        './') if filename.find(file_head) >= 0 and filename.find('.txt') >= 0]
    for filename in logs_y:
        split_log_lines2(filename, raw_data_list)
    split_log_lines2(addi_file_name, raw_data_list)
    return raw_data_list
def turns_f(raw_data_list,year):
    turns = {}
    curr_turn = raw_data_list[0].split(',')
    turn_start_index = 0
    for i in xrange(len(raw_data_list)):
        line = raw_data_list[i].split(',')
        if line[0] != curr_turn[0] or line[1] != curr_turn[1] or line[2] != curr_turn[2]:
            turns['_'.join(curr_turn[0:3])] = turnstile(raw_data_list[turn_start_index: i], year)
            turn_start_index = i
            #if line[0] != curr_turn[0]:
            #    print '{percent:.2%}'.format(percent=float(i)/len(raw_data_list)) + ' logs finished'
            curr_turn = line
    return turns

In [17]:
raw_data_list14=raw_data_list_f('turnstile_14','turnstile_150103.txt')
raw_data_list14.sort()

There's a bad record in the raw data files and here we fix it by popping the bad data out of the raw_data_list14

In [18]:
count=0
for i in raw_data_list14:
    if len(i.split(','))<7:
        print i
        print count
        break
    count=count+1

N329A,R201,01-06-00,20REGUL
4701943


In [19]:
raw_data_list14[4701943:4701947]

['N329A,R201,01-06-00,20REGUL',
 'N329A,R201,01-06-00,20WOODHAVEN BLVD,MR,12/02/2014,16:00:00',
 'N329A,R201,01-06-01,2013-12-28,00:00:00,005611151,002672493',
 'N329A,R201,01-06-01,2013-12-28,04:00:00,005611157,002672538']

In [20]:
raw_data_list14.pop(4701943)
raw_data_list14.pop(4701943)

'N329A,R201,01-06-00,20WOODHAVEN BLVD,MR,12/02/2014,16:00:00'

In [21]:
turns14=turns_f(raw_data_list14,2014)

In [22]:
raw_data_list15=raw_data_list_f('turnstile_15','turnstile_160102.txt')
raw_data_list15.sort()

In [23]:
turns15=turns_f(raw_data_list15,2015)

In [24]:
import pandas as pd
def turns_year_count(turns):
    c = {}
    for x in turns:
        summation = turns[x].busyness
        if turns[x].station_name in c:
            c[turns[x].station_name] += summation
        else:
            c[turns[x].station_name] = summation
    return pd.DataFrame(c.items(),columns=('Station','AnnualCounts'))

In [25]:
Dic14=turns_year_count(turns14)
Dic15=turns_year_count(turns15)
Dic1415=pd.merge(Dic14,Dic15,left_on=['Station'], right_on=['Station'], how='inner')
Dic1415['OffSet']=Dic1415['AnnualCounts_y']-Dic1415['AnnualCounts_x']
Dic1415.columns=('Station','AnnualCounts2014','AnnualCounts2015','OffSet')

The Station that have seen the most usage growth in the last year:

In [26]:
Dic1415[Dic1415['OffSet']==max(Dic1415['OffSet'])]

Unnamed: 0,Station,AnnualCounts2014,AnnualCounts2015,OffSet
214,WHITEHALL ST,4682873,9377889,4695016


The Station that have seen the most usage decline in the last year:

In [27]:
Dic1415[Dic1415['OffSet']==min(Dic1415['OffSet'])]

Unnamed: 0,Station,AnnualCounts2014,AnnualCounts2015,OffSet
5,57 ST-7 AVE,15030482,10775813,-4254669


## 6.1 What dates are the least busy?

In [28]:
DayTotal=[0]*365
for x in turns15:
    a=[np.sum(turns15[x].ENTRY[i,:]) for i in range(365)]
    day_count = range(1,366)
    DayTotal=map(lambda x,y:x+y,DayTotal,a)

In [29]:
DayTotal = sorted(zip(range(1,365),DayTotal), key=lambda x:x[1])
DayTotal[:5]

[(27, 1365488), (359, 1991703), (18, 2203244), (46, 2393195), (330, 2400279)]

In [30]:
print (datetime.datetime(2015,1,1)+datetime.timedelta(days=DayTotal[0][0]-1)).date()

2015-01-27


2015-01-27 is the least busy. The other four dates are:

In [32]:
for i in range(1,5,1):
    print (datetime.datetime(2015,1,1)+datetime.timedelta(days=DayTotal[i][0]-1)).date()

2015-12-25
2015-01-18
2015-02-15
2015-11-26


## 6.2 Could you identify days on which stations were not operating at full capacity or closed entirely?

Here I build a class for the Station objects. Note that I defined a station a low capacity one if only 50% of its tenstiles are in use.

In [33]:
class Station:
    __low_capacity_limit__ = 0.5
    
    def __init__(self, station_name):
        self.name = station_name
        self.turn_num_year = {}
        self.turn_name_year = {}
        self.turn_day_year = {}
        self.low_capacity_days = {}
        self.closed_days = {}
        self.turnstiles = []

    def add_turn(self, turn):
        curr_turn_name = '%s_%s_%s' % (turn.CA, turn.UNIT, turn.SCP)
        if turn.year in self.turn_num_year:
            self.turn_num_year[turn.year] += 1
        else:
            self.turn_num_year[turn.year] = 1
        if turn.year in self.turn_name_year:
            self.turn_name_year[turn.year].append(curr_turn_name)
        else:
            self.turn_name_year[turn.year] = [curr_turn_name]
        if curr_turn_name not in self.turnstiles:
            self.turnstiles.append(curr_turn_name)
        if turn.year not in self.turn_day_year:
            self.turn_day_year[turn.year] = [0]*turn.days_per_year
        for i in xrange(turn.days_per_year):
            if np.sum(turn.EXIT[i]) + np.sum(turn.ENTRY[i]) > 0:
                self.turn_day_year[turn.year][i] += 1
    
    def get_closed_days(self, year):
        closed_days = []
        if year not in self.turn_day_year:
            self.closed_days[year] = ['NaN']
            return ['Closed during the year %d' % year]
        for i in xrange(len(self.turn_day_year[year])):
            if self.turn_day_year[year][i] == 0:
                closed_days.append(self.get_date(year, i))
        self.closed_days[year] = closed_days
        return closed_days
    
    def get_low_capacity_days(self, year):
        low_capacity_days = []
        if year not in self.turn_day_year:
            self.low_capacity_days[year] = ['NaN']
            return ['Closed during the year %d' % year]
        for i in xrange(len(self.turn_day_year[year])):
            if self.turn_day_year[year][i] < self.turn_num_year[year] * self.__low_capacity_limit__:
                low_capacity_days.append(self.get_date(year, i))
        self.low_capacity_days[year] = low_capacity_days
        return low_capacity_days
    
    def get_date(self, year, i):
        return str((datetime.datetime(year,1,1)+datetime.timedelta(days=i)).date())   


In [34]:
station_all = {}
for x in station_name:
    if station_name[x] not in station_all:
        station_all[station_name[x]] = Station(station_name[x])
def parse_turns(turns, station_all):
    for x in turns:
        if turns[x].station_name != 'Unknown':
            station_all[turns[x].station_name].add_turn(turns[x])
parse_turns(turns, station_all)
parse_turns(turns14, station_all)
parse_turns(turns15, station_all)
for x in station_all:
    for year in xrange(2013, 2016):
        station_all[x].get_low_capacity_days(year)
        station_all[x].get_closed_days(year)

Here I built the dictionary of all the station objects with their station names as the keys. The station objects are instantiated in this step and one can readily look up the close_days/low_capacity_days information by supplying the station name.

In [35]:
print station_all['CENTRAL AVE'].closed_days
print station_all['CENTRAL AVE'].low_capacity_days

{2013: ['2013-10-20', '2013-12-08'], 2014: [], 2015: []}
{2013: ['2013-08-25', '2013-09-22', '2013-10-20', '2013-11-03', '2013-11-23', '2013-11-24', '2013-12-08'], 2014: ['2014-05-25'], 2015: []}


Above is an example of looking up the closed_days and low_capacity_days information (for 'CENTRAL AVE') from the station_all dictionary. It is found 'CENTRAL AVE' station is closed in '2013-10-20' and '2013-12-08', and is of low capacity on '2013-08-25', '2013-09-22', '2013-10-20', '2013-11-03', '2013-11-23', '2013-11-24', '2013-12-08' and '2014-05-25'.