In [1]:
import json
import os
import sqlite3
import time
import urllib.request
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from scipy import spatial

In [2]:
# Load the weekly traffic data for our top five stations.
# The CSV is read from the notebook's working directory.
top_five = pd.read_csv('top_5_stations.csv')
top_five.head()

Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT
0,2,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,355961.0,25.900901
1,3,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,339523.0,25.207081
2,4,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,334801.0,26.566511
3,5,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,282419.0,21.289464
4,6,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,370934.0,26.986773


In [3]:
# Keep only week 30 and order the stations from busiest to quietest.
top_five = (
    top_five.loc[top_five["WEEK"].eq(30)]
    .sort_values("TRAFFIC", ascending=False)
    .reset_index(drop=True)
)
top_five.head()


Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT
0,30,GRD_CNTRL_42_ST_Line_4567S_2021,623895.0,36.435314
1,30,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,540967.0,38.07873
2,30,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021,414967.0,41.140016
3,30,34_ST_PENN_STA_Line_ACE_2021,390287.0,41.203764
4,30,PATH_NEW_WTC_Line_1_2021,381333.0,52.587428


In [4]:
# Traffic rank: the frame is already sorted by TRAFFIC (descending),
# so the rank is simply the row position counted from 1.
top_five = top_five.assign(T_RANK=top_five.index + 1)
top_five.head()

Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT,T_RANK
0,30,GRD_CNTRL_42_ST_Line_4567S_2021,623895.0,36.435314,1
1,30,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,540967.0,38.07873,2
2,30,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021,414967.0,41.140016,3
3,30,34_ST_PENN_STA_Line_ACE_2021,390287.0,41.203764,4
4,30,PATH_NEW_WTC_Line_1_2021,381333.0,52.587428,5


In [5]:
# Build a frame of bare station names:
#   1. cut the trailing five characters (the "_2021" suffix) and de-duplicate,
#   2. keep only the part of the name before "_Line".
stations = top_five["STATION_FULL"].str.slice(stop=-5).unique()
top_five_loc = pd.DataFrame({"STATION": stations})
top_five_loc["STATION"] = top_five_loc["STATION"].str.split("_Line").str.get(0)

top_five_loc.head()

Unnamed: 0,STATION
0,GRD_CNTRL_42_ST
1,34_ST_HERALD_SQ
2,42_ST_PORT_AUTH
3,34_ST_PENN_STA
4,PATH_NEW_WTC


In [6]:
# Change station names so the Google API can match them.
# The overrides are keyed by station name rather than row position, so they
# keep targeting the right station if the traffic ranking changes.
GEOCODE_NAME_OVERRIDES = {
    "GRD_CNTRL_42_ST": "GRAND_CENTRAL",
    "42_ST_PORT_AUTH": "42_ST_PORT_AUTHORITY",
    "34_ST_PENN_STA": "PENN_STATION_ACE",
}

top_five_loc["STATION"] = top_five_loc["STATION"].replace(GEOCODE_NAME_OVERRIDES)
top_five_loc.head()


Unnamed: 0,STATION
0,GRAND_CENTRAL
1,34_ST_HERALD_SQ
2,42_ST_PORT_AUTHORITY
3,PENN_STATION_ACE
4,PATH_NEW_WTC


In [7]:
# Build the pieces of the geocoding request: one query string per station
# plus the URL prefix and suffix.

# The key is read from the environment so it is never stored in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )

# "_" is replaced literally (regex=False) so the query words are joined by "+".
places = top_five_loc["STATION"].str.replace("_", "+", regex=False) + "+mta+subway+NY+NY"
req1 = "https://maps.googleapis.com/maps/api/geocode/json?address="
req2 = "&key=" + API_KEY

In [8]:
#function for getting coordinates for the stations from google api

def get_loc(places, req1, req2):
    """Geocode each place with the Google Geocoding API.

    Parameters
    ----------
    places : iterable of str
        Query strings already formatted for the URL (words joined by ``+``).
    req1 : str
        URL prefix, up to and including ``address=``.
    req2 : str
        URL suffix appended after the place (carries the ``&key=...`` part).

    Returns
    -------
    dict
        Maps every successfully geocoded place to ``[lat, lng]``, taken from
        the first result of the response. Places whose lookup fails are left
        out of the dict and reported through ``warnings.warn``.
    """
    locs = {}

    for place in places:
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            r = requests.get(req1 + place + req2, timeout=10)
            r.raise_for_status()
            location = r.json()['results'][0]['geometry']['location']
            locs[place] = [location['lat'], location['lng']]
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
            # Only the exception type is reported: the exception text can
            # contain the request URL, which includes the API key.
            warnings.warn(
                "Geocoding failed for %r (%s); place skipped." % (place, type(err).__name__),
                stacklevel=2,
            )

    return locs

In [9]:
# Look up latitude/longitude for each of the top five stations.
loc = get_loc(places=places, req1=req1, req2=req2)

In [10]:
# Convert the {place: [lat, lng]} dictionary into a frame with one row per
# geocoded place and columns STATION, LAT, LONG.
loc_five = (
    pd.DataFrame.from_dict(loc, orient="index", columns=["LAT", "LONG"])
    .rename_axis("STATION")
    .reset_index()
)

# Change the names back to match the traffic data.
# `places` was built row-by-row from the unique STATION_FULL values, so the two
# line up one-to-one. Mapping by place (instead of overwriting the column by
# position) keeps the labels correct even if a lookup was skipped by get_loc.
station_full_names = top_five["STATION_FULL"].unique()
assert len(places) == len(station_full_names), "places and station names are out of sync"
place_to_station = dict(zip(places, station_full_names))
loc_five["STATION"] = loc_five["STATION"].map(place_to_station)

loc_five.head()

Unnamed: 0,STATION,LAT,LONG
0,GRD_CNTRL_42_ST_Line_4567S_2021,40.752469,-73.977487
1,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,40.750087,-73.988127
2,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021,40.756757,-73.990437
3,34_ST_PENN_STA_Line_ACE_2021,40.751992,-73.993323
4,PATH_NEW_WTC_Line_1_2021,40.711574,-74.011449


In [11]:
#function for getting starbucks locations for stations of interest

def _lookup(mapping, *keys, default='none'):
    """Walk nested keys in ``mapping``; return ``default`` if any level is missing."""
    current = mapping
    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return default
    return current


def _fetch_all_pages(endpoint_url, pause=2, timeout=10):
    """Return the 'results' entries from every page of one text-search request."""
    shops = []
    params = {}

    while True:
        res = requests.get(endpoint_url, params=params, timeout=timeout)
        payload = json.loads(res.content)
        shops.extend(payload.get('results', []))

        status = payload.get('status')
        if status not in (None, 'OK', 'ZERO_RESULTS'):
            warnings.warn("Places text search returned status %r." % (status,), stacklevel=3)

        token = payload.get('next_page_token')
        if not token:
            return shops

        # Wait before asking for the next page, as the original code did
        # between requests.
        params['pagetoken'] = token
        time.sleep(pause)


def get_bucks(df, api_key=None):
    """Search the Google Places text-search API for Starbucks near each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``LAT`` and ``LONG`` columns; one search is run per row.
    api_key : str, optional
        Google API key. When omitted, the notebook-level ``API_KEY`` constant
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per shop found, with columns NAME, ADDRESS, LAT, LONG and
        STATUS. A field missing from the API response is stored as the string
        ``'none'``. The frame is empty (columns only) when ``df`` has no rows
        or nothing was found.

    Notes
    -----
    The previous version returned from inside the row loop, so only the first
    row of ``df`` was ever searched; results for all rows are now combined.
    Single-row callers get the same result as before.
    """
    key = API_KEY if api_key is None else api_key

    req1 = "https://maps.googleapis.com/maps/api/place/textsearch/json?query=starbucks&location="
    # NOTE(review): radius=10 is kept from the original request; confirm that
    # such a small radius is intended.
    req2 = "&radius=10&region=us&type=cafe&key=" + key

    columns = ['NAME', 'ADDRESS', 'LAT', 'LONG', 'STATUS']
    records = []

    for _, row in df.iterrows():
        endpoint = req1 + str(row['LAT']) + "," + str(row['LONG']) + req2

        for shop in _fetch_all_pages(endpoint):
            records.append({
                'NAME': _lookup(shop, 'name'),
                'ADDRESS': _lookup(shop, 'formatted_address'),
                'LAT': _lookup(shop, 'geometry', 'location', 'lat'),
                'LONG': _lookup(shop, 'geometry', 'location', 'lng'),
                'STATUS': _lookup(shop, 'business_status'),
            })

    return pd.DataFrame(records, columns=columns)
            

In [12]:
#get the Starbucks shops closest to a station of interest using the google api

def get_nearest(station, k=3):
    """Return the ``k`` Starbucks shops closest to a station.

    Parameters
    ----------
    station : pandas.DataFrame
        Frame with ``STATION``, ``LAT`` and ``LONG`` columns. Only the first
        row is used as the reference point.
    k : int, default 3
        Maximum number of shops to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of the shop table returned by ``get_bucks`` (nearest
        first, de-duplicated by address) with an added ``STATION`` column
        holding the station name. Fewer than ``k`` rows, possibly none, are
        returned when fewer shops with usable coordinates were found.

    Raises
    ------
    ValueError
        If ``station`` has no rows or ``k`` is smaller than 1.
    """
    if station.empty:
        raise ValueError("station must contain at least one row")
    if k < 1:
        raise ValueError("k must be at least 1")

    reference_row = station.iloc[[0]]
    # Positional access, so the lookup does not depend on the index label being 0.
    station_name = reference_row["STATION"].iloc[0]

    starbucks = get_bucks(reference_row).copy()
    if starbucks.empty:
        return starbucks.assign(STATION=station_name)

    #clean the data set & make sure there are no duplicates
    # The last five characters of each address are dropped before comparing
    # (presumably the ZIP code -- TODO confirm against the API output).
    starbucks["ADDRESS"] = starbucks["ADDRESS"].str[:-5]
    starbucks = starbucks.drop_duplicates(subset=['ADDRESS'])

    # get_bucks stores the string 'none' for missing coordinates; such rows
    # cannot be placed in the tree, so coerce them to NaN and drop them.
    starbucks = (
        starbucks.assign(
            LAT=pd.to_numeric(starbucks["LAT"], errors="coerce"),
            LONG=pd.to_numeric(starbucks["LONG"], errors="coerce"),
        )
        .dropna(subset=["LAT", "LONG"])
        .reset_index(drop=True)
    )
    if starbucks.empty:
        return starbucks.assign(STATION=station_name)

    #run a KDTree search on the shop coordinates to find the closest shops
    # NOTE(review): distances are Euclidean in raw degrees, so longitude is
    # not scaled by latitude; kept as in the original.
    tree = spatial.KDTree(starbucks[["LAT", "LONG"]].to_numpy())
    reference = [
        float(reference_row["LAT"].iloc[0]),
        float(reference_row["LONG"].iloc[0]),
    ]

    # Asking for more neighbours than there are points yields an out-of-range
    # index for the missing ones, so cap k at the number of shops.
    n_neighbors = min(k, len(starbucks))
    _, positions = tree.query(reference, k=n_neighbors)
    positions = np.atleast_1d(positions)  # a scalar is returned when k == 1

    #create df of results from KDTree and add the station name
    shop_locations = starbucks.iloc[positions].reset_index(drop=True)
    shop_locations["STATION"] = station_name

    return shop_locations



In [13]:
# Nearest Starbucks for the station ranked 1st by traffic (row 0 of loc_five).
station = loc_five.loc[0, :].to_frame().T.reset_index()
starbucks_1 = get_nearest(station)
starbucks_1

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION
0,Starbucks,"125 Park Ave, New York, NY",40.751774,-73.97765,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021
1,Starbucks,"7800 Grand Central Station Track 35, New York,...",40.75327,-73.97761,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021
2,Starbucks,"340 Madison Ave, New York, NY",40.753901,-73.978693,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021


In [14]:
# Nearest Starbucks for the station ranked 2nd by traffic (row 1 of loc_five).
station = loc_five.loc[1, :].to_frame().T.reset_index()
starbucks_2 = get_nearest(station)
starbucks_2

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION
0,Starbucks,"977 6th Ave, New York, NY",40.751009,-73.987141,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021
1,Starbucks,"151 W 34th St, New York, NY",40.750843,-73.98943,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021
2,Starbucks,"875 6th Ave, New York, NY",40.748154,-73.989263,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021


In [15]:
# Nearest Starbucks for the station ranked 3rd by traffic (row 2 of loc_five).
station = loc_five.loc[2, :].to_frame().T.reset_index()
starbucks_3 = get_nearest(station)
starbucks_3

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION
0,Starbucks,"625 8th Ave, New York, NY",40.75692,-73.9917,OPERATIONAL,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021
1,Starbucks,"684 8th Ave, New York, NY",40.758001,-73.989036,OPERATIONAL,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021
2,Starbucks,"593 9th Ave, New York, NY",40.758994,-73.992474,OPERATIONAL,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021


In [16]:
# Nearest Starbucks for the station ranked 4th by traffic (row 3 of loc_five).
station = loc_five.loc[3, :].to_frame().T.reset_index()
starbucks_4 = get_nearest(station)
starbucks_4

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION
0,Starbucks,"494 8th Ave, New York, NY",40.752687,-73.992859,OPERATIONAL,34_ST_PENN_STA_Line_ACE_2021
1,Starbucks,"Moynihan Train Hall, 383 W 31st St, New York, NY",40.75143,-73.99458,OPERATIONAL,34_ST_PENN_STA_Line_ACE_2021
2,Starbucks,"Pennsylvania Station, Amtrak Main Course, Penn...",40.75034,-73.993,OPERATIONAL,34_ST_PENN_STA_Line_ACE_2021


In [17]:
# Nearest Starbucks for the station ranked 5th by traffic (row 4 of loc_five).
station = loc_five.loc[4, :].to_frame().T.reset_index()
starbucks_5 = get_nearest(station)
starbucks_5

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION
0,Starbucks,"250 Greenwich St, New York, NY",40.712135,-74.010813,OPERATIONAL,PATH_NEW_WTC_Line_1_2021
1,Starbucks,"185 Greenwich St, New York, NY",40.712136,-74.010806,OPERATIONAL,PATH_NEW_WTC_Line_1_2021
2,Starbucks,"195 Broadway, New York, NY",40.71091,-74.010363,CLOSED_TEMPORARILY,PATH_NEW_WTC_Line_1_2021


In [18]:
# Stack the five per-station results into a single frame.
starbucks_ind = [
    starbucks_1,
    starbucks_2,
    starbucks_3,
    starbucks_4,
    starbucks_5,
]
starbucks_result = pd.concat(starbucks_ind, axis=0)


In [19]:
# Reset the index of the combined Starbucks frame: after the concat each
# per-station frame still carries its own 0-based labels, so labels repeat.
# drop=True discards the old labels instead of keeping them as a column.

starbucks_result = starbucks_result.reset_index(drop=True)


In [20]:
# Drop shops that appear more than once (same ADDRESS); the first occurrence
# is kept, then the index is renumbered. The last line shows how many distinct
# addresses remain.
starbucks_result = (
    starbucks_result
    .drop_duplicates(subset="ADDRESS")
    .reset_index(drop=True)
)
starbucks_result["ADDRESS"].nunique()

15

In [21]:
# Attach each shop's station traffic rank (left join on the station name).
traffic_ranks = top_five[["STATION_FULL", "T_RANK"]]
starbucks_res = starbucks_result.merge(
    traffic_ranks,
    how="left",
    left_on="STATION",
    right_on="STATION_FULL",
)


In [22]:
# STATION_FULL duplicates STATION after the merge, so remove it, then order
# the shops by the traffic rank of their station.
starbucks_res = (
    starbucks_res
    .drop(columns=["STATION_FULL"])
    .sort_values("T_RANK")
)
starbucks_res.head()

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION,T_RANK
0,Starbucks,"125 Park Ave, New York, NY",40.751774,-73.97765,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1
1,Starbucks,"7800 Grand Central Station Track 35, New York,...",40.75327,-73.97761,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1
2,Starbucks,"340 Madison Ave, New York, NY",40.753901,-73.978693,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1
3,Starbucks,"977 6th Ave, New York, NY",40.751009,-73.987141,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2
4,Starbucks,"151 W 34th St, New York, NY",40.750843,-73.98943,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2


In [23]:
# Load the weekly PERCENT figures per station.
# NOTE(review): judging by the file name these are 2021 values relative to
# 2019 -- confirm with whoever produced the CSV.

top_percent = pd.read_csv("./top_percent_2021_in_2019.csv", header = 0)
top_percent.head()


Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT
0,2,14_ST_UNION_SQ_Line_LNQR456W_2021,177299,23.199691
1,3,14_ST_UNION_SQ_Line_LNQR456W_2021,171502,23.059744
2,4,14_ST_UNION_SQ_Line_LNQR456W_2021,170188,23.435514
3,5,14_ST_UNION_SQ_Line_LNQR456W_2021,152808,20.223532
4,6,14_ST_UNION_SQ_Line_LNQR456W_2021,182246,23.024115


In [24]:
# Make sure the WEEK and PERCENT columns are numeric.
top_percent["WEEK"] = pd.to_numeric(top_percent["WEEK"])
top_percent["PERCENT"] = pd.to_numeric(top_percent["PERCENT"])

# Select the last week in the dataset (week 30), highest percentage first.
top_per_30 = top_percent[top_percent["WEEK"] == 30].sort_values("PERCENT", ascending=False)

# Keep the stations whose percentage is above the 2021 average.
# NOTE(review): the threshold was hard-coded in the original cell; presumably
# it was computed in an earlier analysis -- TODO confirm or recompute it here.
AVG_PERCENT_2021 = 45.402703

top_per = top_per_30[top_per_30["PERCENT"] > AVG_PERCENT_2021].reset_index(drop=True)
top_per.head()

Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT
0,30,FLUSHING_MAIN_Line_7_2021,363942,55.141474
1,30,JKSN_HT_ROOSVLT_Line_EFMR7_2021,290320,54.872609
2,30,PATH_NEW_WTC_Line_1_2021,381333,52.587428
3,30,86_ST_Line_456_2021,255142,49.703794


In [25]:
# Percent rank: top_per is sorted by PERCENT (descending), so the rank is the
# row position counted from 1.
top_per = top_per.assign(P_RANK=top_per.index + 1)
top_per.head()

Unnamed: 0,WEEK,STATION_FULL,TRAFFIC,PERCENT,P_RANK
0,30,FLUSHING_MAIN_Line_7_2021,363942,55.141474,1
1,30,JKSN_HT_ROOSVLT_Line_EFMR7_2021,290320,54.872609,2
2,30,PATH_NEW_WTC_Line_1_2021,381333,52.587428,3
3,30,86_ST_Line_456_2021,255142,49.703794,4


In [26]:
# Add the percent rank to the Starbucks results (left join on station name)
# and drop the duplicated station-name column that the merge brings in.
starbucks_results = pd.merge(
    starbucks_res,
    top_per[['STATION_FULL', 'P_RANK']],
    left_on='STATION',
    right_on="STATION_FULL",
    how='left',
).drop(columns=["STATION_FULL"])

# Stations that are not in top_per have no percent rank. Give them a sentinel
# rank of 10 so they sort after every ranked station. Only P_RANK is filled:
# filling the whole frame would also turn a missing value in any other column
# into 10.
UNRANKED_P_RANK = 10
starbucks_results["P_RANK"] = (
    starbucks_results["P_RANK"].fillna(UNRANKED_P_RANK).astype(int)
)

starbucks_results.head()


Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION,T_RANK,P_RANK
0,Starbucks,"125 Park Ave, New York, NY",40.751774,-73.97765,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
1,Starbucks,"7800 Grand Central Station Track 35, New York,...",40.75327,-73.97761,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
2,Starbucks,"340 Madison Ave, New York, NY",40.753901,-73.978693,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
3,Starbucks,"977 6th Ave, New York, NY",40.751009,-73.987141,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2,10
4,Starbucks,"151 W 34th St, New York, NY",40.750843,-73.98943,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2,10


In [27]:
# Final ordering: percent rank first (potential growth), then traffic rank,
# both ascending, with a fresh 0-based index.
ranking_keys = ["P_RANK", "T_RANK"]
starbucks_ranked = (
    starbucks_results
    .sort_values(by=ranking_keys)
    .reset_index(drop=True)
)
starbucks_ranked

Unnamed: 0,NAME,ADDRESS,LAT,LONG,STATUS,STATION,T_RANK,P_RANK
0,Starbucks,"250 Greenwich St, New York, NY",40.712135,-74.010813,OPERATIONAL,PATH_NEW_WTC_Line_1_2021,5,3
1,Starbucks,"185 Greenwich St, New York, NY",40.712136,-74.010806,OPERATIONAL,PATH_NEW_WTC_Line_1_2021,5,3
2,Starbucks,"195 Broadway, New York, NY",40.71091,-74.010363,CLOSED_TEMPORARILY,PATH_NEW_WTC_Line_1_2021,5,3
3,Starbucks,"125 Park Ave, New York, NY",40.751774,-73.97765,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
4,Starbucks,"7800 Grand Central Station Track 35, New York,...",40.75327,-73.97761,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
5,Starbucks,"340 Madison Ave, New York, NY",40.753901,-73.978693,OPERATIONAL,GRD_CNTRL_42_ST_Line_4567S_2021,1,10
6,Starbucks,"977 6th Ave, New York, NY",40.751009,-73.987141,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2,10
7,Starbucks,"151 W 34th St, New York, NY",40.750843,-73.98943,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2,10
8,Starbucks,"875 6th Ave, New York, NY",40.748154,-73.989263,OPERATIONAL,34_ST_HERALD_SQ_Line_BDFMNQRW_2021,2,10
9,Starbucks,"625 8th Ave, New York, NY",40.75692,-73.9917,OPERATIONAL,42_ST_PORT_AUTH_Line_ACENQRS1237W_2021,3,10


In [31]:
# Save the ranked Starbucks table for plotting.
# index=False leaves the row index out of the CSV.

starbucks_ranked.to_csv("./starbucks_ranked.csv", index=False)