In [5]:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import lxml
import pandas as pd
import re
import time
import os
import csv
pd.set_option('display.max_columns', 99)
pd.set_option('display.max_colwidth', -1)

### Find Cities to Scrape

For this analysis, we are interested in the top 30 cities by population.  To find this information, lets scrape it from a reliable, up-to-date source found at this [Wikipedia link]('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population').

In [6]:
HEADERS = {'User-Agent':'Mozilla/6.0'}
BASE_URL = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
response = requests.get(BASE_URL,headers=HEADERS)
soup = BeautifulSoup(response.content, "html5lib")

Locate the table containing cities and populations.

In [7]:
table = soup.select_one('table.sortable.wikitable')

For each row in the population table, pull the city name.

In [8]:
city_list = []
for row in table.select('tr'):
    city = row.find('a').text
    city_list.append(city)
#Truncate the list to limit to first 50 cities
city_list = city_list[1:51]
#Correct 'Washington' to 'Washington, D.C.'
city_list[20] = "Washington, D.C"

In [9]:
city_list

['New York',
 'Los Angeles',
 'Chicago',
 'Houston',
 'Phoenix',
 'Philadelphia',
 'San Antonio',
 'San Diego',
 'Dallas',
 'San Jose',
 'Austin',
 'Jacksonville',
 'San Francisco',
 'Columbus',
 'Indianapolis',
 'Fort Worth',
 'Charlotte',
 'Seattle',
 'Denver',
 'El Paso',
 'Washington, D.C',
 'Boston',
 'Detroit',
 'Nashville',
 'Memphis',
 'Portland',
 'Oklahoma City',
 'Las Vegas',
 'Louisville',
 'Baltimore',
 'Milwaukee',
 'Albuquerque',
 'Tucson',
 'Fresno',
 'Sacramento',
 'Mesa',
 'Kansas City',
 'Atlanta',
 'Long Beach',
 'Colorado Springs',
 'Raleigh',
 'Miami',
 'Virginia Beach',
 'Omaha',
 'Oakland',
 'Minneapolis',
 'Tulsa',
 'Arlington',
 'New Orleans',
 'Wichita']

### Pull Yelp.com's business links and store in list

In [10]:
len(city_list)

50

In [None]:
page_links = []

for i, city in enumerate(city_list):
    BASE_URL = 'http://www.yelp.com/search?find_desc=Bars&find_loc=' + city + '&start='
    page_counts = [0,10,20,30]
    for page_limit in page_counts:
        response = requests.get(BASE_URL +  str(page_limit) + '&sortby=review_count',headers=HEADERS)
        soup = BeautifulSoup(response.content, "html5lib")
        links = soup.findAll("a", { "class" : "biz-name js-analytics-click" })
        for link in links:
            page_links.append(link['href'])
        time.sleep(5)
    print(city)
#     if i % 5 == 0:
#         print(float(i/len(city_list)))

In [166]:
dir = "/Users/Collier/Dropbox/Skills/Python/Projects/Culture/Cities_Bars_Yelp/"
os.chdir(dir)
page_links_save = pd.DataFrame(page_links)
page_links_save.to_csv(dir+'business_links.csv', index=False)

In [11]:
#read in from pc
dir = "C:\\Users\\David\\Dropbox\\Skills\\Python\\Projects\\Culture\\Cities_Bars_Yelp\\"
page_links = pd.read_csv(dir+"business_links.csv")

In [12]:
len(page_links)

2004

### Loop Through Yelp Links and Scrape Features

We've pulled links for about 1200 bars across 30 cities.  Now, we will scrape each Yelp page and pull the details for each business.

In [13]:
BASE_URL = 'http://www.yelp.com'

In [14]:
cols = ['Name','Rating','Reviews','Area','Address',
'Takes Reservations','Parking','Bike Parking',
'Good for Groups','Ambience','Noise Level',
'Music','Good For Dancing','Alcohol','Happy Hour',
'Best Nights','Smoking','Outdoor Seating','Wi-Fi',
'Has TV','Waiter Service','Has Pool Table','Price_Range','Price_Level']
df_master = pd.DataFrame(columns=cols)
df_master.reset_index(drop=True,inplace=True)
df_master_all = df_master

In [15]:
df_master_all.head(3)

Unnamed: 0,Name,Rating,Reviews,Area,Address,Takes Reservations,Parking,Bike Parking,Good for Groups,Ambience,Noise Level,Music,Good For Dancing,Alcohol,Happy Hour,Best Nights,Smoking,Outdoor Seating,Wi-Fi,Has TV,Waiter Service,Has Pool Table,Price_Range,Price_Level


In [16]:
page_links.tail()

Unnamed: 0,0
1999,/biz/lemongrass-taste-of-vietnam-wichita?osq=Bars
2000,/biz/bubbas-33-wichita?osq=Bars
2001,/biz/logans-roadhouse-wichita-2?osq=Bars
2002,/biz/heroes-sports-bar-and-grill-wichita-2?osq=Bars
2003,/biz/rain-caf%C3%A9-and-lounge-wichita-2?osq=Bars


In [17]:
page_links2 = page_links['0'].values.tolist()

In [18]:
# for idx, link in enumerate(page_links):
for idx, link in enumerate(page_links2):
    response = requests.get(BASE_URL + link, headers=HEADERS)
    soup = BeautifulSoup(response.content, "html5lib")
    #try to find name of establishment
    try:
        name = soup.find("h1", {"class":"biz-page-title embossed-text-white"}).contents[0].strip()
    except:
        #if cannot find name, look at other name location, if cannot find that skip row
        try:
            name = soup.find("h1", {"class":"biz-page-title embossed-text-white shortenough"}).contents[0].strip()
        except:
            continue
    feature_attr = soup.select_one('div.short-def-list').select('dt.attribute-key')
    feature_response = soup.select_one('div.short-def-list').select('dd')
    response = []
    #if we successfully scraped the features, loop through and pull text
    if feature_response:
        for tag in feature_response:
            text = tag.text.strip()
            response.append(text)
    #if the features are missing, skip row
    else:
        continue
    attr = []
    #if we successfully scraped the features, loop through and pull text
    if feature_attr:
        for tag in feature_attr:
            text = tag.text.strip()
            attr.append(text)
    #if the features are missing, skip row
    else:
        continue
    df_bus_info = pd.DataFrame({'VenueAttr_Desc':attr,'VenueAttr_Status':response})
    df_bus_info = pd.DataFrame(response,attr)
    df_bus_info = df_bus_info.T
    try:
        pricing = soup.find("dd", { "class" : "nowrap price-description" }).contents[0].strip()
        pricing = pricing.replace("$","")
    except:
        pass
    try:
        pricing_level = soup.find("span", {"class":"business-attribute price-range"}).contents[0].strip()
    except:
        pass
    try:
        ratings = soup.find('span', {"class" : "review-count rating-qualifier" }).contents[0].strip()
        ratings = ratings.replace(" reviews","")
    except:
        pass
    try:
        stars = str(soup.find('div', {'class': 'biz-rating biz-rating-very-large clearfix'}).contents[1])
        stars = stars[stars.find("title="):]
        stars = stars[:stars.find("rating")]
        stars = stars[stars.find("="):]
        stars = stars[:stars.find(" star")]
        stars = re.sub('[^a-zA-Z0-9 \n\.]', '', stars)
    except:
        pass
    try:
        address = str(soup.find('strong',{'class':'street-address'}).contents[1])
        address = address[:address.find("\n    </address>")]
        address = re.sub("<address>\n        ","",address)
        address = re.sub("<br/>"," ",address)
    except:
        pass
    try:
        area = soup.find('span',{'class':'neighborhood-str-list'}).contents[0]
        area = re.sub("\n            ","",area)
        area = re.sub("        ","",area)
    except:
        pass
    cols = ['Name','Rating','Reviews','Area','Address',
    'Takes Reservations','Parking','Bike Parking',
    'Good for Groups','Ambience','Noise Level',
    'Music','Good For Dancing','Alcohol','Happy Hour',
    'Best Nights','Smoking','Outdoor Seating','Wi-Fi',
    'Has TV','Waiter Service','Has Pool Table','Price_Range','Price_Level']
    df_master = pd.DataFrame(columns=cols,index=[0])
    df_master.reset_index(inplace=True)
    try:
        df_master.loc[0, 'Name'] = name
    except:
        pass
    try:
        df_master.loc[0, 'Rating'] = stars
    except:
        pass
    try:
        df_master.loc[0, 'Reviews'] = ratings
    except:
        pass
    try:
        df_master.loc[0, 'Area'] = area
    except:
        pass
    try:
        df_master.loc[0, 'Address'] = address
    except:
        pass
    try:
        df_master.loc[0, 'Takes Reservations'] = df_bus_info.loc[0,'Takes Reservations']
    except:
        pass
    try:
        df_master.loc[0, 'Parking'] = df_bus_info.loc[0,'Parking']
    except:
        pass
    try:    
        df_master.loc[0, 'Bike Parking'] = df_bus_info.loc[0,'Bike Parking']
    except:
        pass
    try:
        df_master.loc[0, 'Good for Groups'] = df_bus_info.loc[0,'Good for Groups']
    except:
        pass
    try:
        df_master.loc[0, 'Ambience'] = df_bus_info.loc[0,'Ambience']
    except:
        pass
    try:
        df_master.loc[0, 'Noise Level'] = df_bus_info.loc[0,'Noise Level']
    except:
        pass
    try:
        df_master.loc[0, 'Music'] = df_bus_info.loc[0,'Music']
    except:
        pass
    try:
        df_master.loc[0, 'Good For Dancing'] = df_bus_info.loc[0,'Good For Dancing']
    except:
        pass
    try:
        df_master.loc[0, 'Alcohol'] = df_bus_info.loc[0,'Alcohol']
    except:
        pass
    try:
        df_master.loc[0, 'Happy Hour'] = df_bus_info.loc[0,'Happy Hour']
    except:
        pass
    try:
        df_master.loc[0, 'Best Nights'] = df_bus_info.loc[0,'Best Nights']
    except:
        pass
    try:
        df_master.loc[0, 'Smoking'] = df_bus_info.loc[0,'Smoking']
    except:
        pass
    try:
        df_master.loc[0, 'Outdoor Seating'] = df_bus_info.loc[0,'Outdoor Seating']
    except:
        pass
    try:
        df_master.loc[0, 'Wi-Fi'] = df_bus_info.loc[0,'Wi-Fi']
    except:
        pass
    try:
        df_master.loc[0, 'Has TV'] = df_bus_info.loc[0,'Has TV']
    except:
        pass
    try:
        df_master.loc[0, 'Waiter_Service'] = df_bus_info.loc[0,'Waiter Service']
    except:
        pass
    try:
        df_master.loc[0, 'Has Pool Table'] = df_bus_info.loc[0,'Has Pool Table']
    except:
        pass
    try:
        df_master.loc[0, 'Price_Range'] = pricing
    except:
        pass
    try:
        df_master.loc[0, 'Price_Level'] = pricing_level
    except:
        pass
    df_master_all = df_master_all.append(df_master)
    time.sleep(10)
    if idx % 100 == 0:
        print(float(idx/len(page_links)))

0.0
0.0499001996007984
0.0998003992015968
0.1497005988023952
0.1996007984031936
0.249500998003992
0.2994011976047904
0.34930139720558884
0.3992015968063872
0.4491017964071856
0.499001996007984
0.5489021956087824
0.5988023952095808


### We've successfully pulled destails for 1055 of the 1216 business links.  

In [59]:
df_master_all.shape

(1466, 26)

In [60]:
idx

2003

In [62]:
# dir = "/Users/Collier/Dropbox/Skills/Python/Projects/Culture/Cities_Bars_Yelp/"
dir = "C:\\Users\\David\\Dropbox\\Skills\\Python\\Projects\\Culture\\Cities_Bars_Yelp\\"
os.chdir(dir)
# page_links_save = pd.DataFrame(page_links)
df_master_all.to_csv(dir+'yelp_data_bars_50.csv', index=False)