# Web Scraping

In [1]:
# 분석에 필요한 URL 불러오기
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen, urljoin

In [2]:
url_main = 'https://www.chicagomag.com'
url_sub = '/chicago-magazine/november-2012/best-sandwiches-chicago/'
url = url_main + url_sub
# error 처리를 위한 header 설정
hdr = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}

In [3]:
# HTTP Error 403 : Forbidden 처리
req = Request(url, headers= hdr)
html = urlopen(req).read()
# error 처리 종료

In [4]:
soup = BeautifulSoup(html, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<title>The 50 Best Sandwiches in Chicago – Chicago Magazine</title>
<style type="text/css">			.heateorSssInstagramBackground{background:radial-gradient(circle at 30% 107%,#fdf497 0,#fdf497 5%,#fd5949 45%,#d6249f 60%,#285aeb 90%)}
						div.heateor_sss_horizontal_sharing i.heateorSssInstagramBackground{background:#000!important;}div.heateor_sss_standard_follow_icons_container i.heateorSssInstagramBackground{background:#000;}
										.heateor_sss_horizontal_sharing .heateorSssSharing,.heateor_sss_standard_follow_icons_container .heateorSssSharing{
							background-color: #000;
							color: #fff;
						border-width: 0px;
			border-style: solid;
			border-color: transparent;
		}
				.heateor_sss_horizontal_sharing .heateorSssTCBackground{
			color:#666;
		}
				.heateor_sss_horizontal_sharing .heateorSssSharing:ho

In [5]:
print(soup.find_all('div','sammy'))
# Chrome 개발자 도구를 통해 태그와 class 확인

[<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>, <div class="sammy" style="position: relative;">
<div class="sammyRank">2</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/"><b>Fried Bologna</b><br/>
Au Cheval<br/>
<em>Read more</em> </a></div>
</div>, <div class="sammy" style="position: relative;">
<div class="sammyRank">3</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/"><b>Woodland Mushroom</b><br/>
Xoco<br/>
<em>Read more</em> </a></div>
</div>, <div class="sammy" style="position: relative;">
<div class="sammyRank">4</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Als-Deli-R

In [6]:
# 시카고 샌드위치 맛집이 50개 이므로 div의 sammy 태그의 길이가 50인지 확인
print(len(soup.find_all('div','sammy')))

50


In [7]:
print(soup.find_all('div','sammy')[0])
# 랭크, 메인 메뉴, 가게 이름, 가게 소개 페이지 등의 정보를 확인 가능

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>


## Web Data 추출 및 정제

In [8]:
bs_1 = soup.find_all('div', 'sammy')[0] # bs; BestStore
type(bs_1)

bs4.element.Tag

In [9]:
# sammyRank Class 확인 → 텍스트만 출력
bs_1.find(class_='sammyRank')

<div class="sammyRank">1</div>

In [10]:
# 가게의 랭크 출력
bs_1.find(class_='sammyRank').get_text()

'1'

In [11]:
# 메뉴, 가게의 이름 출력
bs_1.find(class_='sammyListing').get_text()

'BLT\nOld Oak Tap\nRead more '

In [12]:
# 가게 링크
bs_1.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

## 정규식을 통해 메뉴와 가게 이름 분리

In [13]:
import re  # 정규식 (Regular Express)

bs_string = bs_1.find(class_='sammyListing').get_text()

re.split(('\n|\r\n'), bs_string)
# bs_string이 \n 또는 \r\n 일 경우 Split 실행

print(re.split(('\n|\r\n'), bs_string)[0])
print(re.split(('\n|\r\n'), bs_string)[1])

BLT
Old Oak Tap


In [14]:
# 필요한 정보 list 생성
rank = []
main_menu = []
store_name = []
url_add = []

list_soup = soup.find_all('div', 'sammy')

for item in list_soup :
    rank.append(item.find(class_='sammyRank').get_text())
    
    bs_string = item.find(class_='sammyListing').get_text()
    
    main_menu.append(re.split(('\n|\r\n'), bs_string)[0])
    store_name.append(re.split(('\n|\r\n'), bs_string)[1])
    
    url_add.append(urljoin(url_main, item.find('a')['href']))
    # urljoin을 통해 상대경로 URL을 절대경로 URL로 변경

In [15]:
# 변수 length 확인
len(rank), len(main_menu), len(store_name), len(url_add)

(50, 50, 50, 50)

## 데이터를 DataFrame 형태로 변환

In [16]:
import pandas as pd

data = {'Rank' : rank, 'Menu':main_menu, 'Store':store_name, 'URL':url_add}
bs_df = pd.DataFrame(data)
bs_df.head()

Unnamed: 0,Rank,Menu,Store,URL
0,1,BLT,Old Oak Tap,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Fried Bologna,Au Cheval,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Woodland Mushroom,Xoco,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Roast Beef,Al’s Deli,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,PB&L,Publican Quality Meats,https://www.chicagomag.com/Chicago-Magazine/No...


In [17]:
# 데이터 정리
bs_df = pd.DataFrame(data, columns = ['Rank', 'Store', 'Menu', 'URL'])
bs_df.head(10)

Unnamed: 0,Rank,Store,Menu,URL
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...
5,6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,https://www.chicagomag.com/Chicago-Magazine/No...
6,7,Acadia,Lobster Roll,https://www.chicagomag.com/Chicago-Magazine/No...
7,8,Birchwood Kitchen,Smoked Salmon Salad,https://www.chicagomag.com/Chicago-Magazine/No...
8,9,Cemitas Puebla,Atomica Cemitas,https://www.chicagomag.com/Chicago-Magazine/No...
9,10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,https://www.chicagomag.com/Chicago-Magazine/No...


In [18]:
# csv로 저장
bs_df.to_csv('./03. best_sandwiches_list_chicago.csv', encoding = 'utf-8', sep = ',', index=False)

## 다수의 Web Page Scraping

In [19]:
bs_df['URL'][0]  # 시카고 매거진의 또 다른 페이지

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [20]:
req = Request(bs_df['URL'][0], headers= hdr)
html = urlopen(req).read()

soup_tmp = BeautifulSoup(html, 'html.parser')
soup_tmp

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<title>1. Old Oak Tap BLT – Chicago Magazine</title>
<style type="text/css">			.heateorSssInstagramBackground{background:radial-gradient(circle at 30% 107%,#fdf497 0,#fdf497 5%,#fd5949 45%,#d6249f 60%,#285aeb 90%)}
						div.heateor_sss_horizontal_sharing i.heateorSssInstagramBackground{background:#000!important;}div.heateor_sss_standard_follow_icons_container i.heateorSssInstagramBackground{background:#000;}
										.heateor_sss_horizontal_sharing .heateorSssSharing,.heateor_sss_standard_follow_icons_container .heateorSssSharing{
							background-color: #000;
							color: #fff;
						border-width: 0px;
			border-style: solid;
			border-color: transparent;
		}
				.heateor_sss_horizontal_sharing .heateorSssTCBackground{
			color:#666;
		}
				.heateor_sss_horizontal_sharing .heateorSssSharing:hover,.heateor_ss

In [21]:
print(soup_tmp.find('p',class_='addy'))
# 가격, 주소, 전화번호, 가게 홈페이지

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>


In [22]:
text_tmp = soup_tmp.find('p', class_='addy').get_text()
text_tmp

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [23]:
text_tmp.split()

['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']

In [24]:
# 가격만 출력
text_tmp.split()[0][:-1] # . 제외

'$10'

In [25]:
# 주소만 출력
' '.join(text_tmp.split()[1:-2])

'2109 W. Chicago Ave.,'

## 상태진행 바 추가 (TQDM Module)

In [26]:
# conda install -c conda-forge tqdm

In [27]:
from tqdm import tqdm_notebook

# 가격, 주소, 전화번호 분리
price = []
address = []
phone = []

for n in tqdm_notebook(bs_df.index) :  # 진행상태 표시
    # HTML Error 403 Forbidden 처리
    req = Request(bs_df['URL'][n], headers= hdr)
    html = urlopen(req).read()
    # Error 처리 종료
    soup_tmp = BeautifulSoup(html, 'lxml')
    
    text_gettings = soup_tmp.find('p','addy').get_text()
    
    price.append(text_gettings.split()[0][:-1])
    address.append(' '.join(text_gettings.split()[1:-2]))
    phone.append(text_gettings.split()[-2])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(bs_df.index) :  # 진행상태 표시


  0%|          | 0/50 [00:00<?, ?it/s]

In [28]:
# 변수 길이 확인
len(price), len(address), len(phone)

(50, 50, 50)

In [29]:
# bs_df DataFrame에 Price, Address, Phone 데이터 추가
bs_df['Price'] = price
bs_df['Address'] = address
bs_df['Phone'] = phone

bs_df = bs_df.loc[:, ['Rank', 'Store', 'Menu', 'Price', 'Address', 'Phone']]
bs_df.set_index('Rank', inplace = True)  # Rank로 인덱스 설정
bs_df.head(10)

Unnamed: 0_level_0,Store,Menu,Price,Address,Phone
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Old Oak Tap,BLT,$10,"2109 W. Chicago Ave.,","773-772-0406,"
2,Au Cheval,Fried Bologna,$9,"800 W. Randolph St.,","312-929-4580,"
3,Xoco,Woodland Mushroom,$9.50,"445 N. Clark St.,","312-334-3688,"
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston,","847-475-9400,"
5,Publican Quality Meats,PB&L,$10,"825 W. Fulton Mkt.,","312-445-8977,"
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walton,"St.,"
7,Acadia,Lobster Roll,$16,"1639 S. Wabash Ave.,","312-360-9500,"
8,Birchwood Kitchen,Smoked Salmon Salad,$10,"2211 W. North Ave.,","773-276-2100,"
9,Cemitas Puebla,Atomica Cemitas,$9,"3619 W. North Ave.,","773-772-8435,"
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17,"3267 S. Halsted St.,","312-929-2486,"


In [30]:
# csv 저장
bs_df.to_csv('./03. best_sandwiches_list_chicago2.csv', encoding='utf-8', sep = ',')

# 맛집 위치 지도 시각화

In [31]:
import folium
import googlemaps as gm
import numpy as np

In [32]:
gmaps_key = 'AIzaSyBRI_3yR_wDQpH8asCO6Reykh54P0B0YiQ'
gmaps = gm.Client(key=gmaps_key)

In [33]:
lat = []
lng = []

for n in tqdm_notebook(bs_df.index):
    if bs_df['Address'][n] != 'Multiple':  # Multiple 제외한 주소 추가
        target_name = bs_df['Address'][n] +', ' + 'Chicago'
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get('geometry')
        lat.append(location_output['location']['lat'])
        lng.append(location_output['location']['lng'])
        
    else:  # Multiple일 경우 NaN
        lat.append(np.nan)
        lng.append(np.nan)
        
bs_df['lat'] = lat
bs_df['lng'] = lng
bs_df.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(bs_df.index):


  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0_level_0,Store,Menu,Price,Address,Phone,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Old Oak Tap,BLT,$10,"2109 W. Chicago Ave.,","773-772-0406,",41.895558,-87.679967
2,Au Cheval,Fried Bologna,$9,"800 W. Randolph St.,","312-929-4580,",41.884639,-87.64759
3,Xoco,Woodland Mushroom,$9.50,"445 N. Clark St.,","312-334-3688,",41.890523,-87.630783
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston,","847-475-9400,",42.058322,-87.683748
5,Publican Quality Meats,PB&L,$10,"825 W. Fulton Mkt.,","312-445-8977,",41.886604,-87.648536


In [34]:
# 맛집들의 위·경도의 평균값을 지도와 마커의 중앙으로 설정
mapping = folium.Map(location=[bs_df['lat'].mean(), bs_df['lng'].mean()],
                     zoom_start=11)
folium.Marker([bs_df['lat'].mean(), bs_df['lng'].mean()],
              popup= '<pre>Center</pre>').add_to(mapping)

mapping

In [35]:
mapping = folium.Map(location=[bs_df['lat'].mean(), bs_df['lng'].mean()],
                     zoom_start=11)

for n in tqdm_notebook(bs_df.index):
    if bs_df['Address'][n] != 'Multiple':
        folium.Marker([bs_df['lat'][n], bs_df['lng'][n]],
                      popup = bs_df['Store'][n]).add_to(mapping)

mapping

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(bs_df.index):


  0%|          | 0/50 [00:00<?, ?it/s]