In [1]:
# import libraries
import urllib.request
from bs4 import BeautifulSoup
import csv
from datascience import *
import numpy as np

Welcome to the Data Scholars Foundations web scraping tutorial (adapted from Towards Data Science [article](https://towardsdatascience.com/data-science-skills-web-scraping-using-python-d1a85ef607ed))! In the example below, we are scraping from http://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/. To view the HTML, navigate to the site, right click somewhere inside the page, and click "View Page Source"

In [7]:
# specify the url
urlpage =  'http://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/' 
# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')
# find results within table
table = soup.find('table', attrs={'class': 'tableSorter'})
results = table.find_all('tr')
print('Number of results', len(results))

# create and write headers to a list 
rows = []
rows.append(['Rank', 'Company Name', 'Location', 'Year end', 'Annual sales rise over 3 years', 'Sales £000s', 'Staff', 'Comments'])

# loop over results
for result in results:
    data = result.find_all('td')
    if len(data) == 0:
        continue
    
    rank = data[0].getText()
    company = data[1].getText()
    location = data[2].getText()
    yearend = data[3].getText()
    salesrise = data[4].getText()
    sales = data[5].getText()
    staff = data[6].getText()
    comments = data[7].getText()
    
    rows.append([rank, company, location, yearend, salesrise, staff, comments])    


## Create csv and write rows to output file
with open('techtrack100.csv','w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)
    
tech_track_100 = Table.read_table('techtrack100.csv')
tech_track_100

Number of results 101


Rank,Company Name,Location,Year end,Annual sales rise over 3 years,Sales £000s,Staff,Comments
1,Plan.comCommunications provider,Isle of Man,Sep-17,364.38%,90,About 650 partners use its telecoms platform to support ...,
2,PsiOxusBiotechnology developer,Oxfordshire,Dec-17,311.67%,54,Received a $15m milestone payment from its development p ...,
3,CensorNetCloud security software developer,Basingstoke,Dec-17,210.17%,77,"Has more than 4,000 customers, including McDonald’s in F ...",
4,thoughtonomyAutomation software developer,East London,May-18,205.20%,100,It sells to 28 countries and 50% of revenue is generated ...,
5,PerkboxEmployee engagement services,Central London,Dec-17,204.12%,200,Acquired software platform Loyalty Bay for an undisclose ...,
6,OguryMobile data marketing,North London,Dec-17,204.09%,133,It has access to data from more than 400m mobile phone u ...,
7,VerveMarketing software developer,Central London,Dec-17,202.15%,130,"Its network of 25,000 ‘micro-influencers’ have sold half ...",
8,goHenryChildren's pre-paid card developer,Central London,Dec-17,200.87%,61,The company is named after the first child to make a tra ...,
9,DarktraceCybersecurity developer,Cambridge,Jun-18,195.51%,694,Was reportedly valued at $1.25bn in April when Vitruvian ...,
10,BizumaB2B e-commerce platform,Central London,Mar-18,181.75%,48,Originally founded as a flash sales website before chang ...,


Great! Now let's try scraping another website: http://books.toscrape.com/catalogue/category/books/mystery_3/page-1.html

In [10]:
# specify the url
urlpage =  'http://books.toscrape.com/catalogue/category/books/mystery_3/page-1.html' 

# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')

# create and write headers to a list
rows = []
rows.append(["title", "upc", "prod_type", "price", "price_after_tax", "tax", "availability", "num_reviews"])

# find results
ol = soup.find('ol', attrs={'class': 'row'})
results = ol.find_all('article')
print('Number of results', len(results))

# loop over results
for result in results:

    title = result.find('h3').find('a').get('title')
    
    url_suffix = result.find('a').get('href').split("../")[-1]
    url = "http://books.toscrape.com/catalogue/" + url_suffix
    
    page = urllib.request.urlopen(url)
    
    soup = BeautifulSoup(page, 'html.parser')
    
    data = soup.find('table', attrs={'class': 'table table-striped'}).find_all('td')
    
    if len(data) == 0:
        continue 
        
    upc = data[0].getText()
    prod_type = data[1].getText()
    price = data[2].getText()
    price_after_tax = data[3].getText()
    tax = data[4].getText()
    availability = data[5].getText()
    num_reviews = data[6].getText()
    
    rows.append([upc, prod_type, price, price_after_tax, tax, availability, num_reviews])

    
# Create csv and write rows to output file
with open('books.csv','w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)

# read csv into a table
books = Table.read_table('books.csv')
books

Number of results 20


title,upc,prod_type,price,price_after_tax,tax,availability,num_reviews
e00eb4fd7b871a48,Books,£47.82,£47.82,£0.00,In stock (20 available),0,
19ed25f4641d5efd,Books,£19.63,£19.63,£0.00,In stock (18 available),0,
5ee94540d0749ea0,Books,£56.50,£56.50,£0.00,In stock (16 available),0,
f733e8c19d40ec2e,Books,£16.64,£16.64,£0.00,In stock (16 available),0,
c7b5183f4d1d4efe,Books,£44.10,£44.10,£0.00,In stock (15 available),0,
3bc89353f7e3a3cc,Books,£54.21,£54.21,£0.00,In stock (14 available),0,
0c7b9cf2b7662b65,Books,£13.92,£13.92,£0.00,In stock (14 available),0,
2d1e337aaf341858,Books,£10.69,£10.69,£0.00,In stock (14 available),0,
4416c474713ec1f5,Books,£48.35,£48.35,£0.00,In stock (14 available),0,
63ee5bc46066a8a8,Books,£16.73,£16.73,£0.00,In stock (14 available),0,


Awesome! Now we can analyze the data in the tables just like how we've been doing in Data 8. What are some discoveries you've made? 

In [16]:
tech_track_100.group('Location')

Location,count
Basingstoke,1
Bath,1
Belfast,1
Birmingham,1
Bolton,1
Cambridge,4
Cambridgeshire,1
Central London,36
Cheshire,1
East London,8
