<a href="https://colab.research.google.com/github/Delonix7/Data_Science_Essentials/blob/main/Web_Scraping_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Simple HTML scraping

In [268]:
# !pip install bs4 #Installs the package needed for web scraping
# !pip install beautifulsoup4
# to check the version of BeautifulSoup, import bs4 and run bs4.__version__

In [269]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

A few things to note in the SIMPLE_HTML string below, which apply to HTML documents in general: h1 = header, p = paragraph, ul = unordered list, li = list item.

In [270]:
SIMPLE_HTML ='''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet. Consectetur edipiscim elit.</p>
<p>Here's another p without a class</p>
<ul>
  <li>Rolf</li>
  <li>Charlie</li>
  <li>Jen</li>
  <li>Jose</li>
</ul>
</body>
</html>'''

In [271]:
# with open('simple_html.html') as doc: # if the HTML is saved in a file, use this code to pass it to BeautifulSoup
#   soup = bs(doc, 'html.parser')

In [272]:
# markup   -> the HTML string (or an open file) to parse
# features -> the parser to use; 'html.parser' is the one bundled with Python
simple_soup = bs(markup=SIMPLE_HTML, features='html.parser')

In [273]:
# find() goes through the HTML document and returns the first h1 tag;
# use find_all() if you want every h1 tag instead.
first_heading = simple_soup.find('h1')
print(first_heading)

<h1>This is a title</h1>


In [274]:
print(simple_soup.find('h1').string) #adding .string prints out the content of the tag

This is a title


In [275]:
for f in simple_soup.find_all('li'): # to access the contents of tags found with find_all(), loop over the contents
  print(f.string)

Rolf
Charlie
Jen
Jose


### Let's write some functions to search the html document

In [276]:
def find_title():
  '''Return the content of the first h1 tag in simple_soup.'''
  return simple_soup.find('h1').string

In [277]:
find_title()

'This is a title'

In [278]:
def find_list_items():
  '''Return the content of every li tag in simple_soup as a list.'''
  texts = []
  for li_tag in simple_soup.find_all('li'):
    texts.append(li_tag.string)
  return texts

In [279]:
find_list_items()

['Rolf', 'Charlie', 'Jen', 'Jose']

In [280]:
def find_subtitle():
  '''Return the content of the first p tag in simple_soup whose class is 'subtitle'.'''
  # The attributes to match are given as a dictionary of attribute name -> value.
  subtitle_tag = simple_soup.find('p', attrs={'class': 'subtitle'})
  return subtitle_tag.string

In [281]:
find_subtitle()

'Lorem ipsum dolor sit amet. Consectetur edipiscim elit.'

In [282]:
def find_other_paragraph():
  '''Return the content of every p tag in simple_soup that does not have the class 'subtitle'.'''
  texts = []
  for p_tag in simple_soup.find_all('p'):  # select() can be used in place of find_all
    # attrs is the tag's attribute dictionary. get('class', []) returns the value stored
    # under 'class', or the default [] when the tag has no 'class' key at all.
    css_classes = p_tag.attrs.get('class', [])
    if 'subtitle' in css_classes:
      continue
    texts.append(p_tag.string)
  return texts

In [283]:
find_other_paragraph()

["Here's another p without a class"]

## More Complex Scraping

A few things to note in the ITEM_HTML string below, which apply to HTML documents in general:
- The 'article' tag appears to contain all the information needed.
- The 'div' tag is just used to structure content in HTML.
- 'a' is used for links and 'img' is the image tag.
- 'src' is the attribute of the image tag that gives the source of the image.
- 'alt' is the attribute holding the alternate text to show if the image cannot load for some reason.
- The 'i' tag is used here for icons.


In [284]:
ITEM_HTML = '''<html><head></head><body>
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
            </div>
            <p class="star-rating Three">
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
            </p>
            <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
            <div class="product_price">
        <p class="price_color">$51.77</p>
<p class="instock availability">
    <i class="icon-ok"></i>
        In stock
</p>
    <form>
        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
    </form>
            </div>
    </article>
</li>
</body></html>
'''

In [285]:
item_html_soup = bs(ITEM_HTML, 'html.parser')

In [286]:
print(item_html_soup.select_one('article.product_pod h3 a').attrs) #prints the dictionary of attributes of the css locator
#Note: the attributes of the tags become the keys of the attrs dictionary

{'href': 'catalogue/a-light-in-the-attic_1000/index.html', 'title': 'A Light in the Attic'}


In [287]:
print(item_html_soup.select_one('article.product_pod h3 a').attrs.items())

dict_items([('href', 'catalogue/a-light-in-the-attic_1000/index.html'), ('title', 'A Light in the Attic')])


In [288]:
print(item_html_soup.select_one('article.product_pod h3 a').attrs['href'])

catalogue/a-light-in-the-attic_1000/index.html


In [289]:
new_tag = item_html_soup.select('article div.image_container a img')
print(new_tag[0].attrs.keys())

dict_keys(['src', 'alt', 'class'])


In [290]:
def find_item_name():
  '''Print the type of the matched element and return the item's name (its title attribute).'''
  title_anchor = item_html_soup.select_one('article.product_pod h3 a')  # CSS locator
  print(type(title_anchor))
  return title_anchor.attrs['title']


In [291]:
find_item_name()

<class 'bs4.element.Tag'>


'A Light in the Attic'

In [292]:
def find_item_link():
  '''Return the href attribute of the item's title link.'''
  title_anchor = item_html_soup.select_one('article.product_pod h3 a')  # CSS locator
  return title_anchor.attrs['href']

In [293]:
find_item_link()

'catalogue/a-light-in-the-attic_1000/index.html'

In [294]:
def find_item_price():
  '''Print the item's price discounted by 20% and return the actual price as a formatted string.'''
  price_text = item_html_soup.select_one('article.product_pod p.price_color').string
  actual = float(price_text.strip('$'))  # drop the currency sign before converting
  discounted = actual * 0.8
  print(f'Discounted price = {discounted}')
  return f'Actual price = {actual}'

In [295]:
find_item_price()

Discounted price = 41.416000000000004


'Actual price = 51.77'

In [296]:
# other_paragraph = [p.string for p in paragraphs if 'subtitle' not in p.attrs.get('class', [])]

In [297]:
locator = 'article.product_pod p.star-rating'
star_rating_tag = item_html_soup.select_one(locator)
classes = star_rating_tag.attrs['class']
rating_classes = [c for c in classes if c != 'star-rating']
rating_classes

['Three']

In [298]:
type(star_rating_tag)

bs4.element.Tag

## Creating a class for scraping Example

In [299]:
class ParsedItemLocators:
  '''
  CSS locators for the fields of an item in the HTML page.

  Keeping them together shows at a glance what the code looks at, and gives one place to
  change when the page markup turns out to be different.
  '''
  # The name and the link locators point at the same CSS path.
  NAME_LOCATOR = LINK_LOCATOR = 'article.product_pod h3 a'
  RATING_LOCATOR = 'article.product_pod p.star-rating'
  PRICE_LOCATOR = 'article.product_pod p.price_color'

class ParsedItem:
  '''
  Takes an HTML page (or a part of one) and finds the properties of the item in it.
  '''

  def __init__(self, page):
    '''Parse `page` with html.parser and keep the resulting soup on the instance.'''
    self.item_html_soup = bs(page, 'html.parser')

  def price(self):
    '''Print the price discounted by 20% and return the actual price as a formatted string.'''
    price_tag = self.item_html_soup.select_one(ParsedItemLocators.PRICE_LOCATOR)
    actual = float(price_tag.string.strip('$'))  # drop the currency sign before converting
    discounted = actual * 0.8
    print(f'Discounted price = {discounted}')
    return f'Actual price = {actual}'

  def link(self):
    '''Return the href attribute of the element matched by LINK_LOCATOR.'''
    anchor = self.item_html_soup.select_one(ParsedItemLocators.LINK_LOCATOR)
    return anchor.attrs['href']

  def name(self):
    '''Return the title attribute of the element matched by NAME_LOCATOR.'''
    anchor = self.item_html_soup.select_one(ParsedItemLocators.NAME_LOCATOR)
    return anchor.attrs['title']

  def rating(self):
    '''Return the first class of the rating element that is not 'star-rating'.'''
    rating_tag = self.item_html_soup.select_one(ParsedItemLocators.RATING_LOCATOR)
    other_classes = []
    for css_class in rating_tag.attrs['class']:
      if css_class != 'star-rating':
        other_classes.append(css_class)
    return other_classes[0]

In [300]:
item = ParsedItem(ITEM_HTML)

In [301]:
print(item.price())

Discounted price = 41.416000000000004
Actual price = 51.77


## Using Requests to get webpage content

In [302]:
page = requests.get('http://www.example.com')

In [303]:
page

<Response [200]>

In [304]:
soup = bs(page.content, 'html.parser')

In [305]:
headers = soup.select('h1')

In [306]:
type(headers)

list

## Scraping Wikipedia

In [307]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives'
table_id = 'votingmembers'

response = requests.get(wiki_url)
soup = bs(response.text, 'html.parser')

In [308]:
congress_table = soup.find('table', attrs= {'id' : table_id})

In [309]:
# read_html returns a list of DataFrames, one per table found. The table's HTML is wrapped in
# StringIO because passing literal markup as a plain string is deprecated in recent pandas.
df = pd.read_html(StringIO(str(congress_table)))

## Scraping 100 meters track event from Wikipedia

In [310]:
url = 'https://en.wikipedia.org/wiki/100_metres'
response = requests.get(url).content
soup_100 = bs(response, 'html.parser')

In [311]:
locators  = 'div.mw-parser-output table.wikitable'

In [312]:
table = soup_100.select_one(locators)

In [313]:
# read_html returns a list of DataFrames, one per table found. The table's HTML is wrapped in
# StringIO because passing literal markup as a plain string is deprecated in recent pandas.
df = pd.read_html(StringIO(str(table)))

In [314]:
df = df[0]

In [315]:
df

Unnamed: 0_level_0,Area,Men,Men,Men,Men,Women,Women,Women,Women
Unnamed: 0_level_1,Area,Time (s),Wind (m/s),Athlete,Nation,Time (s),Wind (m/s),Athlete,Nation
0,Africa (records),9.77[A],1.2,Ferdinand Omanyala,Kenya,10.72,+1.4,Marie-Josée Ta Lou,Ivory Coast
1,Asia (records),9.83,0.9,Su Bingtian,China,10.79,0.0,Li Xuemei,China
2,Europe (records),9.80,0.1,Marcell Jacobs,Italy,10.73,+2.0,Christine Arron,France
3,"North, Central America and Caribbean (records)",9.58 WR,0.9,Usain Bolt,Jamaica,10.49 WR,0.0[a],Florence Griffith-Joyner,United States
4,Oceania (records),9.93,1.8,Patrick Johnson,Australia,11.08,+0.7,Zoe Hobbs,New Zealand
5,South America (records),10.00[A],1.6,Robson da Silva,Brazil,10.91,−0.2,Rosângela Santos,Brazil


In [316]:
locator2 = 'div.mw-parser-output table.wikitable.sortable'

In [317]:
table_2 = soup_100.select_one(locator2)

In [318]:
# read_html returns a list of DataFrames, one per table found. The table's HTML is wrapped in
# StringIO because passing literal markup as a plain string is deprecated in recent pandas.
df2 = pd.read_html(StringIO(str(table_2)))

In [319]:
df2 = df2[0]

In [320]:
df2.columns = ['athlete_num', 'performance', 'time(s)', 'wind(m/s)', 'athlete', 'nation', 'date', 'place', 'ref']

In [321]:
# Discard the two columns that are not needed, rebinding df2 to the trimmed frame.
df2 = df2.drop(columns=['athlete_num', 'ref'])

In [322]:
# Surname fragment -> name to store. Every 'athlete' value containing the fragment is
# overwritten with the full name, so each athlete ends up under a single spelling.
athlete_names = {
  'Bolt': 'Usain Bolt',
  'Gay': 'Tyson Gay',
  'Blake': 'Yohan Blake',
  'Gatlin': 'Justin Gatlin',
  'Powell': 'Asafa Powell',
  'Bromell': 'Trayvon Bromell',
  'Kerley': 'Fred Kerley',
}
for surname, full_name in athlete_names.items():
  # regex=False matches the surname as plain text; na=False makes rows with a missing
  # athlete count as "no match", since a mask containing NaN cannot be used with .loc.
  is_match = df2['athlete'].str.contains(surname, regex=False, na=False)
  df2.loc[is_match, 'athlete'] = full_name

In [323]:
df2_no_nan = df2.dropna(subset = ['nation'], axis = 0)

In [324]:
athletes = list(df2.athlete.unique())

In [325]:
# athlete -> nation lookup, built from the rows whose nation is present.
at_nation = dict(zip(df2_no_nan['athlete'], df2_no_nan['nation']))

In [326]:
df2['nation'] = df2['athlete'].map(at_nation)

In [327]:
df2.shape

(59, 7)

In [328]:
df2

Unnamed: 0,performance,time(s),wind(m/s),athlete,nation,date,place
0,1.0,9.58,+0.9,Usain Bolt,Jamaica,16 AUG 2009,Berlin
1,2.0,9.63,+1.5,Usain Bolt,Jamaica,05 AUG 2012,London
2,3.0,9.69,±0.0,Usain Bolt,Jamaica,16 AUG 2008,Beijing
3,3.0,9.69,+2.0,Tyson Gay,United States,20 SEP 2009,Shanghai
4,3.0,9.69,−0.1,Yohan Blake,Jamaica,23 AUG 2012,Lausanne
5,6.0,9.71,+0.9,Tyson Gay,United States,16 AUG 2009,Berlin
6,7.0,9.72,+1.7,Usain Bolt,Jamaica,31 MAY 2008,New York City
7,7.0,9.72,+0.2,Asafa Powell,Jamaica,02 SEP 2008,Lausanne
8,9.0,9.74,+1.7,Asafa Powell,Jamaica,09 SEP 2007,Rieti
9,9.0,9.74,+0.9,Justin Gatlin,United States,15 MAY 2015,Doha


## Extracting EPL related information from Wikipedia

In [329]:
url = 'https://en.wikipedia.org/wiki/2021%E2%80%9322_Premier_League'
page = requests.get(url).content

# BeautifulSoup is imported in this notebook under the alias `bs`; the bare name
# `BeautifulSoup` is never bound, so using it raises NameError on a fresh kernel.
soup_epl = bs(page, 'html.parser')


In [330]:
html_table = soup_epl.find_all('table', attrs={'class' : 'wikitable'})

In [331]:
# html_table = soup_epl.select('table.wikitable')

In [332]:
type(html_table)

bs4.element.ResultSet

In [333]:
# read_html returns a list of DataFrames, one per table found. The markup is wrapped in
# StringIO because passing literal markup as a plain string is deprecated in recent pandas.
df_epl = pd.read_html(StringIO(str(html_table)))

In [334]:
len(df_epl)

12

In [335]:
d = {}

In [336]:
# Store every scraped table under 'df_epl_<index>'. Iterating over df_epl itself (instead of
# a fixed count of 12) keeps the loop correct however many tables the page returned.
for i, epl_frame in enumerate(df_epl):
  d[f'df_epl_{i}'] = epl_frame

In [337]:
d.keys()

dict_keys(['df_epl_0', 'df_epl_1', 'df_epl_2', 'df_epl_3', 'df_epl_4', 'df_epl_5', 'df_epl_6', 'df_epl_7', 'df_epl_8', 'df_epl_9', 'df_epl_10', 'df_epl_11'])

In [338]:
d['df_epl_8']

Unnamed: 0,Rank,Player,Club,Clean sheets[155]
0,1,Alisson,Liverpool,20
1,1,Ederson,Manchester City,20
2,3,Hugo Lloris,Tottenham Hotspur,16
3,4,Édouard Mendy,Chelsea,14
4,5,Aaron Ramsdale,Arsenal,12
5,6,Vicente Guaita,Crystal Palace,11
6,6,Emiliano Martínez,Aston Villa,11
7,6,José Sá,Wolverhampton Wanderers,11
8,6,Robert Sánchez,Brighton & Hove Albion,11
9,10,Nick Pope,Burnley,9


## Scraping EPL site

In [339]:
def epl_tables(url, data_compseason):
  '''
  Scrape the league table rows of a page into a DataFrame.

  Parameters
  ----------
  url : str
      Address of the page to download.
  data_compseason : int or str
      Value of the `data-compseason` attribute that a <tr> row must carry to be kept.

  Returns
  -------
  pandas.DataFrame
      One row per matching <tr>, with the columns position, club, played, won, drawn, lost,
      gf, ga, gd and points. gf and ga are converted to integers and gd is recomputed as
      gf - ga; the other columns hold the values as scraped.

  Raises
  ------
  ValueError
      If a matching row has fewer <td> cells than the eleven read here.
  '''
  page = requests.get(url).content
  # BeautifulSoup is imported in this notebook under the alias `bs`.
  soup_epl2 = bs(page, 'html.parser')
  rows = soup_epl2.find_all('tr', attrs={'data-compseason': data_compseason})

  # The statistics are read from consecutive <td> cells, starting at index 3.
  stat_columns = ['played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points']
  first_stat_cell = 3
  cells_needed = first_stat_cell + len(stat_columns)

  records = []
  # Walk the rows actually found rather than assuming a fixed number of clubs.
  for row in rows:
    cells = row.find_all('td')  # look the cells up once per row
    if len(cells) < cells_needed:
      raise ValueError(
        f'Expected at least {cells_needed} <td> cells in a table row, found {len(cells)}'
      )
    record = {
      'position': row['data-position'],
      'club': row['data-filtered-table-row-name'],
    }
    for offset, column in enumerate(stat_columns):
      record[column] = cells[first_stat_cell + offset].string
    records.append(record)

  epl_table = pd.DataFrame(records, columns=['position', 'club'] + stat_columns)
  epl_table['gf'] = epl_table['gf'].astype(int)
  epl_table['ga'] = epl_table['ga'].astype(int)
  # Goal difference is derived from the integer columns instead of the scraped text.
  epl_table['gd'] = epl_table['gf'] - epl_table['ga']
  return epl_table

In [340]:
url = 'https://www.premierleague.com/tables?team=FIRST'

In [341]:
epl_tables(url, 489)

Unnamed: 0,position,club,played,won,drawn,lost,gf,ga,gd,points
0,1,Arsenal,7,6,0,1,17,7,10,18
1,2,Manchester City,7,5,2,0,23,6,17,17
2,3,Tottenham Hotspur,7,5,2,0,18,7,11,17
3,4,Brighton and Hove Albion,6,4,1,1,11,5,6,13
4,5,Manchester United,6,4,0,2,8,8,0,12
5,6,Fulham,7,3,2,2,12,11,1,11
6,7,Chelsea,6,3,1,2,8,9,-1,10
7,8,Liverpool,6,2,3,1,15,6,9,9
8,9,Brentford,7,2,3,2,15,12,3,9
9,10,Newcastle United,7,1,5,1,8,7,1,8
