In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Download the page and convert it to beautiful soup object
nfl_url = "https://sports.yahoo.com/nfl/stats/"
res = requests.get(nfl_url)
soup = BeautifulSoup(res.content, 'html.parser')
# or soup = BeautifulSoup(res.content, 'lxml') if you have lxml installed

In [3]:
# BeautifulSoup creates a tree-like structure 
# Here are a few ways to navigate the DOM
title_tag = soup.head.title
print("Title tag: {}".format(title_tag))

Title tag: <title>NFL on Yahoo! Sports - News, Scores, Standings, Rumors, Fantasy Games</title>


In [4]:
title_tag_text = soup.head.title.get_text()
print("Title Tag Text: {}".format(title_tag_text))

Title Tag Text: NFL on Yahoo! Sports - News, Scores, Standings, Rumors, Fantasy Games


In [5]:
# Find all the paragaph elements (<b>)
soup.find_all('b')

[<b class="P(4px)" data-reactid="19">Menu</b>,
 <b class="Hidden" data-reactid="22">Yahoo</b>,
 <b class="D(ib) Mt(10px) Fz(10px) Tt(u) W(107px) C(#fff)" data-reactid="23" id="uh-logo-site">Sports</b>,
 <b class="P(4px)" data-reactid="28">Mail</b>,
 <b class="ModalShim" data-reactid="1844"></b>,
 <b class="IEShim" data-reactid="1845"></b>]

In [6]:
# Get urls in page via the 'a' tag and filter for nfl/players in urls
nfl_players_url = []
for link in soup.find_all('a'):
    link_url = link.get('href')
    # discard "None"
    if link_url:
        if "nfl/players" in link_url:
            print(link_url)
            nfl_players_url.append(link_url)

https://sports.yahoo.com/nfl/players/5479/
https://sports.yahoo.com/nfl/players/6763/
https://sports.yahoo.com/nfl/players/7426/
https://sports.yahoo.com/nfl/players/6770/
https://sports.yahoo.com/nfl/players/26483/
https://sports.yahoo.com/nfl/players/30218/
https://sports.yahoo.com/nfl/players/28398/
https://sports.yahoo.com/nfl/players/30972/
https://sports.yahoo.com/nfl/players/28014/
https://sports.yahoo.com/nfl/players/25807/
https://sports.yahoo.com/nfl/players/9353/
https://sports.yahoo.com/nfl/players/29281/
https://sports.yahoo.com/nfl/players/29399/
https://sports.yahoo.com/nfl/players/24793/
https://sports.yahoo.com/nfl/players/27535/
https://sports.yahoo.com/nfl/players/27647/
https://sports.yahoo.com/nfl/players/25719/
https://sports.yahoo.com/nfl/players/29718/
https://sports.yahoo.com/nfl/players/31040/
https://sports.yahoo.com/nfl/players/9072/
https://sports.yahoo.com/nfl/players/6770/
https://sports.yahoo.com/nfl/players/30123/
https://sports.yahoo.com/nfl/players/87

In [7]:
# Look at a single link
one_url = nfl_players_url[0]
res_one_url = requests.get(one_url)
soup_one_url = BeautifulSoup(res_one_url.content, 'html.parser')

In [8]:
# Find the line with Birth Place
lines = soup_one_url.text
res2 = lines.split(",")
key_line = []
for line in res2:
    if "Birth" in line:
        print(line)
        key_line.append(line)

 New Orleans Yds3992Y/G266.1TD32QBRat115.7Height: 6' 0"Weight: 209Born: College: PurdueBirth Place: Austin


In [9]:
# Extract birthplace
birth_place = key_line[0].split(":")[-1].strip()
print(birth_place)

Austin


In [10]:
def find_birthplaces(urls):
    """Get the Birthplaces"""
    for url in urls:
        profile = requests.get(url)
        profile_url = BeautifulSoup(profile.content, "html.parser")
        lines = profile_url.text
        res2 = lines.split(",")
        key_line = []
        for line in res2:
            if "Birth" in line:
                # print (line)
                key_line.append(line)
        birth_place = None
        try:
            birth_place = key_line[0].split("*")[-1].strip()
        except IndexError:
            print(f"skipping {url}")
        print(birth_place)
        
find_birthplaces(nfl_players_url)

New Orleans Yds3992Y/G266.1TD32QBRat115.7Height: 6' 0"Weight: 209Born: College: PurdueBirth Place: Austin
LA Chargers Yds4308Y/G269.3TD32QBRat105.5Height: 6' 5"Weight: 228Born: College: North Carolina StateBirth Place: Decatur
Tampa Bay Yds2366Y/G295.8TD17QBRat100.4Height: 6' 2"Weight: 223Born: College: HarvardBirth Place: Gilbert
Pittsburgh Yds5129Y/G320.6TD34QBRat96.5Height: 6' 5"Weight: 240Born: College: Miami (OH)Birth Place: Findlay
Denver Yds3890Y/G243.1TD18QBRat81.2Height: 6' 1"Weight: 215Born: College: HoustonBirth Place: Brownwood
Pittsburgh Yds973Y/G74.8TD12Height: 6' 1"Weight: 233Born: College: PittsburghBirth Place: Erie
LA Rams Yds1251Y/G89.4TD17Height: 6' 1"Weight: 224Born: College: GeorgiaBirth Place: Baltimore
NY Giants Yds1307Y/G81.7TD11Height: 6' 0"Weight: 233Born: College: Penn StateBirth Place: Bronx
NY Jets Yds685Y/G52.7TD6Height: 5' 11"Weight: 225Born: College: Alabama StateBirth Place: Columbus
Houston Yds973Y/G69.5TD5Height: 5' 10"Weight: 221Born: College: Miami

In [11]:
class HTMLTable:
    """A class that will parse the first HTML Table"""
    def __init__(self, html, html_parser='html5lib'):
        self.html    = html
        self.soup    = BeautifulSoup(html, html_parser)
        self.table   = self.soup.find('table')
        self.headers = None
        self.data    = None
        
    def get_headers(self):
        """Returns the values in the thead element of the table"""
        if self.headers:
            return self.headers
        self.headers = [hdr.get_text() for hdr in self.table.thead.select('tr th')]
        return self.headers
    
    def get_data(self):
        """Returns the values in the tbody element of the table"""
        if self.data:
            return self.data
        data = []
        for tr in self.table.tbody.find_all("tr"):
            data.append([td.get_text().replace(" ", "").replace("\n", "") for td in tr.select('td')])
        self.data = data
        return self.data
    
    def write_csv(self, filename):
        """Write a csv of the parsed html table"""
        import csv
        print("Writing the table to {}".format(filename))
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(self.get_headers()) # First write out the headers
            writer.writerows(self.get_data())  # Then write out all the data
        print("Done!")
        
    def to_dataframe(self):
        """Returns a pandas dataframe of the table"""
        import pandas as pd
        return pd.DataFrame(self.get_data(), columns=self.get_headers())

In [12]:
# Another example
market_url = 'https://www.investing.com/crypto/currencies'
# Set a custom user-agent header
HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0)\
                          AppleWebKit/537.36 (KHTML, like Gecko)\
                          Chrome/66.0.3359.139 Safari/537.36'}
response = requests.get(market_url, headers=HEADERS)
# Check that we got an 'ok' status
response.raise_for_status()
# Parse the content into a string (watch out for unicode!)
raw_html = response.content.decode("utf-8")

table = HTMLTable(raw_html)

In [13]:
table.get_headers()

['#',
 '\xa0',
 'Name ',
 'Symbol',
 'Price (USD)',
 ' Market Cap',
 'Vol (24H)',
 'Total Vol',
 'Chg (24H)',
 'Chg (7D)']

In [14]:
table.get_data()[:4]

[['1',
  '',
  'Bitcoin',
  'BTC',
  '3,509.8',
  '$62.32B',
  '$4.62B',
  '31.81%',
  '-2.38%',
  '-12.47%'],
 ['2',
  '',
  'XRP',
  'XRP',
  '0.3178',
  '$13.26B',
  '$486.15M',
  '3.35%',
  '-2.28%',
  '-12.17%'],
 ['3',
  '',
  'Ethereum',
  'ETH',
  '116.6',
  '$12.35B',
  '$2.28B',
  '15.68%',
  '-5.34%',
  '-24.62%'],
 ['4',
  '',
  'BitcoinCash',
  'BCH',
  '124.6',
  '$2.23B',
  '$177.84M',
  '1.23%',
  '-5.68%',
  '-24.04%']]

In [15]:
%%bash
TODAY=$(date +%Y%m%d)
head Currencies${TODAY}.csv | column -t -s,

#               Name   Symbol    Price (USD)   Market Cap  Vol (24H)  Total Vol  Chg (24H)  Chg (7D)
1  Bitcoin      BTC    "3        483.7"       $62.07B      $4.64B     32.07%     -3.26%     -12.77%
2  XRP          XRP    0.31182   $13.03B      $442.98M     3.06%      -4.68%     -13.66%
3  Ethereum     ETH    114.42    $12.19B      $2.26B       15.60%     -7.67%     -25.83%
4  BitcoinCash  BCH    121.53    $2.18B       $178.56M     1.23%      -8.75%     -25.58%
5  EOS          EOS    2.2086    $2.04B       $653.51M     4.52%      -7.46%     -21.46%
6  Stellar      XLM    0.10097   $1.98B       $120.26M     0.83%      -3.35%     -14.17%
7  Tether       USDT   1.00891   $1.97B       $3.35B       23.17%     -0.03%     -0.10%
8  Litecoin     LTC    29.608    $1.82B       $559.51M     3.87%      -6.89%     -22.57%
9  TRON         TRX    0.020759  $1.42B       $170.72M     1.18%      -11.38%    -7.93%


In [16]:
df = table.to_dataframe()
df.head()

Unnamed: 0,#,Unnamed: 2,Name,Symbol,Price (USD),Market Cap,Vol (24H),Total Vol,Chg (24H),Chg (7D)
0,1,,Bitcoin,BTC,3509.8,$62.32B,$4.62B,31.81%,-2.38%,-12.47%
1,2,,XRP,XRP,0.3178,$13.26B,$486.15M,3.35%,-2.28%,-12.17%
2,3,,Ethereum,ETH,116.6,$12.35B,$2.28B,15.68%,-5.34%,-24.62%
3,4,,BitcoinCash,BCH,124.6,$2.23B,$177.84M,1.23%,-5.68%,-24.04%
4,5,,EOS,EOS,2.2534,$2.07B,$662.83M,4.57%,-5.40%,-20.09%
