In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
source = requests.get('https://keithgalli.github.io/web-scraping/example.html')

soup = BeautifulSoup(source.content,'lxml')
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [3]:
first_header = soup.find('h2').text
print(first_header)

A Header


In [4]:
headers = soup.find_all('h2')
print(headers)


[<h2>A Header</h2>, <h2>Another header</h2>]


### Implementing attributes

In [5]:
paragraph = soup.find_all('p', attrs={'id':'paragraph-id'})
print(paragraph)


[<p id="paragraph-id"><b>Some bold text</b></p>]


### Nesting find, find_all

In [6]:
body = soup.find('body')
div = body.find('div')
print(div)

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>


### Seaching specific strings in find, find_all

In [7]:
import re

paragraphs = soup.find_all('p',string=re.compile('Some'))
paragraphs

headers = soup.find_all('h2',string=re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [8]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [9]:
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [10]:
paragraphs = soup.select('h2 ~p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [12]:
paragraphs = soup.select('body > p')
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


### Getting different properties of html

In [13]:
soup.find('h2').text


'A Header'

In [14]:
div = soup.find('div')
print(div.prettify())

print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [15]:
# Getting specific property from a element
link = soup.find('a')
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

### Code navigation

In [16]:
# Know the terms: Parent, siblings, child
soup.body
soup.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Exercise 

In [17]:
source = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')

webpage = BeautifulSoup(source.content,'lxml')
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

#### Grabbing all social links

In [18]:
# Method 1

ul = webpage.find('ul',class_='socials')

ul.find_all('li')

for link in ul.find_all('a'):
    print(link['href'])
    print()

https://www.instagram.com/keithgalli/

https://twitter.com/keithgalli

https://www.linkedin.com/in/keithgalli/

https://www.tiktok.com/@keithgalli



In [19]:
# Method 2

ul = webpage.find('ul',class_='socials')
# print(ul)

for link in ul.select('a'):
    print(link['href'])
    print()

https://www.instagram.com/keithgalli/

https://twitter.com/keithgalli

https://www.linkedin.com/in/keithgalli/

https://www.tiktok.com/@keithgalli



#### Scraping html table

In [20]:
import pandas as pd


table = webpage.find('table')
df = pd.read_html(str(table))[0]
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [21]:
df['Team']

0    MIT (Mass. Inst. of Tech.)
1    MIT (Mass. Inst. of Tech.)
2    MIT (Mass. Inst. of Tech.)
3                  Did not play
4    MIT (Mass. Inst. of Tech.)
Name: Team, dtype: object

#### All fun facts that use words 'is'

In [22]:
import re

for facts in webpage.select('ul.fun-facts li'):
    print(facts.find(string=re.compile('is')))

None
Middle name is Ronald
None
Dunkin Donuts coffee is better than Starbucks
A favorite book series of mine is 
Current video game of choice is 
The band that I've seen the most times live is the 


#### Dowloading images

In [23]:

url = 'https://keithgalli.github.io/web-scraping/'

source = requests.get(url+"webpage.html")    

images = webpage.select('div.row div.column img')
image_url = images[0]['src']
full_url = url+image_url

img_data = requests.get(full_url).content
with open('lake_como.jpeg','wb') as handler:
    handler.write(img_data)

#### Mystery Challenge

In [45]:
url = 'https://keithgalli.github.io/web-scraping/'

# source = requests.get(url+"file_link")

n = range(1,11)

file =  webpage.find("div",class_='block')
fil = file.find('a')
file_link = fil['href']

file_full_url = url+file_link
file_full_url

'https://keithgalli.github.io/web-scraping/challenge/file_1.html'