<a href="https://colab.research.google.com/github/Dele2/DelesPlace/blob/master/Web_Scrapping_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load in the necessary libraries

In [2]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [3]:
# Load the webpage content. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us from silently parsing
# an HTTP error page as if it were the example document.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be
# installed; "html.parser" ships with Python.
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### find and find_all 

In [4]:
# find: returns only the first tag that matches (here the first <h2>)
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [5]:
# find_all: returns every matching tag, collected in a list
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# Pass in a list of tag names to look for; a tag matching any name in the
# list is returned (here both the <h1> and the <h2> tags)
headers = soup.find_all(["h1","h2"])
headers


[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [7]:
# With only a tag name, find_all returns every <p> in the document.
# (Narrowing the match by attributes is shown in the next cell.)
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [8]:
# You can pass in attributes to the find/find_all function: attrs restricts
# the match to <p> tags whose id is "paragraph-id"
paragraph = soup.find_all("p", attrs = {"id" : "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [9]:
# You can nest find/find_all calls: first grab the <body> tag, which can
# then be searched on its own (see the following cells)
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [10]:
# Search inside <body> only: find the first <div> nested in it
body = soup.find('body')
div = body.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [11]:
# Go one level deeper: body -> div -> h1, each find scoped to the previous result
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [12]:
# We can search for specific strings in our find/find_all calls
import re

# string= takes a compiled regex and keeps only the <p> tags whose text
# contains a match for it
paragraphs = soup.find_all("p", string =  re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [13]:
# The pattern accepts either capitalisation and may match anywhere in the
# text, so both "A Header" and "Another header" are returned
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [14]:
# select takes a CSS selector and returns a list of matches; "p" selects
# every <p> tag
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [15]:
# Descendant selector: only <p> tags located inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [16]:
# General sibling selector: <p> tags that come after an <h2> under the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [17]:
# <b> tags inside the <p> whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [18]:
# Child selector: only <p> tags that are direct children of <body>, so the
# <p> nested inside the <div> is left out
paragraphs = soup.select("body > p")
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [19]:
# select also works on an individual tag: look for <i> tags inside each
# paragraph collected in the previous cell (an empty list means no match)
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<i>Some italicized text</i>]
[]


In [20]:
# Grab elements by a specific attribute value: any tag whose align
# attribute is "middle"
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the HTML

In [21]:
# Use .string to read the text of a tag that contains a single piece of text
header = soup.find("h2")
header.string

'A Header'

In [22]:
# If a tag has multiple child elements, use get_text: it returns the text of
# all nested tags combined (compare the two printed outputs)
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [23]:
# Get a specific property from an element: start by finding the first <a> tag
link = soup.find("a")
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [24]:
# Attributes are read with dictionary-style indexing on the tag
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [25]:
# select returns a list, so index into it before reading the id attribute
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

In [26]:
# Path syntax: attribute access navigates to a nested tag by name
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [27]:
# The tag does not have to be a direct child: this <h1> sits inside the <div>
soup.body.h1

<h1>HTML Webpage</h1>

In [28]:
# .string can be chained onto a tag reached through path syntax
soup.body.h1.string

'HTML Webpage'

In [29]:
# Know the terms: Parent, Sibling, Child
# find_next_siblings returns the tags that follow the <div> at the same
# level of the tree (here the <h2> and <p> tags directly under <body>)
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Grab all of the social links from the Webpage

Do this in at least 3 different ways

In [30]:
# Load the webpage content. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us from silently parsing
# an HTTP error page as if it were the real document.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be
# installed; "html.parser" ships with Python.
web_page = bs(r.content, "html.parser")


In [31]:
# 1st way using find: locate the <ul class="socials"> container, then read
# the href of every anchor nested inside it
ulist = web_page.find("ul", attrs={"class": "socials"})
links = [anchor["href"] for anchor in ulist.find_all("a")]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [32]:
# 2nd way using select: one CSS selector reaches the anchors directly, and
# the href of each is collected in the same step
links = [anchor["href"] for anchor in web_page.select("ul.socials a")]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [33]:
# 3rd way: find every element carrying the class "social" (the individual
# list items), then read the href of each anchor inside those elements.
# The result is built from `ulinks` itself, so this cell really is an
# independent approach and does not rely on `ulist` from the 1st way.
ulinks = web_page.find_all(class_="social")
links = [anchor["href"] for item in ulinks for anchor in item.find_all("a")]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [34]:
# 4th way: CSS selector for anchors inside <li class="social"> items
links = web_page.select("li.social a")
# Keep the tags in `links` and put the extracted URLs in a separate list
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape a Table

In [35]:
# Scrape the hockey stats table into a DataFrame.
# (pandas is imported as `pd` in the imports cell at the top of the notebook.)

# select() returns a list; the first table with class "hockey-stats" is used
table = web_page.select("table.hockey-stats")[0]

# Header cells: take each cell's text the same way as the body cells below,
# so a header that is empty or contains nested markup still yields a plain
# string instead of None
columns = table.find("thead").find_all("th")
column_names = [th.get_text().strip() for th in columns]

# Body rows: one list of stripped cell texts per <tr>. The inner loop
# variable has its own name so it no longer shadows the row variable.
table_rows = table.find("tbody").find_all("tr")
my_list = [
    [cell.get_text().strip() for cell in tr.find_all("td")]
    for tr in table_rows
]

df = pd.DataFrame(my_list, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Find all the fun facts that have the word 'is' in them

In [47]:
# Collect the text of every fun fact in the "fun-facts" list
fun_facts = web_page.select("ul.fun-facts li")
print(fun_facts)
my_list = [fun_fact.get_text() for fun_fact in fun_facts]

# Keep only the facts containing "is" as a whole word. A plain substring
# test ("is" in fact) would also accept words such as "this" or "his";
# the \b word boundaries rule those out.
is_word = re.compile(r"\bis\b")
is_list = [fact for fact in my_list if is_word.search(fact)]
is_list

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>, <li>Middle name is Ronald</li>, <li>Never had been on a plane until college</li>, <li>Dunkin Donuts coffee is better than Starbucks</li>, <li>A favorite book series of mine is <i>Ender's Game</i></li>, <li>Current video game of choice is <i>Rocket League</i></li>, <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]


['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Get Secret Message

In [37]:
# Collect the anchors found in list items under div.block and read their
# href values; as the output shows, these are relative paths
# (challenge/file_N.html), not full URLs.
# NOTE: this rebinds `links` and `actual_links` from the social-links cells.
links = web_page.select("div.block ul li a")
actual_links = [link['href'] for link in links]
actual_links

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [62]:
# Method 1 of finding secret message
# Base address that the relative challenge links are resolved against
url = "https://keithgalli.github.io/web-scraping/"
message_list = []

for link in actual_links:
    # urljoin resolves the relative href against the base address
    page = requests.get(urljoin(url, link), timeout=10)
    page.raise_for_status()
    # Convert to a beautiful soup object (parser named explicitly so the
    # result does not depend on which parsers are installed)
    bs_page = bs(page.content, "html.parser")

    # Find secret word
    s_word = bs_page.select("p#secret-word")
    # select() returns a list, so extend() keeps message_list flat; a page
    # without a secret word simply contributes nothing
    message_list.extend(s_word)

# Pull the text out once, after every page has been visited. Doing this
# inside the loop rebuilt the whole list on each iteration and left
# `message` undefined when there were no links at all.
message = [element.get_text() for element in message_list]

print(" ".join(message))

Make sure to smash that like button and subscribe !!!


In [70]:
# Method 2 of finding secret message
# Base address defined here as well, so this cell does not depend on the
# Method 1 cell having been run first
url = "https://keithgalli.github.io/web-scraping/"
secret_words = []
for link in actual_links:
    # urljoin resolves the relative href against the base address
    page = requests.get(urljoin(url, link), timeout=10)
    page.raise_for_status()
    # Convert to a beautiful soup object (parser named explicitly so the
    # result does not depend on which parsers are installed)
    bs_page = bs(page.content, "html.parser")

    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    if secret_word_element is None:
        # No secret word on this page: skip it, matching Method 1, which
        # also contributes nothing for such a page
        continue
    # get_text() always returns a string, whereas .string is None when the
    # tag holds more than one child
    secret_word = secret_word_element.get_text()
    secret_words.append(secret_word)

# Joining at the end puts a single space between words without the leading
# space that repeated `" " + word` concatenation produced
message = " ".join(secret_words)

print(message)

 Make sure to smash that like button and subscribe !!!
