In [39]:
# dependencies
from splinter import Browser 
from bs4 import BeautifulSoup

# nowadays pages do not have static HTML, therefore we have dynamic HTML 
# this means javascript (or something) is pushing elements to your page which means you have to wait
# for a little bit for the elements to load, splinter helps us deal with this

In [2]:
# load up the browser, this returns the path of the chromedriver
# I didn't use this though, it didn't work, so I had to add the chromedriver.exe to the same folder as the jupyter notebook
# and then point it to 'chromedriver.exe' in the executable path below
!which chromedriver

/c/Web Drivers/chromedriver


In [4]:
# point the path to the chromedriver, note... you get the path after running !which chromedriver above
executable_path = {'executable_path': 'chromedriver.exe'}

# set up the browser, specify chrome, the double asterisk are kwargs (key word arguments)
browser = Browser('chrome', **executable_path, headless=False)
# if this results in a blank screen, go to the chrome shortcut, compatability, select run as Vista Service Pack 2

### NASA Mars News

In [20]:
# Step 1
# scrape the NASA Mars News site, collect the latest News Title and Paragraph text
# assign the text to variables to reference later


In [56]:
# specify the address
url = 'https://mars.nasa.gov/news/'

# open the site in the browser
browser.visit(url)

# since we are looking in the ul.item list and li.slide, we want to go there but delay before scraping
# you could also do a try/except logic to test for the presence of the element and retry or give up depending on result
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=2)

True

In [57]:
# use splinter to parse the html
html = browser.html

# put the html into beautifulsoup
news_soup = BeautifulSoup(html, 'html.parser')

In [58]:
# get the slide element using beautifulsoup
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [59]:
# see what we've captured
slide_elem

<li class="slide"><div class="image_and_description_container"><a href="/news/8458/a-rover-pit-stop-at-jpl/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">Working like a finely honed machine, a team of engineers in this time-lapse video clip install test wheels on another finely honed machine: NASA's Mars 2020 rover.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="A team of engineers at NASA's Jet Propulsion Laboratory in Pasadena, California, install the legs and wheels — otherwise known as the mobility suspension — on the Mars 2020 rover." src="/system/news_items/list_view_images/8458_rover-time-lapse-th.jpg"/></div><div class="bottom_gradient"><div><h3>A Rover Pit Stop at JPL</h3></div></div></a><div class="list_text"><div class="list_date">July 12, 2019</div><div class="content_title"><a href="/news/8458/a-rover-pit-stop-at-jpl/" target="_self">A Rover Pit Stop 

In [60]:
# parse through the element above
slide_elem.find('div', class_="content_title")

<div class="content_title"><a href="/news/8458/a-rover-pit-stop-at-jpl/" target="_self">A Rover Pit Stop at JPL</a></div>

In [26]:
# get the news title
news_title = slide_elem.find('div', class_="content_title").get_text()
# print the title
news_title

'A Rover Pit Stop at JPL'

In [27]:
# pull the paragraph
news_p = slide_elem.find('div', class_="article_teaser_body").text
news_p

"Working like a finely honed machine, a team of engineers in this time-lapse video clip install test wheels on another finely honed machine: NASA's Mars 2020 rover."

# JPL Space Images Featured Image

In [28]:
# visit the url for JPL featured space images
# use splinter to navigate the site and find the image url for the current featured Mars image
# assign the url string to a variable called featured_page_url
# make sure to find the image url to the full size .jpg image
# make sure to save a complete url string for this image


In [29]:
# get the url, using the same session as above
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
# visit the page
browser.visit(url)

In [6]:
# click the first link
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [31]:
browser.is_element_present_by_text('more info     ', wait_time=2)
# it turns out to get the photo we need to click the 'more info' button
more_info_elem = browser.find_link_by_partial_text('more info')

In [7]:
# click the 'more info' button
more_info_elem.click()

In [None]:
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

In [None]:
# get the image by drilling down into the class figure.lede, then
# an anchor (a) and then the image (img)
# get("src") is the source link of the photo

img_url_rel_path = img_soup.select_one('figure.lede a img').get("src")
# show the path
img_url_rel_path

In [None]:
# make an img url by appending the beginning to it
# another way to do this would be to open the image in a new window
# and get the entire address that way without feeding in the relative path
img_url = f'https://www.jpl.nasa.gov{img_url_rel_path}'
img_url

## Visit the Mars Twitter page, scrape the latest weather tweet

In [None]:
# visit the Mars twitter account and scrape the latest Mars weather tweet from the page
# save the tweet text for the weather report as a variable called mars_weather
url = 'https://twitter.com/marswxreport?lang=en'
# visit the page
browser.visit(url)

In [None]:
# get HTML from page
html = browser.html
tweet_soup = BeautifulSoup(html, 'html.parser')

# verify results
#tweet_soup

In [None]:
# parse through the results
tweet_soup.find(class_="TweetTextSize")

In [None]:
# get the tweet's weather report, store it to a variable
mars_weather = tweet_soup.find(class_="TweetTextSize").get_text()
# print the tweet to confirm
mars_weather

## Visit the Mars Facts webpage, use Pandas to scrape the planet stats

In [None]:
# visit the Mars Facts webpage and use Pandas to scrape the table containing facts about the planet (mass, diameter, etc.)
# use pandas to convert the data to a HTML table string

# get dependency, normally this would be at the top
import pandas as pd

# get the url
url = 'https://space-facts.com/mars/'

# use the read_html function in pandas to scrape tabular data
mars_tables = pd.read_html(url)

#see what we get
mars_tables

In [None]:
# check to see what kind of pandas object it is
type(mars_tables)

In [None]:
# Mars has data in the first and second table

# pull in the first table in the list (item 0)
mars_tbl1 = mars_tables[0]

# pull in the columns
mars_tbl1.columns=['Comparision', 'Mars', 'Earth']

# see what it looks like
mars_tbl1

In [None]:
# generate an HTML table from the dataframe

mars_tbl1_to_html_tbl = mars_tbl1.to_html()
mars_tbl1_to_html_tbl

In [None]:
# strip out unwanted newlines from the code to clean up the table

final_mars_tbl1 = mars_tbl1_to_html_tbl.replace('\n', '')
final_mars_tbl1

In [None]:
# save first table to an HTML file (optional)

# final_mars_tbl1.to_html('mars_table1.html')

In [None]:
# Mars also has data in a second table

# pull in the second table in the list (item 1)
mars_tbl2 = mars_tables[1]

# pull in the columns
mars_tbl2.columns=['0', '1']
mars_tbl2

In [None]:
# generate an HTML table from the dataframe

mars_tbl2_to_html_tbl = mars_tbl2.to_html()
mars_tbl2_to_html_tbl

In [None]:
# strip out unwanted newlines from the code to clean up the table

final_mars_tbl2 = mars_tbl2_to_html_tbl.replace('\n', '')
final_mars_tbl2

In [None]:
# save second table to an HTML file (optional)

# final_mars_tbl2.to_html('mars_table2.html')

## Mars Hemispheres, visit the USGS astrogeology site, get high res images for the hemispheres

In [None]:
# visit the USGS astrogeology site to obtain high res images for each of Mars' hemispheres
# save both the image & url for the full res hemisphere image and the hemisphere title containing the hemisphere name
# use a python dictionary to store the data using the keys img_url and title

# append the dictionary with the image url string and the hemisphere title to a list, the list will contain one
# dictionary for each hemisphere


In [40]:
# get the url
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# visit the page
browser.visit(url)

In [41]:
# click on the links
html = browser.html
hemi_soup = BeautifulSoup(html, 'html.parser')

# view the html
#hemi_soup

In [42]:
# find all the links
hemi_links = hemi_soup.find_all("a", class_="itemLink")
hemi_links

[<a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiaparelli_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><h3>Schiaparelli Hemisphere Enhanced</h3></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/syrtis_major_enhanced"><img alt="Syrtis Major Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/aae41197e40d6d4f3ea557f8cfe51d15_syrtis_major_enhanced.tif_thumb.png"/></a>,

In [67]:
# create empty dictionary to store the image url and title
hemisphere_image_urls = {}
# create empty list 
hi_list = []

In [68]:
# get name of hemisphere #1
h3_1 = browser.find_by_tag('h3')[0]
h3_1 = h3_1.value

# click the link #1
browser.click_link_by_partial_text('cerberus_enhanced')

In [69]:
# click the 'original' link to download high resolution image #1
browser.click_link_by_partial_text('cerberus_enhanced.tif')

# grab the url
image_1_url = browser.url
image_1_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'

In [70]:
# append title and image path to dictionary
hemisphere_image_urls = {"title": h3_1, 
                         "img_url": image_1_url}
get_hemi_item = hemisphere_image_urls.items()
# add to list
hi_list.append(get_hemi_item)
hi_list

[dict_items([('title', 'Cerberus Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced')])]

In [71]:
# navigate back to the previous page
browser.back()

# get name of hemisphere #2
h3_2 = browser.find_by_tag('h3')[1]
h3_2 = h3_2.value

# click the link #2
browser.click_link_by_partial_text('schiaparelli_enhanced')

In [72]:
# click the 'original' link to download the high resolution image #2
browser.click_link_by_partial_text('schiaparelli_enhanced.tif')

# grab the url
image_2_url = browser.url
image_2_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'

In [73]:
# append title and image path to dictionary
hemisphere_image_urls = {"title": h3_2, 
                         "img_url": image_2_url
                        }
get_hemi_item = hemisphere_image_urls.items()
# add to list
hi_list.append(get_hemi_item)
hi_list

[dict_items([('title', 'Cerberus Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced')]),
 dict_items([('title', 'Schiaparelli Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced')])]

In [74]:
# navigate back to the previous page
browser.back()
# click the link #3

# get name of hemisphere #3
h3_3 = browser.find_by_tag('h3')[2]
h3_3 = h3_3.value

browser.click_link_by_partial_text('syrtis_major_enhanced')

In [75]:
# click the 'original' link to download the high resolution image #3
browser.click_link_by_partial_text('syrtis_major_enhanced.tif')

# grab the url
image_3_url = browser.url
image_3_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'

In [76]:
# append title and image path to dictionary
hemisphere_image_urls = {"title": h3_3, 
                         "img_url": image_3_url
                        }
get_hemi_item = hemisphere_image_urls.items()
# add to list
hi_list.append(get_hemi_item)
hi_list

[dict_items([('title', 'Cerberus Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced')]),
 dict_items([('title', 'Schiaparelli Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced')]),
 dict_items([('title', 'Syrtis Major Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced')])]

In [77]:
# navigate back to the previous page
browser.back()

# get name of hemisphere #4
h3_4 = browser.find_by_tag('h3')[3]
h3_4 = h3_4.value

# click the link #4
browser.click_link_by_partial_text('valles_marineris_enhanced')

In [79]:
# click the 'original' link to download the high resolution image #4
browser.click_link_by_partial_text('valles_marineris_enhanced.tif')

# grab the url
image_4_url = browser.url
image_4_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'

In [80]:
# append title and image path to dictionary
hemisphere_image_urls = {"title": h3_4, 
                         "img_url": image_4_url
                        }
get_hemi_item = hemisphere_image_urls.items()
# add to list
hi_list.append(get_hemi_item)
hi_list

[dict_items([('title', 'Cerberus Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced')]),
 dict_items([('title', 'Schiaparelli Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced')]),
 dict_items([('title', 'Syrtis Major Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced')]),
 dict_items([('title', 'Valles Marineris Hemisphere Enhanced'), ('img_url', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced')])]

In [None]:
# END OF JUPYTER section

# Step 2
# convert the jupyter notebook into a python script called scrape_mars.py with a function called scrape
# the function will execute all of the scraping code from above and return one python dictionary containing all the
# scraped data
