## Dependencies & Set-up

In [1]:
# Dependencies
from urllib.parse import urljoin

from bs4 import BeautifulSoup as soup
from splinter import Browser

In [2]:
# Tell splinter where the chromedriver binary lives, then launch a Chrome session
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', **executable_path)

## NASA Mars Article Scraping

In [3]:
# Load the NASA Mars news listing in the automated browser
url = "https://mars.nasa.gov/news/"
browser.visit(url)
# Check (with wait_time=1) that the first news slide is present before parsing;
# the boolean result is shown as the cell output
browser.is_element_present_by_css('ul.item_list li.slide', wait_time=1)

True

In [4]:
# Hand the browser's current HTML to BeautifulSoup and keep the first
# element matching the news-slide selector
html = browser.html
news_soup = soup(html, features='html.parser')
slide_elem = news_soup.select_one("ul.item_list li.slide")

In [5]:
# Look at the title container of the selected article
slide_elem.find('div', attrs={'class': 'content_title'})

<div class="content_title"><a href="/news/8848/nasas-perseverance-pays-off-back-home/" target="_self">NASA's Perseverance Pays Off Back Home</a></div>

In [6]:
# Pull the text out of the 'content_title' div and keep it as the headline
news_title = slide_elem.find("div", attrs={"class": "content_title"}).get_text()
news_title

"NASA's Perseverance Pays Off Back Home"

In [7]:
# Pull the teaser text from the same slide's 'article_teaser_body' div
news_p = slide_elem.find("div", attrs={"class": "article_teaser_body"}).get_text()
news_p

'Even as the Perseverance rover approaches Mars, technology on board is paying off on Earth.'

## NASA JPL Image Scraping

In [8]:
# Navigate to the JPL space images page
url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"
browser.visit(url)

In [9]:
# The second <button> on the page (index 1) is clicked to open the full-size image
full_image_elem = browser.find_by_tag("button")[1]
full_image_elem.click()

In [10]:
# Re-read the page HTML after the click and parse it
html = browser.html
img_soup = soup(html, features="html.parser")

In [11]:
# Read the 'src' attribute of the first <img> carrying the 'fancybox-image' class
img_url_rel = img_soup.find("img", attrs={"class": "fancybox-image"}).get("src")
img_url_rel

'image/featured/mars1.jpg'

In [12]:
# Resolve the scraped path against the site's base URL. urljoin is used rather
# than string formatting so a root-relative ('/image/...') or already-absolute
# src still yields a valid URL instead of a malformed concatenation.
img_url = urljoin('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/', img_url_rel)
img_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg'