In [3]:
# Import dependencies
from bs4 import BeautifulSoup as bs
import requests 
from splinter import Browser
import pandas as pd

In [5]:
# Start browser session for scraping
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [6]:
# Navigate to website 
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(url)
# Prepare soup for scraping
html = browser.html
soup = bs(html, 'html.parser')

In [7]:
print(soup.body.prettify())

<body id="news" style="">
 <svg display="none" height="0" width="0">
  <symbol height="30" id="circle_plus" viewbox="0 0 30 30" width="30">
   <g fill-rule="evenodd" transform="translate(1 1)">
    <circle cx="14" cy="14" fill="#fff" fill-opacity=".1" fill-rule="nonzero" r="14" stroke="inherit" stroke-width="1">
    </circle>
    <path class="the_plus" d="m18.856 12.96v1.738h-4.004v3.938h-1.848v-3.938h-4.004v-1.738h4.004v-3.96h1.848v3.96z" fill="inherit" stroke-width="0">
    </path>
   </g>
  </symbol>
  <symbol height="30" id="circle_arrow" viewbox="0 0 30 30" width="30" xmlns="http://www.w3.org/2000/svg">
   <g transform="translate(1 1)">
    <circle cx="14" cy="14" fill="#fff" fill-opacity=".1" r="14" stroke="inherit" stroke-width="1">
    </circle>
    <path class="the_arrow" d="m8.5 15.00025h7.984l-2.342 2.42c-.189.197-.189.518 0 .715l.684.717c.188.197.494.197.684 0l4.35-4.506c.188-.199.188-.52 0-.717l-4.322-4.48c-.189-.199-.496-.199-.684 0l-.684.716c-.189.197-.189.519 0 .716l2.3

In [8]:
# Scrape first news title and its article text
news_title = soup.select('div.content_title > a')[0].text
print(news_title)
news_p = soup.select('div.article_teaser_body')[0].text
print(news_p)

MOXIE Could Help Future Rockets Launch Off Mars
NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel.


In [9]:
# Alternate code for scraping above data
# news_title = soup.find_all('div', class_='content_title')[1].text
# news_p = soup.find_all('div', class_='article_teaser_body')[0].text
# print(news_title)
# print(news_p)

In [10]:
# Navigate to next website
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
# Prepare soup for scraping
html = browser.html
soup = bs(html, 'html.parser')

In [11]:
# Use splinter to navigate through web pages
browser.click_link_by_id('full_image')
browser.links.find_by_text('more info     ').click()
featured_image = browser.links.find_by_partial_href('hires').click()
# Set current url of browser to variable
featured_image_url = browser.url
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA17652'

In [12]:
# Navigate to next website
url = 'https://space-facts.com/mars/'
# Read html into pandas
table = pd.read_html(url)[0]
# Rename columns
table.columns = ['Description', 'Mars']
# Store table data in dictionary
table = table.to_dict('records')
table

[{'Description': 'Equatorial Diameter:', 'Mars': '6,792 km'},
 {'Description': 'Polar Diameter:', 'Mars': '6,752 km'},
 {'Description': 'Mass:', 'Mars': '6.39 × 10^23 kg (0.11 Earths)'},
 {'Description': 'Moons:', 'Mars': '2 (Phobos & Deimos)'},
 {'Description': 'Orbit Distance:', 'Mars': '227,943,824 km (1.38 AU)'},
 {'Description': 'Orbit Period:', 'Mars': '687 days (1.9 years)'},
 {'Description': 'Surface Temperature:', 'Mars': '-87 to -5 °C'},
 {'Description': 'First Record:', 'Mars': '2nd millennium BC'},
 {'Description': 'Recorded By:', 'Mars': 'Egyptian astronomers'}]

In [13]:
# Navigate to next website
base_url = 'https://astrogeology.usgs.gov'
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
# Prepare soup for scraping
html = browser.html
soup = bs(html, 'html.parser')

In [14]:
print(soup.body.prettify())

<body id="results">
 <header>
  <!--
			<h1>Astrogeology Science Center</h1>
-->
  <a href="https://www.usgs.gov/centers/astrogeo-sc" style="float:right;margin-top:10px;" target="_blank">
   <img alt="USGS: Science for a Changing World" class="logo" height="60" src="/images/usgs_logo_main_2x.png"/>
  </a>
  <a href="https://nasa.gov" style="float:right;margin-top:5px;margin-right:20px;" target="_blank">
   <img alt="NASA" class="logo" height="65" src="/images/logos/nasa-logo-web-med.png"/>
  </a>
  <a href="https://www.usgs.gov/centers/astrogeology-science-center/science/pds-cartography-and-imaging-sciences-node-usgs" style="float:right;margin-top:5px;margin-right: 10px;" target="_blank">
   <img alt="PDS Cartography and Imaging Science Node" class="logo" height="65" src="/images/pds_logo-invisible-web.png"/>
  </a>
 </header>
 <div class="wrapper">
  <!--
			<nav>
				<a id="nav-toggle" href="#" title="Navigation Menu">Menu</a>
<ul class="dropdown dropdown-horizontal" id="yw0">
<li><a

In [15]:
# Use splinter to find all links with <a> tag
links = soup.select('div.item > a')
links

[<a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/syrtis_major_enhanced"><img alt="Syrtis Major Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/55a0a1e2796313fdeafb17c35925e8ac_syrtis_major_enhanced.tif_thumb.png"/></a>,
 <a class="itemLink product-item" href="/search/map/Mars/Viking/valles_marineris_enhanced"><img alt="Valles Marineris Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/4e59980c1c57f89c680c0e1ccabbeff1_valles_marineris_enhanced.tif_thumb.png"/></

In [16]:
# Use for loop to find all <a> tags with href links
for link in links:
    href = link['href']
    print(href)

/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced


In [17]:
# Create variable to insert beginning part of url
base_url = 'https://astrogeology.usgs.gov'
# Add beginning part of url to all href links found
links_list = [base_url + link['href'] for link in links]
links_list

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [18]:
# Create 2 empty lists to store titles and links 
img_title_list = []
img_url_list = []
# Create for loop to navigate through each store website link and obtain hemisphere image urls and names
for link in links_list:
    browser.visit(link)
    html = browser.html
    soup = bs(html, 'html.parser')
    img_title = soup.find('h2', class_='title').text.split(' Enhanced')[0]
    img_title_list.append(img_title)
    browser.links.find_by_text('Sample').click()
    img_url = browser.windows[1].url
    img_url_list.append(img_url)
    browser.windows[1].close()
    browser.back()

In [19]:
img_url_list

['https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg']

In [20]:
img_title_list

['Cerberus Hemisphere',
 'Schiaparelli Hemisphere',
 'Syrtis Major Hemisphere',
 'Valles Marineris Hemisphere']

In [21]:
# Store url and title list into dictionary
hemisphere_image_urls = [{'title': title, 'img_url': url} for title, url in zip(img_title_list, img_url_list)]
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [22]:
# Quite browser
browser.quit()