In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
import numpy as np
import os

In [2]:
# Initialize Dictionary to hold scrape results
scrape_results = {}

In [3]:
!which chromedriver

/usr/local/bin/chromedriver


In [4]:
# Establish browser to call up URLs for scraping
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [5]:
# Call up Mars news url for scraping
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

In [6]:
# Use Beautiful Soup to parse out the url
html = browser.html
soup = BeautifulSoup(html, "html.parser")


In [7]:
# Set the parent level division for parsing
news = soup.find('div', class_='content_title')
news_title = news.select_one('a').text

news_p = soup.find('div', class_='article_teaser_body').text

# Insert results into Dictionary scrape_results
scrape_results['news_title'] = news_title
scrape_results['news_text'] = news_p

In [8]:
# Call up the Jet Propulsion Labs website for latest featured Mars image
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [9]:
# Scrape for latest image and store in dictionary
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
image = soup.find('section', class_='centered_text clearfix main_feature primary_media_feature single')
link = image.find('a')['data-fancybox-href']
featured_image_url = f'https://www.jpl.nasa.gov{link}'
scrape_results['featured_image_url'] = featured_image_url

In [10]:
# Twitter URL of page to be scraped
url = 'https://twitter.com/marswxreport?lang=en'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [11]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_='js-tweet-text-container')

weather_report = []
for result in results:
    report = result.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text
    
    weather_report.append(report)

scrape_results['mars_weather'] = weather_report[0]


In [12]:
# Use pandas to read in table data from space_facts site re: Mars facts
url = 'http://space-facts.com/mars/'

tables = pd.read_html(url)


In [13]:
# convert tables into a Pandas Dataframe
df = tables[0]
df.columns = ['Characteristic', 'Measure']
df.head(10)

Unnamed: 0,Characteristic,Measure
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [14]:
# Convert Pandas dataframe into an html formatted table
marsfacts_html_table = df.to_html()
marsfacts_html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Characteristic</th>\n      <th>Measure</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd mille

In [16]:
# Add the mars facts html table to our dictionary
scrape_results['mars_facts'] = marsfacts_html_table

In [17]:
# URL of page to be scraped
url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Retrieve the parent div for all hemispheres
result = soup.find('div', class_='downloads')

cerberus_url = result.a['href']

title = soup.find('div', class_='content')

cerberus_title = title.find('h2', class_='title').text

# save as a python dictionary object
scrape_results['cerberus_title'] = cerberus_title
scrape_results['cerberus_url'] = cerberus_url

In [18]:
# URL of page to be scraped
url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Retrieve the parent div for all hemispheres
result = soup.find('div', class_='downloads')

schiaparelli_url = result.a['href']

title = soup.find('div', class_='content')

schiaparelli_title = title.find('h2', class_='title').text

scrape_results['schiaparelli_title'] = schiaparelli_title
scrape_results['schiaparelli_url'] = schiaparelli_url


In [19]:
# URL of page to be scraped
url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Retrieve the parent div for all hemispheres
result = soup.find('div', class_='downloads')

syrtis_major_url = result.a['href']

title = soup.find('div', class_='content')

syrtis_major_title = title.find('h2', class_='title').text

scrape_results['syrtis_major_title'] = syrtis_major_title
scrape_results['syrtis_major_url'] = syrtis_major_url


In [20]:
# URL of page to be scraped
url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

# Retrieve the parent div for all hemispheres
result = soup.find('div', class_='downloads')

valles_marineris_url = result.a['href']

title = soup.find('div', class_='content')

valles_marineris_title = title.find('h2', class_='title').text

scrape_results['valles_marineris_title'] = valles_marineris_title
scrape_results['valles_marineris_url'] = valles_marineris_url


In [22]:
print(scrape_results)

{'news_title': 'NASA InSight Landing on Mars: Milestones', 'news_text': "On Nov. 26, NASA's InSight spacecraft will blaze through the Martian atmosphere and set a lander gently on the surface in less time than it takes to cook a hard-boiled egg.", 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA14254_ip.jpg', 'mars_weather': 'Sol 2234 (2018-11-18), high 2C/35F, low -70C/-93F, pressure at 8.57 hPa, daylight 06:25-18:41', 'mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Characteristic</th>\n      <th>Measure</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n    