In [33]:
import io
import shutil
import time

import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager


In [19]:
# Locate a chromedriver binary without tying the notebook to one machine:
# use 'chromedriver.exe' when it can be found in the working directory / on PATH
# (Ed's PC), otherwise let webdriver_manager download a matching driver.
chromedriver_path = shutil.which('chromedriver.exe') or ChromeDriverManager().install()
executable_path = {'executable_path': chromedriver_path}

# headless=False keeps the Chrome window visible while scraping
browser = Browser('chrome', **executable_path, headless=False)

## --------------------------------------------------------
## This is Part 1 - Scrape the Sites for info
## --------------------------------------------------------

### Part 1-1 - Scrape the NASA Mars News Site

In [20]:
# Open the NASA Mars News site to collect the latest news title and paragraph text.
# URL = https://mars.nasa.gov/news/
browser.visit('https://mars.nasa.gov/news/')

# The teaser text is later read from 'div.article_teaser_body', so poll for that
# element (up to 10 s) rather than sleeping a fixed 1 s, which can snapshot the
# page before its content has loaded.
browser.is_element_present_by_css('div.article_teaser_body', wait_time=10)

In [21]:
# Snapshot the markup of the page currently loaded in the browser
html = browser.html

# Build a Beautiful Soup tree from that snapshot with Python's built-in parser
soup = BeautifulSoup(markup=html, features='html.parser')
# print(soup.prettify())  # uncomment to inspect what was pulled

In [22]:
# Collect every <div class="content_title"> in the parsed page
news_title_list = soup.find_all('div', attrs={'class': 'content_title'})

# The headline is taken from the second match (index 1)
news_title = news_title_list[1].get_text()
print(news_title)

NASA's Curiosity Mars Rover Takes Selfie With 'Mont Mercou'


In [23]:
# Collect every <div class="article_teaser_body"> in the parsed page
news_paragraph_list = soup.find_all('div', attrs={'class': 'article_teaser_body'})

# The short paragraph is taken from the first match (index 0)
news_p = news_paragraph_list[0].get_text()
print(news_p)

The rover also snapped a pair of panoramas to create a 3D view of the stark cliff face featured in the selfie.


### Part 1-2 - Scrape the JPL Mars Space Images - for Featured Image

In [24]:
# Open the JPL Space Images site (filtered to Mars) to collect the featured image.
# URL = https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')

# The first result is later read from 'div.SearchResultCard', so poll for that
# element (up to 10 s) rather than sleeping a fixed 1 s, which can snapshot the
# page before its content has loaded.
browser.is_element_present_by_css('div.SearchResultCard', wait_time=10)

In [25]:
# Snapshot the markup of the page currently loaded in the browser
html = browser.html

# Build a Beautiful Soup tree from that snapshot with Python's built-in parser
soup = BeautifulSoup(markup=html, features='html.parser')
# print(soup.prettify())  # uncomment to inspect what was pulled

In [26]:
# Grab the first <div class="SearchResultCard"> (find returns a single tag, not a list)
jpl_small_images_list = soup.find('div', attrs={'class': 'SearchResultCard'})

# The 'href' of its first anchor is the link to 'click' to reach the next page
jpl_small_image = jpl_small_images_list.find('a')['href']
print(jpl_small_image)

/images/high-energy-and-junos-stellar-reference-unit


In [27]:
# 'Click' the small image's link to get to the page with the large image.
# click_link_by_href is deprecated in splinter in favour of the browser.links
# finder; fall back to it only on older splinter versions without browser.links.
if hasattr(browser, 'links'):
    browser.links.find_by_href(jpl_small_image).click()
else:
    browser.click_link_by_href(jpl_small_image)

# The large image is later read from 'div.BaseImagePlaceholder', so poll for that
# element (up to 10 s) rather than sleeping a fixed 1 s.
browser.is_element_present_by_css('div.BaseImagePlaceholder', wait_time=10)

In [28]:
# Snapshot the markup of the secondary page now loaded in the browser
html = browser.html

# Build a fresh Beautiful Soup tree from it with Python's built-in parser
soup = BeautifulSoup(markup=html, features='html.parser')
# print(soup.prettify())  # uncomment to inspect what was pulled

In [29]:
# Collect every <div class="BaseImagePlaceholder"> in the parsed page
jpl_large_images_list = soup.find_all('div', attrs={'class': 'BaseImagePlaceholder'})
# print(jpl_large_images_list)

In [30]:
# Read the image URL from the "data-src" attribute of the <img> in the first (index 0) div
featured_image_url = jpl_large_images_list[0].find('img')['data-src']
print(featured_image_url)

https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24436.width-1024.jpg


### Part 1-3 - Scrape the Mars Facts page for the Mars Facts table

In [31]:
# Address of the Space-Facts.com Mars page; pandas reads its data tables
# via pd.read_html(url)
url = "https://space-facts.com/mars/"

In [34]:
# pd.read_html fetches the page at `url` and returns a list with one DataFrame
# per table found on it
tables = pd.read_html(url)
# tables

In [35]:
# The Mars table is the first table on the page (index 0).
# Note: this binds mars_table to the same DataFrame object held in `tables`, not a copy.
mars_table = tables[0]

In [36]:
# Per the requested format, name the columns and use "Description" as the index.
# A new frame is built instead of using inplace=True: mars_table is the same object
# as tables[0], so in-place edits would also alter the scraped table and make this
# cell raise KeyError("Description") when re-run after `mars_table = tables[0]`.
mars_table = (
    mars_table
    .rename(columns={0: "Description", 1: "Mars"})
    .set_index("Description")
)

In [41]:
# Render the facts table as an HTML string with the 'table' CSS class added.
# DataFrame.to_html returns the markup directly when no buffer is passed, so an
# intermediate StringIO buffer is not needed.
mars_table_html_string = mars_table.to_html(classes='table')
# print(mars_table_html_string)

### Part 1-4 - Scrape the Astrogeology site for - High resolution Mars hemisphere pictures

In [43]:
# Visit the USGS Astrogeology search results for the enhanced Mars hemisphere images.
# Full URL: https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars
# It is kept as two pieces because only the leading (site) portion is needed later.
base_url = 'https://astrogeology.usgs.gov/'
search_adder = 'search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

# Join the pieces and load the search page
search_url = base_url + search_adder
browser.visit(search_url)

# Give the browser a moment to open the page
time.sleep(1)

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=60254): Max retries exceeded with url: /session/b1f6abce3e67d5f65a24f808aabaec5e/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000017CC53D86A0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it',))

In [None]:
# Snapshot the markup of the page currently loaded in the browser
html = browser.html

# Build a Beautiful Soup tree from that snapshot with Python's built-in parser
soup = BeautifulSoup(markup=html, features='html.parser')
# print(soup.prettify())  # uncomment to inspect what was pulled

### Part 1-5 - Clean up browser and any other housekeeping

In [40]:
# Shut down the browser session; every cell that uses `browser` must run before
# this one, since the session cannot be used after quit().
browser.quit()

### Part 1-6 - Create dictionary and store the scraped data into the dictionary to pass back

In [42]:
# Gather the scraped values into one dictionary to pass back to the calling
# program when this is an app. (The earlier `mars_dictionary = []` placeholder
# was dropped: it was overwritten immediately and was a list, not a dict.)
mars_dictionary = {
    'news_title': news_title,
    'news_p': news_p,
    'featured_image_url': featured_image_url,
    'mars_table_html_string': mars_table_html_string,
}
print(mars_dictionary)

{'news_title': "NASA's Curiosity Mars Rover Takes Selfie With 'Mont Mercou'", 'news_p': 'The rover also snapped a pair of panoramas to create a 3D view of the stark cliff face featured in the selfie.', 'featured_image_url': 'https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24436.width-1024.jpg', 'mars_table_html_string': '<table border="1" class="dataframe table">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n     