In [None]:
# 10.3.3 Scrape Mars Data: The News

# The script we're building is designed to scrape the most recent data—that means that each time we run the script, 
# we'll pull the newest data available.


In [27]:
# Import pandas, BeautifulSoup, Splinter, and the ChromeDriver manager
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
# ChromeDriverManager is called by the browser-setup cell; without this import that cell raises NameError
from webdriver_manager.chrome import ChromeDriverManager

In [29]:
# Set the executable path and initialize the Chrome browser in Splinter.
#
# The course reading uses a manually installed driver path:
#     executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
#     browser = Browser('chrome', **executable_path)
# That version is not used here: after the downloadable chromedriver was installed into the path,
# macOS reported that it didn't trust it.

# Alternate way: take the executable path from ChromeDriverManager().install() instead.
# NOTE: ChromeDriverManager must be imported (from webdriver_manager.chrome) before this cell runs.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)


[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280


 


[WDM] - Driver [/Users/josefanolin/.wdm/drivers/chromedriver/mac64/87.0.4280.88/chromedriver] found in cache


In [30]:
# Assign the URL and instruct the browser to visit it.

# Visit the Mars NASA news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional wait for the page to load: check for an <li class="slide"> inside a <ul class="item_list">,
# passing wait_time=1 (seconds). The boolean result is shown as the cell output but is not used otherwise.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)



True

In [None]:
# The line, browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1), does two things.

# One is that we're searching for elements with a specific combination of tag (ul and li) and attribute (item_list 
# and slide, respectively). For example, ul.item_list would be found in HTML as <ul class="item_list">.

# Secondly, we're also telling our browser to wait up to one second for those components to appear. 
# The optional delay is useful because sometimes dynamic pages take a little while to load, especially if they are 
# image-heavy.

In [31]:
# Set up the HTML parser.

# Parse the page currently loaded in the automated browser, then select the first
# <li class="slide"> nested inside a <ul class="item_list">. That element is the parent
# element the following cells search within.
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

# select_one() returns None when nothing matches (for example, the page had not finished
# loading or its layout changed). Fail here with a clear message instead of letting a later
# cell raise an opaque AttributeError on None.
if slide_elem is None:
    raise RuntimeError("No element matching 'ul.item_list li.slide' was found on the news page")

In [None]:
# Notice how we've assigned slide_elem as the variable that holds the <li /> tag, a descendant of the <ul /> tag
# (a descendant is any tag nested within the <ul /> element)?

# This is our parent element. 
# This means that this element holds all of the other elements within it, and we'll reference it when we want to 
# filter search results even further. 

# The . is used for selecting classes, such as item_list, so the code 'ul.item_list li.slide' pinpoints an <li /> tag
# with a class of slide that is nested inside a <ul /> tag with a class of item_list. 

# CSS selectors are read from right to left: the rightmost part (li.slide) is the element that gets returned, and 
# the parts to its left (ul.item_list) describe the ancestors it must be nested inside. 

# Because of this, when using select_one, the first matching element returned will be a <li /> element with a class of 
# slide and all nested elements within it.

# After opening the page in a new browser, right-click to inspect and activate your DevTools. 
# Then search for the HTML components you'll use to identify the title and paragraph you want.

In [None]:
# Which HTML attribute will we use to scrape the article’s title?

# We're looking for a <div /> with a class of "content_title".

# class="content_title"

In [32]:
# Begin scraping. The title and summary text will be assigned to variables that are referenced later.

# Chain .find() onto the previously assigned variable, slide_elem, so the search happens only
# inside that element rather than across the whole page.

# The data we're looking for is the content title, which is in a <div /> with a class of
# 'content_title'.

# Run this cell. The output should be the HTML containing the content title and anything else
# nested inside of that <div />.

slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8822/a-martian-roundtrip-nasas-perseverance-rover-sample-tubes/" target="_self">A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes</a></div>

In [33]:
# We need just the text; the surrounding HTML isn't necessary.

# Use the parent element to find the <div /> with a class of 'content_title' and save its text
# as `news_title`. Chaining .get_text() onto .find() returns only the text of the element
# (here, the text of the nested <a /> tag), without the tags themselves.
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title

"A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes"

In [17]:
# Earlier, we identified the parent element and created a variable to hold it. With this new code, we’re searching 
# within that element for the title. We’re also stripping the additional HTML attributes and tags with the use of 
# .get_text().

# Once executed, the result is the most recent title published on the website. 
# When the website is updated and a new article is posted, when our code is run again, it will return that article 
# instead.



In [None]:
# We have the title we want, and that's a great start. Next we need to add the summary text. 

# slide_elem.find("div", class_='content_title').get_text() is our previous code. 
# We'll need to change the class to "article_teaser_body".

# Before we can update our code, we'll need to use our DevTools to make sure we're scraping the right tag and class. 
# We know that "article_teaser_body" is the right class name, but when we search for it, there is more than one result. 
# What now?

# We want to pull the first one on the list, not a specific one, so more than 40 results is fine. 
# In this case, if our scraping code is too specific, we'd pull only that article summary instead of the most recent.

In [None]:
# There are two methods used to find tags and attributes with BeautifulSoup:

# .find() is used when we want only the first tag that matches the tag name and attributes we've specified.
# .find_all() is used when we want to retrieve all of the tags that match.

# For example, if we were to use .find_all() instead of .find() when pulling the summary, we would retrieve all of 
# the summaries on the page instead of just the first one.

In [34]:
# Use the parent element to find the article summary: the text of the first <div /> with a
# class of 'article_teaser_body' inside slide_elem, saved as `news_p`.
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

"Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. "

In [None]:
# 10.3.4 Scrape Mars Data: Featured Image

# Robin's next step in the scraping code will be to gather the featured images from NASA's Space Images webpage. 
# In your Jupyter notebook, use markdown to separate the article scraping from the image scraping.

# In the next empty cell, type ### Featured Images and change the format of the code cell to "Markdown."
# The cell below this one is where we'll begin our scraping. First, let's check out the webpage.

# The first image that pops up on the webpage is the featured image. 
# Robin wants the full-size version of this image, so we know we'll want Splinter to click the "Full Image" button. 
# From there, the page directs us to a slideshow. 

# It's a little closer to getting the full-size feature image, but we aren't quite there yet.

# Click the "More Info" button on the page. Click image again to get to full size image. 
# Begin code ready to automate all of the clicks.

### Featured Images 

In [35]:
# Visit URL
# The automated browser created earlier navigates to the featured images webpage.
# Note: `url` is rebound here; it previously held the news-site address.

url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [None]:
# Next, we want to click the "Full Image" button. This button will direct our browser to an image slideshow. 
# Let's take a look at the button's HTML tags and attributes with the DevTools.


<a class="button fancybox" data-description="Scientists produced new global maps of Jupiter using the Wide Field Camera 3 on NASA's Hubble Space Telescope. One color map is shown here, projected onto a globe and as a flat image." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA19643_ip.jpg" data-link="/spaceimages/details.php?id=PIA19643" data-title="Spinning Jupiter and Global Map" id="full_image">
					FULL IMAGE
				  </a>

In [None]:
# Near the end of the attributes in the <a /> tag is id="full_image". This is significant because in HTML, an id is a 
# completely unique identifier. 

# Often, a class is used as an identifier, but only for other HTML tags with similar styling. 

# For example, when we were scraping the articles, we saw that all of the articles had the same class. 
# None of the other components of that webpage had that class, though.

# An id, on the other hand, can only be used one time throughout the entire page.



In [36]:
# Because we want to click the full-size image button, we can use its id in our code.

# Find the element whose id is 'full_image' and click it
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [37]:
# Find the "more info" button and click it.
# First check for an element containing the text 'more info', passing wait_time=1 (seconds).
# The boolean result is discarded, so this presumably serves only as a short wait while the
# slideshow loads.
browser.is_element_present_by_text('more info', wait_time=1)
more_info_elem = browser.links.find_by_partial_text('more info')
more_info_elem.click()

In [38]:
# With the new page loaded in our automated browser, it needs to be parsed so we can continue and
# scrape the full-size image URL.

# Parse the resulting HTML with soup (`html` is rebound here; it previously held the news page)
html = browser.html
img_soup = soup(html, 'html.parser')

In [39]:
# We'll use all three of these tags (<figure />, <a />, and <img />) to build the URL to the full-size image.

# Find the relative image URL: the `src` attribute of the first <img /> nested inside an <a />
# within a <figure /> that has a class of 'lede'.
# NOTE(review): select_one() returns None when nothing matches, in which case .get() raises AttributeError.
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA18432_hires.jpg'

In [40]:
# Let's add the base URL to our code.

# The line below uses an f-string, a Python string literal that evaluates the expressions written
# inside its curly brackets and inserts their values into the string when it is executed:
#   - f'https://www.jpl.nasa.gov...' supplies the fixed base URL.
#   - {img_url_rel} is the variable that will be inserted into the f-string.
# The img_url variable holds the resulting string.

# Use the base URL to create an absolute URL
img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18432_hires.jpg'

In [41]:
# Scrape the Facts table

# pd.read_html() searches the page for HTML tables and returns them as a list of DataFrames;
# index 0 selects the first table it encounters.
# .set_axis([...], axis=1) assigns the column names 'description' and 'value' for clarity.
# .set_index('description') turns the description column into the DataFrame's index.
# The steps are chained so that each one returns a new DataFrame rather than modifying one
# in place (no inplace=True), and `df` is bound exactly once.
df = (
    pd.read_html('http://space-facts.com/mars/')[0]
    .set_axis(['description', 'value'], axis=1)
    .set_index('description')
)
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [42]:
# Convert the facts DataFrame to an HTML <table> string (displayed as the cell output)
df.to_html()


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [43]:
# Quit the automated browser now that scraping is finished
browser.quit()

In [None]:
# See "Mission_to_Mars.py" for a stripped-down version of this notebook, kept as a reference for the module challenge 