# We want to parse the site to obtain information on the newest article

# Import Dependencies and Setup

In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

# Import Pandas
import pandas as pd

In [2]:
# Set up a Chrome browser to run our web scrape
# ChromeDriverManager (imported above) fetches a chromedriver and returns the path to it,
# so we do not depend on a Windows-style 'chromedriver.exe' sitting in the working directory
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=False opens a visible browser window so we can watch the scrape as it runs
browser = Browser('chrome', **executable_path, headless=False)

# Web Scrape

### Mars Articles

In [3]:
# Define and visit the webpage url
url = 'https://redplanetscience.com'
browser.visit(url)
# Check that the page contains an element matching the CSS selector 'div.list_text',
# i.e. a <div /> tag whose class is "list_text"
# wait_time=1 gives the page up to 1 second to load that element before the check gives up
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [4]:
# Parse through the html on the webpage
html = browser.html
news_soup = soup(html, 'html.parser')
# In a CSS selector the '.' denotes a class, so 'div.list_text' matches <div /> tags with the class "list_text"
# The page can contain several such elements; 'select_one' returns only the first match
slide_elem = news_soup.select_one('div.list_text')

# Display the selected element so we can inspect its html (shown below)
slide_elem

<div class="list_text">
<div class="list_date">April 25, 2021</div>
<div class="content_title">NASA's Mars 2020 Rover Tests Descent-Stage Separation</div>
<div class="article_teaser_body">A crane lifts the rocket-powered descent stage away from NASA's Mars 2020 rover after technicians tested the pyrotechnic charges that separate the two spacecraft.</div>
</div>

In [5]:
# Find the title, which sits in the <div /> tag with the class 'content_title' inside slide_elem
slide_elem.find('div', class_='content_title')

<div class="content_title">NASA's Mars 2020 Rover Tests Descent-Stage Separation</div>

In [6]:
# Use .get_text() to keep just the text of the title <div />, without the surrounding html tag
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

"NASA's Mars 2020 Rover Tests Descent-Stage Separation"

In [7]:
# Find the article summary the same way: .find the <div /> with class 'article_teaser_body', then keep just its text
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"A crane lifts the rocket-powered descent stage away from NASA's Mars 2020 rover after technicians tested the pyrotechnic charges that separate the two spacecraft."

### Featured Images

In [8]:
# Define and visit the url of the featured-image site (this reuses the 'url' variable from the articles section)
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [9]:
# The page that opens shows a featured image, and we want the full-size version of it
# There is a button to click to enlarge/get the full image, which we want our code to press automatically
# If we inspect the html, the code for the button is: <button class="btn btn-outline-light"> FULL IMAGE</button>
# Searching the rest of the html, we see there are 2 other <button /> tags

# We will find the elements by the tag 'button' and place the one we want in a variable
# The button we want is the 2nd <button /> on the page, so we take index 1 (indexing starts at 0)
full_image_elem = browser.find_by_tag('button')[1]
# Then tell the code to click the button/variable
full_image_elem.click()

In [10]:
# Now that we have opened the enlarged image, we need to parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')
# (Evaluate img_soup here if you want to inspect the full parsed html)

# The html code for the enlarged image looks like: <img class="fancybox-image" src="image/featured/mars1.jpg" alt="">
# (the src value differs depending on which image is currently featured)

In [11]:
# We will use .find to locate the <img /> tag with the class 'fancybox-image', then get just its src value
# We read src from the tag rather than hard-coding an image path, because the featured image changes over time
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [12]:
# The src value is a relative path, so we use an f-string to append it to the site's base url
# This will allow us to generate a link to the most recent image even if it updates
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
print(img_url)

https://spaceimages-mars.com/image/featured/mars2.jpg


### Mars Facts

In [13]:
# The information we want is in a table on the webpage, which is formatted as shown below
# We can see that it's already in a table format: a <table /> tag with its class
    # followed by the body
        # followed by each row of the table
            # which contains a column for the header
            # and a column for the data/values

# <table class="table table-striped">
# 				  <tbody>
# 				    <tr>
# 				      <th scope="row">Equatorial Diameter:</th>
# 				      <td>6,792 km</td>
# 				    </tr>
# 				    <tr>
# 				      <th scope="row">Polar Diameter:</th>
# 				      <td>6,752 km</td>
# 				    </tr>
# 				  </tbody>
# 				</table>

In [14]:
# pandas' built-in read_html function reads every table it finds in the html into a list of DataFrames
# We take index 0 to keep only the first table on the page
df = pd.read_html('https://galaxyfacts-mars.com')[0]
# Name the three columns of the DataFrame
df.columns = ['description', 'Mars', 'Earth']
# Use the 'description' column as the index of the DataFrame
# set_index returns a new DataFrame, so we assign the result back to df
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
# Because we want to pass this DataFrame to our own webpage, we need to convert it back to html format
# to_html() returns the table as a string of html (shown below)
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [16]:
# Quit the automated browser session now that all scraping is done
browser.quit()