In [17]:
# Import Splinter, BeautifulSoup, the ChromeDriver manager, and pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# ChromeDriverManager().install() supplies the path to a ChromeDriver binary;
# it is wrapped in a dict so it can be unpacked as a keyword argument below
executable_path = {'executable_path': ChromeDriverManager().install()}
# Start a Splinter-driven Chrome session; headless=False keeps the window visible
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [C:\Users\elizk\.wdm\drivers\chromedriver\win32\90.0.4430.24\chromedriver.exe] found in cache


### Latest News

In [4]:
# Visit the Mars news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Allow the page time to load (wait_time=1) and check that an element matching the
# CSS selector 'div.list_text' is present; the boolean result is this cell's output
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [6]:
# Parse the HTML currently held by the browser using the 'html.parser' parser
html = browser.html
news_soup = soup(html, 'html.parser')
# Assign slide_elem the first element matching the CSS selector 'div.list_text'
# (a <div /> tag with class "list_text"); its descendants are searched in later cells
slide_elem = news_soup.select_one('div.list_text')

In [8]:
# Use the parent element, slide_elem, to find the first <div> with class
# "content_title" and save its text as `news_title`
# When .get_text() is chained onto the method .find(), only the text of the element is returned
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'Two Rovers to Roll on Mars Again: Curiosity and Mars 2020'

In [9]:
# Use the same parent element to find the first <div> with class
# "article_teaser_body" and save its text (the teaser paragraph) as `news_p`
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

'They look like twins. But under the hood, the rover currently exploring the Red Planet and the one launching there this summer have distinct science tools and roles to play.'

### Featured Images

In [10]:
# To get to the full-sized version of the featured image on the webpage
# we need to click through the page, which requires Splinter

# Visit URL (this rebinds `url`, replacing the news-site address)
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [11]:
# There are three buttons on the page, so the lookup by tag returns several matches;
# index [1] picks the second match, which this notebook treats as the full image button

# Find the full image button, identified by its tag
full_image_elem = browser.find_by_tag('button')[1]
# Use Splinter to click it
full_image_elem.click()

In [13]:
# Parse the HTML the browser holds after the click with BeautifulSoup
html = browser.html
img_soup = soup(html, 'html.parser')

In [14]:
# We want to pull whatever image comes up each time the code runs,
# not the same image every time, so the image is located by its class:
# tell BeautifulSoup to look for an <img /> tag with the class "fancybox-image"
# (seen in the dev tools) and use .get('src') to pull the link to the image

# Find the relative image url
# NOTE: .find() returns None when no tag matches, in which case .get('src') raises AttributeError
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [15]:
# The URL pulled above is only a relative path, so prepend the site's
# base URL to it to build a link that can be used to access the photo
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

### Mars Facts

In [18]:
# We want to pull a table of facts from a website and reuse the same table
# format in our own webpage. Instead of scraping each row, pandas' read_html()
# scrapes the tables on the page into a list of DataFrames; [0] keeps the first one
df = pd.read_html('https://galaxyfacts-mars.com')[0]
# Assign column names to the new DataFrame for clarity
df.columns = ['description', 'Mars', 'Earth']
# Turn the description column into the index; reassigning the result keeps the
# updated index without relying on inplace=True
df = df.set_index('description')
# NOTE(review): the displayed output shows the site's own "Mars - Earth Comparison"
# header row coming through as the first data row - confirm that is intended
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [19]:
# Use the DataFrame method .to_html() to convert the table back into an HTML string
# that is ready to be embedded in a web page
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [20]:
# End the Splinter browser session started at the top of the notebook
browser.quit()

In [None]:
# Jupyter Notebook was good for writing the code in testable chunks,
# but the notebook cannot be run automatically, so the code has
# to be converted into a .py file.
# To do so: File > Download as > Python (.py)
# If you get a warning, click "Keep" to continue downloading.