In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
# import Pandas
import pandas as pd

In [3]:
# Import Splinter and BeautifulSoup
from urllib.parse import urljoin

from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Ask webdriver_manager for the Chrome driver; whatever install() returns is
# handed to Splinter as the `executable_path` keyword argument.
driver_path = ChromeDriverManager().install()
executable_path = {'executable_path': driver_path}
# headless=False: the Chrome session is not started in headless mode.
browser = Browser('chrome', headless=False, **executable_path)




#### Assign the URL and instruct the browser to visit it. Then search for elements with the specific combination of the tag `div` and the class `list_text`, giving the page a wait time of 1 second to load its components.

In [5]:
# Visit the Mars NASA news site.
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page: check for a `div` with class `list_text`
# using wait_time=1. The boolean result is the cell's last expression, so it is
# displayed as the cell output.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

###### Assigning slide_elem as the variable to look for the div and its descendants (the other tags within the div element). The dot (.) is used for selecting classes such as list_text, so div.list_text pinpoints the div tag with the class of list_text. CSS selectors are read from right to left, so the last item in the selector is matched first. Because of this, when using select_one, the first matching element returned will be the div element with a class of list_text, together with all of its nested elements.

In [6]:
# Set up the HTML parser: take the HTML currently loaded in the browser and
# parse it with BeautifulSoup's built-in 'html.parser'.
html = browser.html
news_soup = soup(html, 'html.parser')
# select_one returns the first element matching the CSS selector, or None when
# nothing matches.
slide_elem = news_soup.select_one('div.list_text')
# Fail here with a clear message rather than with an AttributeError on a later
# `slide_elem.find(...)` call if the page did not contain the expected markup.
if slide_elem is None:
    raise ValueError("No 'div.list_text' element found in the page HTML")

###### We chain .find onto the variable slide_elem. This way we are saying: this variable holds a lot of information, so look inside that information to find this specific data (the content title, which we specified as a div with a class of content_title).

In [7]:
# Within the first news item, look up the <div> with class `content_title`.
# The matching tag (markup included) is displayed as the cell output.
slide_elem.find('div', class_='content_title')

<div class="content_title">While Stargazing on Mars, NASA's Curiosity Rover Spots Earth and Venus</div>

##### The `<a>` element's most important attribute is the href attribute, which indicates the link's destination. The .get_text() method, added to the .find() method, returns only the text of the element: for example, only the title and not any of the HTML tags or elements.

In [8]:
# Locate the `content_title` div inside the parent element, then keep only its
# text (no HTML tags) as `news_title`.
title_elem = slide_elem.find('div', class_='content_title')
news_title = title_elem.get_text()
news_title

"While Stargazing on Mars, NASA's Curiosity Rover Spots Earth and Venus"

##### To get the summary text of the article, we need to change the tag and class to `div` and `class=article_teaser_body`.
##### There are many articles with a tag of `div` and a class of `article_teaser_body`. We want to pull the most recent one (normally the first one on the list), so we use the `find()` method, which returns only the first match.

In [9]:
# From the same parent element, locate the `article_teaser_body` div and keep
# only its text as the article summary.
teaser_elem = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_elem.get_text()
news_p

"This new portrait of the Red Planet's neighbors was taken during a time when there's more dust in the air on Mars."

### Featured Images

In [10]:
# Set the URL of the space-images site and load it in the browser.
# Note: this rebinds `url`, which until now held the news-site address.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [11]:
# Find and click the full image button.
# find_by_tag('button') looks up the page's <button> elements; index 1 selects the
# second match, which is presumably the "full image" button -- verify against the page markup.
# .click() then has Splinter click that element (expected to open the full-size image view).

full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [12]:
# After the click, the browser holds new page content, so it has to be read and
# parsed again before the full-size image URL can be scraped.
# Parse the resulting HTML with soup.
html = browser.html
img_soup = soup(html, 'html.parser')

In [13]:
# Find the <img> tag with class `fancybox-image` and read its `src` attribute,
# which holds the relative image URL.
img_tag = img_soup.find('img', class_='fancybox-image')
img_url_rel = img_tag.get('src')
img_url_rel

'image/featured/mars1.jpg'

In [14]:
# Use the base URL to create an absolute URL. urljoin resolves the scraped path
# against the base, so a `src` with a leading slash or one that is already
# absolute still yields a valid URL (plain string concatenation would not).
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars1.jpg'

In [15]:
# Scrape the page's tables with pandas' read_html() and keep the first one.
tables = pd.read_html('https://galaxyfacts-mars.com')
df = tables[0]
# Name the three columns, then use the description column as the index.
df.columns = ['description', 'Mars', 'Earth']
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [16]:
# Render the DataFrame as an HTML table string; the string is displayed as the
# cell output.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [17]:
# End the automated browsing session by quitting the browser started earlier.
browser.quit()