# STEP 1:  Web Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pymongo
import pandas as pd
import time

In [2]:
# Windows Users
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
time.sleep(1)

## NASA Mars News

In [3]:
# Visit the url for Mars News using Splinter
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)
time.sleep(1)

In [4]:
# Create BeautifulSoup object; parse with 'html'
news_html = browser.html
news_soup = BeautifulSoup(news_html, 'html.parser')

In [5]:
# Examine the results, then determine element that contains sought info (Latest News Title)
list_text = news_soup.find('div', class_='list_text')
news_title = list_text.find('div', class_='content_title').find('a').text

# Print the scraped information
print(news_title)

How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus 


In [6]:
# Examine the results, then determine element that contains sought info (Latest News Paragraph)
news_p = list_text.find('div', class_='article_teaser_body').text

# Print the scraped information
print(news_p)

Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.


## JPL Mars Space Images - Featured Image

In [7]:
# Visit the url for JPL Featured Space Image using Splinter
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)
time.sleep(1)

# Find the Full Image Button on the Website
button = browser.links.find_by_partial_text('FULL IMAGE')
time.sleep(1)

In [8]:
# Create BeautifulSoup object; parse with 'html' (1st Website Page)
image_html = browser.html
image_soup = BeautifulSoup(image_html, 'html.parser')

In [9]:
# Examine the results, then determine element that contains sought info (Latest Image)
footer = image_soup.find('a', class_="button fancybox")

# Print the scraped information
print(footer)

<a class="button fancybox" data-description="This image was taken by the Optical, Spectroscopic, and Infrared Remote Imaging System, Rosetta's main onboard scientific imaging system, on Sept. 10, 2014. Jets of cometary activity can be seen along almost the entire body of the comet." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA18886_ip.jpg" data-link="/spaceimages/details.php?id=PIA18886" data-title="Rosetta Comet Spreads its Jets" id="full_image">
					FULL IMAGE
				  </a>


In [10]:
# Click the Full Image Button on the Website
button.click()

In [11]:
# Create BeautifulSoup object; parse with 'html' (2nd Website Page after Clicking Button)
full_image_html = browser.html
full_image_soup = BeautifulSoup(full_image_html, 'html.parser')

In [12]:
# Examine the results, then determine element that contains sought info (Latest Full Image after Clicking Button)
full_image = full_image_soup.find('img', class_='fancybox-image')

# Print the scraped information
print(full_image)

None


In [13]:
# Extract the inner attribute
image_url = full_image.attrs['src']

# Print the scraped information
image_url

AttributeError: 'NoneType' object has no attribute 'attrs'

In [None]:
# Combine main url page with the full image url for the final url
main_jpl_url = "https://www.jpl.nasa.gov"
featured_image_url = f'{main_jpl_url}{image_url}'

# Print the scraped information
print(featured_image_url)

## Mars Weather

In [None]:
# Visit the url for the Mars Weather Twitter Account using Splinter
weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)
time.sleep(1)

#tab = browser.links.find_by_partial_text('Tweets')
#time.sleep(1)

In [None]:
# Create BeautifulSoup object; parse with 'html'
weather_html = browser.html
weather_soup = BeautifulSoup(weather_html, 'html.parser')

In [None]:
# Examine the results, then determine element that contains sought info (Latest Tweet)
weather_timeline = weather_soup.find('div', attrs={"aria-label": "Timeline: Mars Weather’s Tweets"})

# Print the scraped information
print(weather_timeline)

In [None]:
# Examine the results, then determine element that contains sought info (Latest Tweet)
latest_tweet = weather_timeline.find('div',
                                     attrs={'lang': 'en',
                                           'dir': 'auto',
                                           'class': 'css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'})
# Print the scraped information
print(latest_tweet)               

In [None]:
# Examine the results, then determine element that contains sought info (Latest Tweet)
mars_weather = latest_tweet.find('span', class_='css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0').text

# Print the scraped information
print(mars_weather)

## Mars Facts

In [None]:
# We can use the read_html function in Pandas to automatically scrape any tabular data from a page.
facts_url = 'https://space-facts.com/mars/'

tables = pd.read_html(facts_url)
tables

In [None]:
# What we get in return is a list of dataframes for any tabular data that Pandas found.
type(tables)

In [None]:
# We can slice off any of those dataframes that we want using normal indexing.
df = tables[0]
df

In [None]:
# Assign the columns `['Description', 'Value']`
df.columns = ['Description','Value']
df

In [None]:
# Set the index to the `Description` column without row indexing
df.set_index('Description', inplace=True)
df

In [None]:
# Save the table directly to a file.

df.to_html('mars_table.html')

## Mars Hemispheres

In [None]:
# Visit the url for the USGS Astrogeology site using Splinter
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

In [None]:
# Create BeautifulSoup object; parse with 'html'
hemispheres_html = browser.html
hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')

In [None]:
# Print formatted version of the soup
#print(hemispheres_soup.prettify())

In [None]:
# Examine the results, then determine element that contains sought info (Latest Tweet) by looping through Results

for y in range(1, 5):

    hemispheres_html = browser.html
    hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')

    hemispheres = hemispheres_soup.find_all('a', class_='itemLink product-item')

    for hemisphere in hemispheres:
        print('page:', y, '-------------------------------------------------------------------------------------------')
        print(hemisphere.text)