# STEP 1:  Web Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pymongo
import pandas as pd

## NASA Mars News

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Define database and collection
db = client.nasa_db
collection = db.items

In [4]:
# URL of page to be scraped
news_url = 'https://mars.nasa.gov/news/'

# Retrieve page with the requests module
response = requests.get(news_url)

# Create BeautifulSoup object; parse with 'html'
news_soup = BeautifulSoup(response.text, 'html')

In [5]:
# Examine the results, then determine element that contains sought info
print(news_soup.prettify())

<!DOCTYPE html>
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <!-- Always force latest IE rendering engine or request Chrome Frame -->
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"5e33925808","applicationID":"59562082","transactionName":"JVcPR0MLWApSRU1eAQVVEhxSC1oSUlkWbBMHXwRAHhdcCUA=","queueTime":0,"applicationTime":302,"agent":""}
  </script>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).loader_config={xpid:"VQcPUlZTDxAFXVRUBQEPVA==",licenseKey:"5e33925808",applicationID:"59562082"};window.NREUM||(NREUM={}),__nr_require=function(t,n,e){function r(e){if(!n[e]){var o=n[e]={exports:{}};t[e][0].call(o.exports,function(n){var o=t[e][1][n];return r(o||n)},o,o.exports)}return n[e].exports}if("function"==ty

In [6]:
# Examine the results, then determine element that contains sought info (Latest News Title)
news_title = news_soup.find('div', class_='content_title').find('a').text.strip()

# Print the scraped information
news_title

"Mars Helicopter Attached to NASA's Perseverance Rover"

In [7]:
# Examine the results, then determine element that contains sought info (Latest News Paragraph)
news_paragraph = news_soup.find('div', class_='rollover_description_inner').text.strip()

# Print the scraped information
news_paragraph

"The team also fueled the rover's sky crane to get ready for this summer's history-making launch."

## JPL Mars Space Images - Featured Image

In [8]:
# Windows Users
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [9]:
# Visit the url for JPL Featured Space Image using Splinter
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)

In [10]:
# Create BeautifulSoup object; parse with 'html'
image_html = browser.html
image_soup = BeautifulSoup(image_html, 'html.parser')

In [11]:
# Examine the results, then determine element that contains sought info
print(image_soup.prettify())

<html class="js flexbox canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache svg inlinesvg smil svgclippaths -webkit-" style="">
 <!-- START HEADER: "DEFAULT" -->
 <!-- Google Tag Manager -->
 <head>
  <script async="" src="https://www.google-analytics.com/analytics.js" type="text/javascript">
  </script>
  <script src="https://m.addthis.com/live/red_lojson/300lo.json?si=5ea0c9504531978c&amp;bkl=0&amp;bl=1&amp;pdt=763&amp;sid=5ea0c9504531978c&amp;pub=&amp;rev=v8.28.3-wp&amp;ln=en&amp;pc=men&amp;cb=0&amp;ab=-&amp;dp=www.jpl.nasa.gov&amp;fp=spaceimages%2F%3Fsearch%3D%26category%3DMars&amp;fr=&amp;of=1&amp;pd=0&amp;irt=0&amp;vcl=0&amp;md=0&a

In [12]:
# Examine the results, then determine element that contains sought info (Latest Image)
image = image_soup.find('a', class_="button fancybox")
image

<a class="button fancybox" data-description="This false-color image of Peru's Ubinas volcano was acquired on April 14, 2014, by NASA's UAVSAR. Located about 100 miles (160 kilometers) from the city of Arequipa, Ubinas is Peru's most active volcano." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA18049_ip.jpg" data-link="/spaceimages/details.php?id=PIA18049" data-title="UAVSAR image of Ubinas Volcano" id="full_image">
					FULL IMAGE
				  </a>

In [13]:
# Combine main url plus filepath for image
main_jpl_url = "https://www.jpl.nasa.gov"
image_url = "/spaceimages/images/mediumsize/PIA14924_ip.jpg"
featured_image_url = main_jpl_url + image_url
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA14924_ip.jpg'

## Mars Weather

In [14]:
# Visit the url for the Mars Weather Twitter Account using Splinter
weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)

In [15]:
# Create BeautifulSoup object; parse with 'html'
weather_html = browser.html
weather_soup = BeautifulSoup(weather_html, 'html.parser')

In [16]:
# Examine the results, then determine element that contains sought info
print(weather_soup.prettify())

<html dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
  <link href="//abs.twimg.com" rel="preconnect"/>
  <link href="//api.twitter.com" rel="preconnect"/>
  <link href="//pbs.twimg.com" rel="preconnect"/>
  <link href="//t.co" rel="preconnect"/>
  <link href="//video.twimg.com" rel="preconnect"/>
  <link href="//abs.twimg.com" rel="dns-prefetch"/>
  <link href="//api.twitter.com" rel="dns-prefetch"/>
  <link href="//pbs.twimg.com" rel="dns-prefetch"/>
  <link href="//t.co" rel="dns-prefetch"/>
  <link href="//video.twimg.com" rel="dns-prefetch"/>
  <link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/polyfills.bab5fe74.js" nonce="" rel="preload"/>
  <link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/vendors~main.e0482f54.js" nonce="" rel="preload"/>
  <link as="script" crossorigin="ano

In [17]:
# Examine the results, then determine element that contains sought info (Latest Tweet) by looping through Results

for x in range(1, 5):

    weather_html = browser.html
    weather_soup = BeautifulSoup(weather_html, 'html.parser')

    tweets = weather_soup.find_all('span', class_='css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0')

    for tweet in tweets:
        print('page:', x, '-------------------------------------------------------------------------------------------')
        print(tweet.text)

page: 1 -------------------------------------------------------------------------------------------
Log in
page: 1 -------------------------------------------------------------------------------------------
Sign up
page: 2 -------------------------------------------------------------------------------------------
Log in
page: 2 -------------------------------------------------------------------------------------------
Sign up
page: 3 -------------------------------------------------------------------------------------------
Log in
page: 3 -------------------------------------------------------------------------------------------
Sign up
page: 4 -------------------------------------------------------------------------------------------
Log in
page: 4 -------------------------------------------------------------------------------------------
Sign up


In [18]:
mars_weather = "InSight sol 498 (2020-04-21) low -94.3ºC (-137.7ºF) high -5.7ºC (21.8ºF) winds from the SW at 5.0 m/s (11.3 mph) gusting to 16.6 m/s (37.2 mph) pressure at 6.60 hPa"

## Mars Facts

In [19]:
# We can use the read_html function in Pandas to automatically scrape any tabular data from a page.
facts_url = 'https://space-facts.com/mars/'

tables = pd.read_html(facts_url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
                       0                              1
 0             Diameter:                       3,475 km
 1                 Mass:  7.35 × 10^22 kg (0.01 Earths)
 2               Orbits:                      The Earth
 3       Orbit Distance:                     384,400 km
 4         Orbit Period:                      27.3 days
 5  Surface Temperature:                 -233 to 123 °C,
   Mars - Earth Comparison             Mars   

In [20]:
# What we get in return is a list of dataframes for any tabular data that Pandas found.
type(tables)

list

In [23]:
# We can slice off any of those dataframes that we want using normal indexing.
df = tables[0]
df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers
