# Web Scraping Homework 

## Step 1 - Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests

### NASA Mars News
Scrape the https://mars.nasa.gov/news/ website and collect the latest News Titles and Paragraph Texts.

In [2]:
# Read HTML from file
url = 'https://mars.nasa.gov/news/'

In [3]:
# Retrieve page with the requests module
response = requests.get(url)

In [4]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = bs(response.text, 'html.parser')

In [5]:
# print(soup.prettify())

In [6]:
# Print all article descriptions
descriptions = []
article_descriptions = soup.find_all('div', class_="rollover_description_inner")
for description in article_descriptions:
    descriptions.append(description.text)
#     print(description.text)
descriptions

["\nNASA's Opportunity Mars rover mission is complete after 15 years on Mars. Opportunity's record-breaking exploration laid the groundwork for future missions to the Red Planet.\n",
 '\nIn deploying its first instrument onto the surface of Mars, the lander completes a major mission milestone.\n',
 '\nAfter a five-year search, NASA has chosen Jezero Crater as the landing site for its upcoming Mars 2020 rover mission.\n',
 "\nIt's the beginning of the end for the planet-encircling dust storm on Mars. But it could still be weeks, or even months, before skies are clear enough for NASA's Opportunity rover to recharge its batteries and phone home. \n",
 '\nNASA’s Curiosity rover has found evidence on Mars with implications for NASA’s search for life.\n',
 '\nNASA is investing in technology concepts, including several from JPL, that may one day be used for future space exploration missions.\n']

In [7]:
# print all article titles
titles = []
article_titles = soup.find_all('div', class_='content_title')
for title in article_titles:
    titles.append(title.text)
#     print(title.text)
titles

["\n\nNASA's Opportunity Rover Mission on Mars Comes to End\n\n",
 "\n\nNASA's InSight Places First Instrument on Mars\n\n",
 '\n\nNASA Announces Landing Site for Mars 2020 Rover\n\n',
 '\n\nOpportunity Hunkers Down During Dust Storm\n\n',
 '\n\nNASA Finds Ancient Organic Material, Mysterious Methane on Mars\n\n',
 '\n\nNASA Invests in Visionary Technology \n\n']

### JPL Mars Space Images - Featured Image

In [2]:
# Import splinter dependency
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [39]:
# Create the splinter browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [10]:
# Give the browser a site to visit
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [11]:
# Get the html from the site and parse it (get back a list)
html = browser.html
soup = bs(html, 'html.parser')

In [12]:
# View the html in prettified form
# print(soup.prettify())

In [13]:
button_fancybox = soup.find('a', class_='button fancybox')

In [14]:
browser.click_link_by_partial_text('FULL IMAGE')

In [15]:
browser.click_link_by_partial_text('more info')

In [16]:
featured_image_html = browser.html

### Mars Weather

In [17]:
# Give the browser a site to visit
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)

In [18]:
# Get the html from the site and parse it
html = browser.html
soup = bs(html, 'html.parser')

In [19]:
tweets = soup.find_all('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")[1].text
tweets
# temp=soup3.find_all('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')[1].text

'InSight sol 80 (2019-02-16), high -16/3F, low -95/-139F, pressure at 7.23hPa, winds from the WNW at 10.7 mph gusting to 32.3 mph'

### Mars Facts

In [20]:
# Get pandas dependecies
import pandas as pd

In [21]:
# Set the url to get space facts
url = "http://space-facts.com/mars/"

In [22]:
# scrape the table data from the site
tables = pd.read_html(url)
len(tables)

1

In [23]:
# Creat a dataframe out of this?
df = tables[0]
df = df.set_index([0, 1])
df.index.names = ['Category', 'Value']
df

Category,Value
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [24]:
df.to_html('mars_info.html')

### Mars Hemispheres

In [64]:
# Add selenium dependency
import selenium

In [40]:
# Give the browser a site to visit
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [41]:
# Get the html from the site and parse it
html = browser.html
soup = bs(html, 'html.parser')

In [42]:
hemi_pics = []

for n in range(4):
    item = soup.find_all('div', class_="item")[n]
    hemi_pics.append(item)
    
hemi_pics

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiapa

In [43]:
item = soup.find_all('div', class_="item")
description = item.find('div', class_='description')
anchor = description.find('a', class_='itemLink product-item')
hemi_name = anchor.find('h3').text
browser.click_link_by_partial_text(hemi_name)

# hemi_pics = []

# for n in range(4):
#     item = soup.find_all('div', class_="item")[n]
#     description = item.find_all('a')
    
#     hemi_pics.append(item)
    
# hemi_pics

In [65]:
# Get the html from the site and parse it
html = browser.html
soup = bs(html, 'html.parser')

downloads = soup.find_all('div', class_="downloads")
link_text = downloads[0].find('a').text
browser.click_link_by_text(link_text)
browser.send_keys()
browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)

# image_url = browser.url
# image_url

AttributeError: 'WebDriver' object has no attribute 'send_keys'

In [35]:
item_page_html = browser.html
item_page = bs(item_page_html, 'html.parser')
download = item_page.find('div', class_='downloads')
download.find('a')['href'
download
# item_page

<div class="downloads">
<img class="thumb" src="/cache/images/04085d99ec3713883a9a57f42be9c725_valles_marineris_enhanced.tif_thumb.png"/>
<h3>Download</h3>
<ul>
<li><a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg" target="_blank">Sample</a> (jpg) 1024px wide</li>
<li><a href="http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif" target="_blank">Original</a> (tif<span class="tooltip word-tif" title=""></span>) 27 MB</li>
</ul>
</div>