# STEP 1:  Web Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup
from splinter import Browser
import requests
import pymongo
import pandas as pd
import time

In [2]:
# Start the Chrome session that the scraping cells below drive through Splinter.
# The driver is referenced by the bare file name 'chromedriver.exe' (Windows build).
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)
time.sleep(1)

## NASA Mars News

In [None]:
# Visit the url for Mars News using Splinter
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)

# Wait (up to 10 s) for the news list container instead of sleeping a fixed
# second: the following cells search the page HTML for 'div.list_text', and a
# fixed pause gives no guarantee that it has been rendered yet.
if not browser.is_element_present_by_css('div.list_text', wait_time=10):
    print("Warning: 'div.list_text' did not appear within 10 seconds")

In [None]:
# Snapshot the page HTML from the browser and parse it with 'html.parser'
news_html = browser.html
news_soup = BeautifulSoup(markup=news_html, features='html.parser')

In [None]:
# Print formatted version of the soup
#print(news_soup.prettify())

In [None]:
# Locate the first 'list_text' container; the latest news title (and, in the
# next cell, its teaser paragraph) are looked up inside it.
list_text = news_soup.find('div', class_='list_text')
if list_text is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the chained lookup below.
    raise RuntimeError(
        "No <div class='list_text'> found in news_soup; the news page may "
        "not have finished rendering when browser.html was captured."
    )
print(list_text)

# The title is the text of the link inside the 'content_title' div.
content_title = list_text.find('div', class_='content_title').find('a').text
print(content_title)

In [None]:
# Teaser paragraph of the latest news entry, read from the 'list_text' container
news_p = list_text.find('div', attrs={'class': 'article_teaser_body'}).get_text()
print(news_p)

## JPL Mars Space Images - Featured Image

In [8]:
# Open the JPL space-images page (Mars category) in the Splinter browser.
# NOTE: image_url holds the page URL here; a later cell reassigns it to the
# featured image's src path.
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)
time.sleep(1)
# Only look the 'FULL IMAGE' link up here; the click happens in a later cell.
# (An earlier draft used browser.click_link_by_partial_text('FULL IMAGE').)
button = browser.links.find_by_partial_text('FULL IMAGE')
time.sleep(1)

In [9]:
# Snapshot the JPL page HTML and parse it with 'html.parser'
image_html = browser.html
image_soup = BeautifulSoup(markup=image_html, features='html.parser')

In [10]:
# Print formatted version of the soup
#print(image_soup.prettify())

In [11]:
# Inspect the 'FULL IMAGE' anchor (class "button fancybox") in the parsed page
footer = image_soup.find('a', attrs={'class': "button fancybox"})
print(footer)

<a class="button fancybox" data-description="This artist's concept shows the planet catalogued as 2003UB313 at the lonely outer fringes of our solar system. Our Sun can be seen in the distance. The new planet is at least as big as Pluto and about three times farther away from the Sun than Pluto." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA08003_ip.jpg" data-link="/spaceimages/details.php?id=PIA08003" data-title="Newest Member of Our Solar System &amp;#40;Artist's Concept&amp;#41;" id="full_image">
					FULL IMAGE
				  </a>


In [None]:
#dir(footer)

In [12]:
# Click the 'FULL IMAGE' link found earlier.
button.click()

# The next cells read browser.html and look for <img class="fancybox-image">;
# wait (up to 10 s) for that element so the snapshot is not taken too early.
if not browser.is_element_present_by_css('img.fancybox-image', wait_time=10):
    print("Warning: 'img.fancybox-image' did not appear within 10 seconds")

In [13]:
# Re-read the page HTML after the click and parse it with 'html.parser'
full_image_html = browser.html
full_image_soup = BeautifulSoup(markup=full_image_html, features='html.parser')

In [14]:
# First <img> tag carrying the 'fancybox-image' class
full_image = full_image_soup.find('img', attrs={'class': 'fancybox-image'})
print(full_image)

<img class="fancybox-image" src="/spaceimages/images/mediumsize/PIA08003_ip.jpg" style="display: inline;"/>


In [22]:
# Take the image path from the tag's src attribute (replaces the page URL held so far)
image_url = full_image['src']

In [23]:
# Display the value taken from the image tag's src attribute
image_url

'/spaceimages/images/mediumsize/PIA08003_ip.jpg'

In [26]:
# Build the absolute URL of the featured image from the site root and the
# scraped src value. urljoin is used instead of plain concatenation so the
# result is still valid if src lacks a leading slash or is already absolute.
from urllib.parse import urljoin

main_jpl_url = "https://www.jpl.nasa.gov"
final_url = urljoin(main_jpl_url, image_url)
print(final_url)

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA08003_ip.jpg


## Mars Weather

In [None]:
# Visit the url for the Mars Weather Twitter Account using Splinter
weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)
# Pause after navigating, as the other visit cells in this notebook do, so the
# next cell does not capture browser.html immediately after the visit call.
time.sleep(1)

In [None]:
# Snapshot the Twitter page HTML and parse it with 'html.parser'
weather_html = browser.html
weather_soup = BeautifulSoup(markup=weather_html, features='html.parser')

In [None]:
# Print formatted version of the soup
#print(weather_soup.prettify())

In [None]:
# Print the text of every span on the page that carries the class string below.
# NOTE(review): nothing navigates or scrolls between passes, so each of the four
# passes re-reads the page as it currently is; 'page:' is just the pass number.

tweet_span_classes = 'css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0'

for x in range(1, 5):

    # Fresh snapshot and parse on every pass
    weather_html = browser.html
    weather_soup = BeautifulSoup(markup=weather_html, features='html.parser')

    tweets = weather_soup.find_all('span', attrs={'class': tweet_span_classes})

    for tweet in tweets:
        print('page:', x, '-------------------------------------------------------------------------------------------')
        print(tweet.text)

In [None]:
# Weather report text kept as a fixed literal (not derived from the scraped
# tweets at run time). NOTE(review): presumably copied from the tweet output
# above - confirm whether it should be extracted programmatically instead.
mars_weather = (
    "InSight sol 498 (2020-04-21) "
    "low -94.3ºC (-137.7ºF) "
    "high -5.7ºC (21.8ºF) "
    "winds from the SW at 5.0 m/s (11.3 mph) "
    "gusting to 16.6 m/s (37.2 mph) "
    "pressure at 6.60 hPa"
)

## Mars Facts

In [None]:
# pandas.read_html fetches the page and returns the tables it can parse from it.
facts_url = 'https://space-facts.com/mars/'

tables = pd.read_html(io=facts_url)
tables

In [None]:
# read_html returns a list of DataFrames, one per table that pandas found;
# display the container type to confirm.
type(tables)

In [None]:
# Pick the first DataFrame in the list by plain indexing and display it.
df = tables[0]
df

In [None]:
# Rename the two columns to `['Description', 'Value']`, then display the result.
# Assigning to df.columns requires exactly as many names as df has columns.
df.columns = ['Description','Value']
df

In [None]:
# Use the `Description` column as the row index in place of the existing index.
# inplace=True modifies df itself (no new DataFrame is returned), so running
# this cell a second time fails because the column no longer exists.
df.set_index('Description', inplace=True)
df

In [None]:
# Write the table as HTML markup to the file 'table.html'.

df.to_html(buf='table.html')

## Mars Hemispheres

In [None]:
# Visit the url for the USGS Astrogeology site using Splinter
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

# The following cells look for <a class="itemLink product-item"> in the page
# HTML; wait (up to 10 s) for such a link so the snapshot is not taken too early.
if not browser.is_element_present_by_css('a.itemLink.product-item', wait_time=10):
    print("Warning: 'a.itemLink.product-item' did not appear within 10 seconds")

In [None]:
# Snapshot the USGS results page HTML and parse it with 'html.parser'
hemispheres_html = browser.html
hemispheres_soup = BeautifulSoup(markup=hemispheres_html, features='html.parser')

In [None]:
# Print formatted version of the soup
#print(hemispheres_soup.prettify())

In [None]:
# Print the text of every hemisphere link (class "itemLink product-item") on the page.
# NOTE(review): nothing navigates between passes, so each of the four passes
# re-reads the page as it currently is; 'page:' is just the pass number.

hemisphere_link_classes = 'itemLink product-item'

for y in range(1, 5):

    # Fresh snapshot and parse on every pass
    hemispheres_html = browser.html
    hemispheres_soup = BeautifulSoup(markup=hemispheres_html, features='html.parser')

    hemispheres = hemispheres_soup.find_all('a', attrs={'class': hemisphere_link_classes})

    for hemisphere in hemispheres:
        print('page:', y, '-------------------------------------------------------------------------------------------')
        print(hemisphere.text)