In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import time
from splinter import Browser
import re

In [3]:
def init_browser(executable_path="/usr/local/bin/chromedriver", headless=True):
    """Start a splinter-driven Chrome browser.

    Parameters
    ----------
    executable_path : str, optional
        Location of the chromedriver binary. The default is the path this
        notebook was written against; pass a different one when
        chromedriver is installed elsewhere instead of editing the code.
    headless : bool, optional
        Forwarded to ``Browser`` unchanged. Defaults to ``True``.

    Returns
    -------
    The object returned by ``Browser("chrome", ...)``. The caller is
    responsible for calling ``quit()`` on it when finished.
    """
    return Browser("chrome", executable_path=executable_path, headless=headless)


def scrape_info(url):
    """Load ``url`` in a browser and return the page parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to open.

    Returns
    -------
    The result of ``bs(browser.html, "lxml")`` for the loaded page.

    Notes
    -----
    The browser is always closed before this function exits, including when
    loading or parsing raises; the exception itself is propagated unchanged.
    """
    browser = init_browser()
    try:
        browser.visit(url)

        # Fixed one-second pause before reading the page source
        # (presumably to let the page finish rendering).
        time.sleep(1)

        # Scrape page into Soup
        return bs(browser.html, "lxml")
    finally:
        # Runs on success and on failure, so a failed scrape does not leave
        # a browser process behind.
        browser.quit()
    
def scrape_nasa_news():
    """Return the first headline and teaser found on the NASA Mars news page.

    Returns
    -------
    tuple
        ``(news_title, news_p)``: the whitespace-stripped text of the first
        ``div.content_title`` and of the first ``div.article_teaser_body``.
    """
    page = scrape_info("https://mars.nasa.gov/news/")

    # Headline of the first matching element.
    headline = page.find('div', class_='content_title').text.strip()

    # Teaser paragraph of the first matching element.
    teaser = page.find('div', class_='article_teaser_body').text.strip()

    return headline, teaser

def scrape_nasa_spaceimages():
    """Return the URL of the featured Mars image on the JPL space-images page.

    Opens the page in a browser, clicks the "FULL IMAGE" link so the larger
    version of the featured image is shown, then reads the ``src`` of the
    ``img.fancybox-image`` element.

    Returns
    -------
    str
        Image URL rooted at ``https://www.jpl.nasa.gov/spaceimages/``.

    Notes
    -----
    The browser is closed before this function exits even when the click or
    the image lookup fails; the exception is propagated unchanged.
    """
    browser = init_browser()
    try:
        browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        time.sleep(1)
        # Click the link that produces a larger version of the featured image
        browser.click_link_by_partial_text('FULL IMAGE')
        time.sleep(1)
        soup = bs(browser.html, "lxml")
        image_src = soup.find('img', class_="fancybox-image")['src']
        # Keep only what follows the second "/" of the src (e.g.
        # "/spaceimages/images/x.jpg" -> "images/x.jpg") and append it to
        # the spaceimages base URL to get a full image path.
        return "https://www.jpl.nasa.gov/spaceimages/" + image_src.split("/", 2)[-1]
    finally:
        # Runs on success and on failure so no browser process is leaked.
        browser.quit()

def scrape_twitter():
    """Return the first "InSight sol" text found on the Mars weather Twitter page.

    Returns
    -------
    str
        The first text node matching ``InSight sol``, with every newline
        replaced by a space.
    """
    page = scrape_info("https://twitter.com/marswxreport?lang=en")

    # Regular expression handed to soup's search; the weather tweets are
    # expected to contain this phrase.
    weather_marker = re.compile(r'InSight sol')

    # All text nodes in the HTML that contain the marker, in page order.
    matching_text = page.find_all(text=weather_marker)
    first_match = matching_text[0]
    return first_match.replace('\n', ' ')

def scrape_table():
    """Download the Mars facts page and return its second HTML table.

    Returns
    -------
    The element at index 1 of what ``pd.read_html`` parses from the page
    body.
    """
    response = requests.get("https://space-facts.com/mars/")
    tables = pd.read_html(response.content)
    # The second table on the page holds the Mars facts.
    return tables[1]

def scrape_hemisphere():
    """Collect a title and image link for each Mars hemisphere result.

    Fetches an archived (web.archive.org) copy of the USGS astrogeology
    search results, follows the first link of every ``div.item`` entry, and
    reads each detail page's ``h2.title`` text and the first link inside its
    ``div.downloads`` block.

    Side effects: prints the list of detail-page URLs once, then prints each
    URL just before it is fetched.

    Returns
    -------
    list of dict
        One ``{"title": ..., "img_url": ...}`` dictionary per result, in page
        order.
    """
    wayback = "https://web.archive.org/web/20181114171728/"
    search_response = requests.get(
        wayback +
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")
    listing = bs(search_response.content, 'lxml')

    # Each result's href is a path on the archive host, so prefix that host.
    urls = []
    for entry in listing.find_all('div', class_='item'):
        urls.append("https://web.archive.org" + entry.find('a')['href'])
    print(urls)

    hemisphere_image_urls = []
    for path in urls:
        print(path)
        detail_response = requests.get(path)
        detail = bs(detail_response.content, 'lxml')
        heading = detail.find('h2', class_='title').text
        download_link = detail.find('div', class_='downloads').find('a')['href']
        record = {"title": heading, "img_url": download_link}
        hemisphere_image_urls.append(record)
    return hemisphere_image_urls

def scrape():
    """Run every scraper in turn and bundle the results into one dictionary.

    Returns
    -------
    dict
        Keys, in insertion order: ``news_title``, ``news_p``,
        ``featured_img_url``, ``last_tweet``, ``facts_df`` and
        ``hemisphere_img_urls``, each holding the corresponding scraper's
        return value.
    """
    headline, teaser = scrape_nasa_news()
    results = {"news_title": headline, "news_p": teaser}

    # The remaining scrapers each contribute a single entry.
    results["featured_img_url"] = scrape_nasa_spaceimages()
    results["last_tweet"] = scrape_twitter()
    results["facts_df"] = scrape_table()
    results["hemisphere_img_urls"] = scrape_hemisphere()
    return results

In [4]:
# Run all of the scrapers; as the cell's last expression, the returned
# dictionary is shown as the cell output.
scrape()

['https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']
https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced


{'news_title': 'Small Satellite Mission of the Year',
 'news_p': 'The first interplanetary CubeSats were recognized by the engineering community with the 2019 Small Satellite Mission of the Year award.',
 'featured_img_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA15256_ip.jpg',
 'last_tweet': 'InSight sol 250 (2019-08-10) low -100.0ºC (-148.1ºF) high -26.2ºC (-15.1ºF) winds from the SSE at 4.4 m/s (9.8 mph) gusting to 16.2 m/s (36.2 mph) pressure at 7.60 hPa',
 'facts_df':                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC