In [2]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
from splinter import Browser
import time

##Set up the chromedriver browser
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser("chrome", **executable_path, headless=False)

## Nasa.gov News

In [2]:

##Requests.get doesn't seem to return the latest articles, they may be loaded by Javascript
##Chromedriver will be used instead.
browser.visit('https://mars.nasa.gov/news/') #send it to the URL
time.sleep(3) #Wait for loading
mnews_html = browser.html #get the HTML
nasa_soup = bs(mnews_html, "html.parser") #Parse the HTML into soup

In [3]:
## Each article is in a li with class 'slide', we only need the first one.
latestnews=nasa_soup.find('li', class_='slide')
news_title=latestnews.find('div', class_='content_title').get_text()
news_text=latestnews.find('div', class_='article_teaser_body').get_text()
print(f'{news_title}\n{news_text}')

NASA to Host Media Call on Next Mars Landing Site
NASA will host a media teleconference at 9 a.m. PST (noon EST) Monday, Nov. 19, to provide details about the Mars 2020 rover’s landing site on the Red Planet.


## JPL Images

In [3]:
#We can get the data from the JPL space images site with requests.get but the directions ask for splinter/chromedriver
jpl_url = 'https://www.jpl.nasa.gov'
jpl_search = '/spaceimages/?search=&category=Mars'
browser.visit(f'{jpl_url}{jpl_search}')
time.sleep(3)
jpl_html = browser.html
jpl_soup = bs(jpl_html, "html.parser")


In [4]:
#The featued image is in a section with class 'main_feature'
#The URL is listed in the article tag's style key, in the format:
#"background-image: url('/spaceimages/images/wallpaper/PIA19964-1920x1200.jpg');"
img_style = jpl_soup.find('section', class_='main_feature').article['style']

img_loc = img_style.find('url') #runs through the text string to find the character position where 'url' is listed
img_loc += len("url('") #skip past the part of the string that says url('
img_end = len(img_style) - len("');") #stop before the part of the string that says ');
img_url = jpl_url + img_style[img_loc:img_end] 
#img_style[img_loc:img_end] slices out the URL part of the string based on character positions found in the last two lines - then we append the JPL base URL, since the HTML lists a relative URL

img_title = jpl_soup.find('section', class_='main_feature').article['alt'] #gets the alt text, which appears to always be the title of the image

print(f'{img_title}\n{img_url}')

Dwarf Galaxy Spotted
https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA16613-1920x1200.jpg


## Weather

In [11]:
weather_html = requests.get('https://twitter.com/marswxreport?lang=en') 
#twitter lets us get the html through requests.get() so let's do that

<Response [200]>

In [10]:
twit_soup = bs(weather_html.text, "html.parser") #pass HTML to soup
weather=twit_soup.find('div', class_='js-tweet-text-container').get_text() #This will get only the latest tweet
weather=weather.replace('\n','') #strip out newline characters
weather

'Sol 2229 (2018-11-13), high -2C/28F, low -71C/-95F, pressure at 8.62 hPa, daylight 06:22-18:39'

## Space-facts.com table

In [6]:
#Only one table is returned
mfacts_df = pd.read_html('https://space-facts.com/mars/')[0]


In [8]:
mfacts_df.rename(columns={'0':'Description'})

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [13]:
table_html = mfacts_df.to_html(index=False, header=False)
table_html = str(table_html).replace('\n','') #For unclear reasons, strip declined to remove the \n tags, the replace method was substituted.
table_html

'<table border="1" class="dataframe">  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-153 to 20 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

   ## USGS Astrogeology

In [22]:
root_url='https://astrogeology.usgs.gov'
browser.visit(root_url + '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
time.sleep(3)
usgs_results_html = browser.html
usgs_results_soup = bs(usgs_results_html, "html.parser")

In [23]:
def usgs_findlink(html):
    """Takes HTML from https://astrogeology.usgs.gov/ hemisphere search results"""
    """returns a dictionary with {title:<title>, img_url:<url>"""
    lsoup = bs(html, "html.parser")
    dl_links = lsoup.find('div', class_='downloads')  #Gets everything in the 'downloads' div
    for item in dl_links.find_all('li'):  #loops through the links looking for the 'sample' - my browser wouldn't load the original because of 'tif' format
        if (item.a.get_text()) == 'Sample':
            img_url = item.a['href']
            break
    lastchar = lsoup.title.text.find(' Enhanced') #Finds the char number of the space before the word enhanced - 
    #which comes after the hemisphere name in the HTML title
    title = lsoup.title.text[:lastchar] #slices the HTML title up to that character number
    out_dict = {"title": title, "img_url": img_url}
    return out_dict

In [24]:
hemisphere_image_urls = [] #This will contain the dictionaries requested by the assignment
linklist = [] #this will contain relative path URLs that will be extracted from the soup
astrogeo_items = usgs_results_soup.find_all('div', class_='item')

#Gather up the addresses we need to scrape
for item in astrogeo_items: 
    linklist.append(item.a['href'])

#Loop through and get the data from each one, using the function we just made
for link in linklist:
    browser.visit(root_url + link)
    time.sleep(4)
    lhtml = browser.html
    hemisphere_image_urls.append(usgs_findlink(lhtml))
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
