# Mission to Mars Data Scraping

## 1. Dependencies

In [1]:
import pandas as pd
from datetime import datetime as dt

import pymongo

from splinter import Browser
import time
from bs4 import BeautifulSoup as bs

## 2. Scrape Data
### 2.1 Launch Chrome Driver

In [2]:
# launch chromedriver -- get an empty page
# NOTE(review): the driver location is a hard-coded absolute path -- confirm it matches the machine running this notebook
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# headless=False is passed to splinter -- presumably this opens a visible Chrome window; TODO confirm
browser = Browser('chrome', **executable_path, headless=False)

### 2.2 Scrape `NASA Mars News Site` (https://mars.nasa.gov/news/)

In [3]:
# point the browser launched above at the NASA Mars News page
url_mars_news = 'https://mars.nasa.gov/news/'
browser.visit(url_mars_news)
# fixed 2-second pause before the page HTML is read in the next cell
# (presumably to let dynamically loaded content render -- TODO confirm it is long enough)
time.sleep(2)

In [4]:
# parse the HTML currently shown in the browser so the latest
# news titles and paragraph text can be extracted from it
soup = bs(browser.html, 'html.parser')

In [5]:
# every news entry on the page is looked up as an <li class="slide"> element
content = soup.body.find_all('li', class_='slide')

# one dictionary per article: publish date, title, link and teaser text
news = []

for cont in content:
    # locate the three parts of an entry: title (with link), teaser, publish date
    title_div = cont.find('div', class_='content_title')
    abstract_div = cont.find('div', class_='article_teaser_body')
    list_date = cont.find('div', class_='list_date')

    # find() returns None when an element is missing; skip incomplete entries
    # instead of aborting the whole loop with an AttributeError
    if title_div is None or title_div.a is None or abstract_div is None or list_date is None:
        continue

    # article title and absolute link (the href on the page is site-relative)
    weblink = 'https://mars.nasa.gov' + title_div.a['href']
    title = title_div.a.text.strip()

    # article teaser (abstract)
    abstract = abstract_div.text

    # publish date, e.g. "August 5, 2019"; strip surrounding whitespace so
    # strptime does not reject the text
    date = dt.strptime(list_date.text.strip(), '%B %d, %Y')

    # named news_item (not dict) so the built-in dict type is not shadowed
    news_item = {"date": date, "title": title, "weblink": weblink, "abstract": abstract}

    news.append(news_item)

In [6]:
# show the first scraped news entry, one "key : value" pair per line
print("\nExample Scraped Content:")
print('---------------------------------')
print("\n{")
for field, value in news[0].items():
    print(f"     {field} : {value}")
print("}")


Example Scraped Content:
---------------------------------

{
     date : 2019-08-05 00:00:00
     title : NASA 'Optometrists' Verify Mars 2020 Rover's 20/20 Vision
     weblink : https://mars.nasa.gov/news/8499/nasa-optometrists-verify-mars-2020-rovers-2020-vision/
     abstract : Mars 2020 rover underwent an eye exam after several cameras were installed on the rover.
}


### 2.3 Scrape `JPL Mars Space Feature Images` (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)

In [7]:
# reuse the same browser session to visit another url
url_jpl_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url_jpl_img)
# fixed 2-second pause before the page HTML is read in the next cell
# (presumably to let the page finish rendering -- TODO confirm it is long enough)
time.sleep(2)

In [8]:
# parse the HTML of the JPL page so the featured image can be located in it;
# note that this rebinds `soup`, replacing the news-page parse from earlier
soup = bs(browser.html, 'html.parser')

In [9]:
# the featured image lives on the first "carousel_item" <article> inside the
# "primary_media_feature" section
feature_section = soup.body.find('section', class_='primary_media_feature')
content = feature_section.find('article', class_="carousel_item")

# its style attribute embeds the image path as url('...'):
# keep only the text between the opening url(' and the closing ')
style_attr = content['style']
after_open = style_attr.split("url('")[1]
img_href = after_open.split("')")[0]

# the extracted path is site-relative, so prefix the JPL host
featured_image_url = 'https://www.jpl.nasa.gov' + img_href

In [10]:
# show the scraped featured-image url under a short header
print("\nExample Scraped Content:", '---------------------------------', sep="\n")
print(featured_image_url)


Example Scraped Content:
---------------------------------
https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA19180-1920x1200.jpg


### 2.4 Scrape `Mars Weather Twitter` (https://twitter.com/marswxreport?lang=en)

In [11]:
# reuse the same browser session to visit another url
url_mars_weather = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url_mars_weather)
# fixed 2-second pause before the page HTML is read in the next cell
# (presumably to let the timeline load -- TODO confirm it is long enough)
time.sleep(2)

In [12]:
# parse the HTML of the Mars Weather twitter page so the tweets can be extracted;
# note that this rebinds `soup`, replacing the parse of the previous page
soup = bs(browser.html, 'html.parser')

In [13]:
# collect every <li class="stream-item"> element; each one is treated as a tweet below
tweets = soup.body.find_all('li', class_='stream-item')

In [14]:
# weather reports posted by the MarsWxReport account itself, in page order
weather_post_by_MarsWxReport = []


for tweet in tweets:
    post = tweet.find('div', class_='tweet')

    # find() returns None for stream items without a tweet <div>, and .get()
    # returns None when the attribute is absent -- skip both cases instead of
    # raising TypeError / KeyError
    if post is None or post.get('data-screen-name') != "MarsWxReport":
        continue

    # a tweet without a text paragraph cannot hold a weather report
    text_tag = post.find('p', class_='tweet-text')
    if text_tag is None:
        continue

    post_text = text_tag.text
    if "InSight sol" in post_text:
        # keep the text up to and including the pressure unit, dropping whatever
        # follows it; partition() leaves the text unchanged when 'hPa' is absent,
        # rather than appending a spurious 'hPa'
        report, unit, _ = post_text.partition('hPa')
        weather_post_by_MarsWxReport.append(report + unit)

In [15]:
# show the first collected weather report under a short header
print("\nExample Scraped Content:", '---------------------------------\n', sep="\n")

first_weather_post = weather_post_by_MarsWxReport[0]
print(first_weather_post)


Example Scraped Content:
---------------------------------

InSight sol 245 (2019-08-05) low -99.9ºC (-147.8ºF) high -25.6ºC (-14.1ºF)
winds from the SSE at 4.6 m/s (10.2 mph) gusting to 17.7 m/s (39.5 mph)
pressure at 7.60 hPa


### 2.5 Scrape `Mars Facts` (https://space-facts.com/mars/)

In [16]:
# reuse the same browser session to visit another url
url_mars_fact = 'https://space-facts.com/mars/'
browser.visit(url_mars_fact)
# fixed 2-second pause before the page HTML is read in the next cell
# (presumably to let the page finish rendering -- TODO confirm it is long enough)
time.sleep(2)

In [17]:
# read_html returns a list with one DataFrame per <table> found in the page
# NOTE(review): recent pandas versions warn when given a raw HTML string;
# wrap browser.html in io.StringIO if that warning appears
mars_fact = pd.read_html(browser.html)

# get the MARS FACT table (the second table in the list); work on a copy so
# the parsed list itself is left untouched
mars_fact_df = mars_fact[1].copy()
mars_fact_df.columns = ["Profile", 'Value']

# get rid of the trailing ":" on each profile label -- vectorised, and it only
# removes a colon (the old slice dropped the last character whatever it was)
mars_fact_df["Profile"] = mars_fact_df["Profile"].str.rstrip(":")

mars_fact_df

Unnamed: 0,Profile,Value
0,Equatorial Diameter,"6,792 km"
1,Polar Diameter,"6,752 km"
2,Mass,6.39 × 10^23 kg (0.11 Earths)
3,Moons,2 (Phobos & Deimos)
4,Orbit Distance,"227,943,824 km (1.38 AU)"
5,Orbit Period,687 days (1.9 years)
6,Surface Temperature,-87 to -5 °C
7,First Record,2nd millennium BC
8,Recorded By,Egyptian astronomers


In [33]:
# !open mars_fact.html
# render the fact table as an HTML string and swap the "dataframe" class in
# the generated markup for the table classes used when styling the page
a = mars_fact_df.to_html()
a = a.replace("dataframe", "table table-striped table-hover")

# the table holds non-ASCII characters (e.g. "×" and "°"), so write it as
# UTF-8 explicitly instead of relying on the platform default encoding;
# the with-block closes the file even if the write fails
with open('mars_fact.html', 'w', encoding='utf-8') as file:
    file.write(a)

<table border="1" class="table table-striped table-hover">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Profile</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Equatorial Diameter</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Polar Diameter</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Mass</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Moons</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Orbit Distance</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Orbit Period</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Surface Temperature</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>7</th>
      <td>First Record</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>8

### 2.6 Scrape `USGS Astrogeology site` (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars)

In [20]:
# reuse the same browser session to visit another url
url_mars_hemisphere = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_mars_hemisphere)
# fixed 2-second pause before the page HTML is read in the next cell
# (presumably to let the search results load -- TODO confirm it is long enough)
time.sleep(2)

In [21]:
# parse the HTML of the USGS search-results page;
# note that this rebinds `soup`, replacing the parse of the previous page
soup = bs(browser.html, 'html.parser')

In [22]:
# every <div class="item"> inside the "collapsible results" container; each one is
# treated as a single hemisphere search result in the next cell
content_div = soup.body.find('div', class_='collapsible results').find_all('div',class_='item')

In [23]:
# one {"title", "img_url"} dictionary per hemisphere search result
hemisphere_image_urls = []

for item in content_div:
    # the <h3> text of a result doubles as the text of the link to its detail page
    title = item.find('h3').text

    # follow that link to open the detail page
    browser.find_link_by_partial_text(title).first.click()

    # on the detail page, take the first link inside the "downloads" <div>
    soup_find_img = bs(browser.html, 'html.parser')
    downloads_div = soup_find_img.find('div', class_='downloads')
    img_url = downloads_div.find('a')['href']

    # record the pair, then go back to the results page for the next item
    hemisphere_image_urls.append({"title": title, "img_url": img_url})
    browser.back()

In [24]:
# show the first scraped hemisphere entry, one "key : value" pair per line
print("\nExample Scraped Content:")
print('---------------------------------')
print("\n{")
for field, value in hemisphere_image_urls[0].items():
    print(f"     {field} : {value}")
print("}")


Example Scraped Content:
---------------------------------

{
     title : Cerberus Hemisphere Enhanced
     img_url : http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
}
