# Import Dependencies and the required modules

In [1]:
# Dependencies
import pandas as pd
import pdb

# Import BeautifulSoup
from bs4 import BeautifulSoup as bs
import requests

# Import Splinter and set the chromedriver path
from splinter import Browser
from selenium import webdriver

# Module used to connect Python with MongoDb
import pymongo

# NASA Mars News

In [2]:
# Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
# Assign the text to variables that you can reference later.

# URL of page to be scraped
url_nasa = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

# executable path and browser
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)
browser.visit(url_nasa)

# Retrieve page with the requested module
#response = requests.get(url_nasa)

# Create Beatifulsoup object; pase with 'html.parser' or 'lxml'
soup = bs(browser.html, 'html.parser')

# Examine the results, then determine element that contains the info needed
#print(soup.prettify()) #Note: it was very useful to print the first time, afterwards it is impractical.

In [3]:
# Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
# Assign the text to variables that you can reference later.
# Results are returned as an iterable list
nasa_news_results = soup.find_all('li', class_='slide')  
#print(nasa_news_results)

In [4]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collection
db = client.nasaNews_db
collection = db.items

# Retrieve page with the requests module
response = requests.get(url_nasa)

In [5]:
# Loop through returned results
for result in nasa_news_results:
    try:
        nasa_news = result.find('div', class_= 'image_and_description_container')
        date = nasa_news.find('div', class_='list_date').text
        title = result.find('div', class_='content_title').text
        paragraph = nasa_news.find('div', class_='article_teaser_body').text
        
        if (date and title and paragraph):
            print('------------------------')
            print(date)
            print(title)
            print(paragraph)
            
             # Dictionary to be inserted as a MongoDB document
            post = {
                'date': date,
                'title': title,
                'paragraph': paragraph
            }

            collection.insert_one(post)
              
    except AttributeError as e:
        print(e)

------------------------
March  5, 2020
Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover
NASA chose a seventh-grader from Virginia as winner of the agency's "Name the Rover" essay contest. Alexander Mather's entry for "Perseverance" was voted tops among 28,000 entries. 
------------------------
March  4, 2020
NASA's Curiosity Mars Rover Snaps Its Highest-Resolution Panorama Yet
To go along with the stunning 1.8-billion-pixel image, a new video offers a sweeping view of the Red Planet.
------------------------
March  3, 2020
NASA to Reveal Name of Its Next Mars Rover
After a months-long contest among students to name NASA's newest Mars rover, the agency will reveal the winning name — and the winning student — this Thursday. 
------------------------
February 27, 2020
The MarCO Mission Comes to an End
The pair of briefcase-sized satellites made history when they sailed past Mars in 2019.
------------------------
February 24, 2020
A Year of Surprising Science Fr

In [6]:
# Display items in MongoDB collection
listings = db.items.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('5e6dab6c9c1c6d1d199519cf'), 'date': 'March  5, 2020', 'title': "Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover", 'paragraph': 'NASA chose a seventh-grader from Virginia as winner of the agency\'s "Name the Rover" essay contest. Alexander Mather\'s entry for "Perseverance" was voted tops among 28,000 entries. '}
{'_id': ObjectId('5e6dab6c9c1c6d1d199519d0'), 'date': 'March  4, 2020', 'title': "NASA's Curiosity Mars Rover Snaps Its Highest-Resolution Panorama Yet", 'paragraph': 'To go along with the stunning 1.8-billion-pixel image, a new video offers a sweeping view of the Red Planet.'}
{'_id': ObjectId('5e6dab6c9c1c6d1d199519d1'), 'date': 'March  3, 2020', 'title': 'NASA to Reveal Name of Its Next Mars Rover', 'paragraph': "After a months-long contest among students to name NASA's newest Mars rover, the agency will reveal the winning name — and the winning student — this Thursday. "}
{'_id': ObjectId('5e6dab6c9c1c6d1d199519d2'), 'date': 'Fe

In [7]:
# How can I expor this mongo collecto to CSV? (Not required for homework, but I'm interested in how to do it)
# Export MongoDB collection
#mongoexport --db nasaNews_db --collection items --type=csv --fields _id,date,title,paragraph --out "nasaNews.csv"

# JPL Mars Space Images - Featured Image

In [8]:
# Visit the url for JPL Featured Space Image to be scraped
url_mars_images = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

# executable path and browser
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

# Scrape site
browser.visit(url_mars_images)

# Create Beatifulsoup object; pase with 'html.parser' or 'lxml'
soup = bs(browser.html, 'html.parser')

In [9]:
# Image source
mars_image_source = soup.find(class_ = 'carousel_item')['style']

In [10]:
print(mars_image_source)

background-image: url('/spaceimages/images/wallpaper/PIA18058-1920x1200.jpg');


In [11]:
# I need to split the string on the url only
image_string = mars_image_source.split("'")[1]
print(image_string)

/spaceimages/images/wallpaper/PIA18058-1920x1200.jpg


In [12]:
# Featured Mars Image and assign the url string to a variable called featured_image_url.
# I need the base url
base_url = "https://jpl.nasa.gov"

# the string URL I got in the previous step. I'll use the variable named 
# indicated in the homework's instructions. 
featured_image_url = base_url + image_string

# Make sure to save a complete url string for this image.
print(featured_image_url)

https://jpl.nasa.gov/spaceimages/images/wallpaper/PIA18058-1920x1200.jpg


In [13]:
# How can I save this image? (Not required for homework, but I'm interested in how to do it)
#feature_image_url.save_as.jpg

# Mars Facts

In [14]:
# Visit the Mars Facts webpage 
url_mars_facts = 'https://space-facts.com/mars/'
#executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
#browser = Browser("chrome", **executable_path, headless=False)
#browser.visit(url_mars_facts)
#soup = bs(browser.html, 'html.parser')

In [15]:
#Use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
#to scrape using pandas and nothing else, I have to use the following. Please note that
# the [1] was added b/c the table has two elements and without it, it marked an error
mars_table_df = pd.read_html(url_mars_facts)[1] 
mars_table_df.columns = ['', 'Value']
mars_table_df

Unnamed: 0,Unnamed: 1,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [16]:
# Use Pandas to convert the data to a HTML table string.
mars_table_string = pd.DataFrame(mars_table_df)
print(mars_table_string.to_string())

                                                 Value
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
3                Moons:            2 (Phobos & Deimos)
4       Orbit Distance:       227,943,824 km (1.38 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                   -87 to -5 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers


# Mars Hemispheres

In [17]:
# Visit the USGS Astrogeology site here to obtain high 
# resolution images for each of Mar's hemispheres.
# Visit the url for JPL Featured Space Image to be scraped
url_mars_hemispheres_base = 'https://astrogeology.usgs.gov' 
url_mars_hemispheres_complete = url_mars_hemispheres_base + "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)
browser.visit(url_mars_hemispheres_complete)
soup = bs(browser.html, 'html.parser')
#print(soup.prettify())

In [18]:
# Create a list to hold the urls
hemisphere_image_urls = []

# You will need to click each of the links to the hemispheres in order to find the image url 
# to the full resolution image.
# Find step by step and append each time to the list.
hemi_first_step = soup.find('div', id='product-section')
hemi_second_step = hemi_first_step.find_all('div', class_='item')
for item in hemi_second_step:
    # Save the Hemisphere title containing the hemisphere name. 
    title = item.find('h3').text 
    new_link = item.find('a')['href']
    browser.visit(url_mars_hemispheres_base + new_link)
    soup = bs(browser.html, 'html.parser')
    downloads = soup.find('div', class_='downloads')
    # Save both the image url string for the full resolution hemisphere image,
    url = downloads.find('a')['href'] 
    # Use a Python dictionary to store the data using the keys img_url and title.
    #Append the dictionary with the image url string and the hemisphere title to a list.
    hemisphere_image_urls.append({'img_url': url, 'title': title})

# This list will contain one dictionary for each hemisphere.
# Print the list
hemisphere_image_urls

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

# Mars Weather

In [None]:
# Visit and scrape the Mars Weather twitter account. Find the latest Mars weather tweets from the page. 
url_marsTwitter = "https://twitter.com/marswxreport?lang=en"
#executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
#browser = Browser("chrome", **executable_path, headless=False)
browser.visit(url_marsTwitter)
#twitter_soup = bs(browser.html, 'html.parser')
#print(twitter_soup.prettify())

In [None]:
#SCRAP to get the requested info
#weather_results = twitter_soup.body.div.div.div.main.div.div.div.div.div.div.div
#print(weather_results)
#print(mars_weather[3])

# class is "TweetTextSize"
# using Splinter within a list comprehension to strip the tweet down to its text; we want the top 
# tweet, so that's index 0
#mars_weather = [tweet.text.strip() for tweet in soup.find_by_css(".TweetTextSize")]
#tweet = soup('span',class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0").text 



#tweet = soup.find("#react-root > div > div > div.css-1dbjc4n.r-1pi2tsx.r-13qz1uu.r-417010 > main > div > div > div > div > div > div > div > div > div:nth-child(3) > section > div > div > div > div:nth-child(4) > div > article > div > div > div:nth-child(2) > div.css-1dbjc4n.r-1iusvr4.r-16y2uox.r-1777fci.r-5f2r5o.r-1mi0q7o > div:nth-child(2) > div:nth-child(1) > div > span")
# Printing results
#print(tweet)

# Save the tweet text for the weather report as a variable called mars_weather.
# Change browser to twitter

#mars_weather = [tweet.text.strip() for tweet in soup.find_by_css(".TweetTextSize")]

#mars_weather = [tweet['@MarsWxReport'] for tweet in soup if tweet['@MarsWxReport'][0]]

# Step 2 - MongoDB and Flask Application

In [None]:
# Use MongoDB with Flask templating to create a new HTML page that displays all of the information 
# that was scraped from the URLs above.
# Start by converting your Jupyter notebook into a Python script called scrape_mars.py 
# with a function called scrape that will execute all of your scraping code from above 
# and return one Python dictionary containing all of the scraped data.
# Next, create a route called /scrape that will import your scrape_mars.py script 
# and call your scrape function.
# Store the return value in Mongo as a Python dictionary.
# Create a root route / that will query your Mongo database and pass the mars data into an HTML 
# template to display the data.
# Create a template HTML file called index.html 
# that will take the mars data dictionary and display all of the data in the 
# appropriate HTML elements. 
# Use the following as a guide for what the final product should look like, 
# but feel free to create your own design.