In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
from splinter import Browser
import time
import pandas as pd
import requests
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Initialize Browser 

def init_browser():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser("chrome", **executable_path, headless=False)

In [3]:
# Scrape News Headline
def scrape_news():
    browser = init_browser()
    url = 'https://redplanetscience.com/'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    browser.quit()
    return [news_title, news_p]

#print(scrape_news())

In [4]:
# Scrape Featured Image

def scrape_image():
    browser = init_browser()
    url = 'https://spaceimages-mars.com'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = bs(html, 'html.parser')
    relative_image_path = soup.find_all('img')[1]['src']
    featured_image_url = url + "/" + relative_image_path
    browser.quit()
    return(featured_image_url)

#print(scrape_image())

In [5]:
# Scrape Mars Facts Table

"""Visit the Mars Facts webpage [here](https://galaxyfacts-mars.com) 
and use Pandas to scrape the table containing facts about the planet 
including Diameter, Mass, etc."""

# Use Pandas to convert the data to a HTML table string.
def scrape_facts():
    url = 'https://galaxyfacts-mars.com'
    table_pandas = pd.read_html(url)[0]
    table_pandas = table_pandas.iloc[1:]
    table_pandas.columns = ['Description', 'Mars', 'Earth']
    table_pandas = table_pandas.set_index('Description')

    table_pandas = table_pandas.to_html()
    return table_pandas

#print(scrape_facts())

In [9]:
## Scrape Mars Hemisphere Images
# Step 1 - Get hemisphere titles containing the hemisphere name with Beautiful Soup
""" Step 2 - Get image url string for the full resolution 
# hemisphere images using Splinter"""

def scrape_hemispheres():
    url = 'https://marshemispheres.com/'
    response = requests.get(url)
    soup_1 = bs(response.text, 'html')
    browser = init_browser()
    list_urls = ['https://marshemispheres.com/cerberus.html', 'https://marshemispheres.com/schiaparelli.html', 
                 'https://marshemispheres.com/syrtis.html', 'https://marshemispheres.com/valles.html'] 
    
    results = soup_1.find_all('h3')

    title_list = []
    # For loop to get all the headers (hemispheres) 
    # dropped the 5th element called "Back"
    for result in results[:4]:
        # Grab the header text
        header = result.text
        # Remove the word "Enhanced"
        header_cleaned = header[:-9]
        title_list.append(header_cleaned)

    final_urls_list = []
    
    for url in list_urls:
        browser.visit(url)
        time.sleep(2)
        html = browser.html
        soup_2 = bs(html, 'html.parser')
        links = [a['href'] for a in soup_2.find_all('a', href=True)]
        image_url = links[3]
        print(image_url)
        full_url = url + "/" + image_url
        clean_url = url[:28]
        final_urls = clean_url + image_url
        final_urls_list.append(final_urls)

    hemisphere_image_urls = [
        {"title": title_list[0], "img_url": final_urls_list[0]},
        {"title": title_list[1], "img_url": final_urls_list[1]},
        {"title": title_list[2], "img_url": final_urls_list[2]},
        {"title": title_list[3], "img_url": final_urls_list[3]}
    ]

    browser.quit()
    return(hemisphere_image_urls)

print(scrape_hemispheres())



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Get LATEST driver version for 91.0.4472
Trying to download new driver from https://chromedriver.storage.googleapis.com/91.0.4472.101/chromedriver_mac64.zip
Driver has been saved in cache [/Users/amandapesch/.wdm/drivers/chromedriver/mac64/91.0.4472.101]


images/full.jpg
images/schiaparelli_enhanced-full.jpg
images/syrtis_major_enhanced-full.jpg
images/valles_marineris_enhanced-full.jpg
[{'title': 'Cerberus Hemisphere', 'img_url': 'https://marshemispheres.com/images/full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]


In [7]:
# Bring everything together into one dictionary

mars_data = {}

def scrape():
    news = scrape_news()
    featured_image = scrape_image()
    facts = scrape_facts()
    hemispheres = scrape_hemispheres()
    
    
    
    mars_data['news'] = news
    mars_data['featured_image'] = featured_image
    mars_data['facts'] = facts
    mars_data['hemispheres'] = hemispheres
    
    return mars_data
#print(scrape())