In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import requests

In [3]:
# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path)

In [4]:
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
#optional delay is useful because sometimes dynamic pages take a little while to load, especially if they are image-heavy

True

In [5]:
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [6]:
slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8682/the-extraordinary-sample-gathering-system-of-nasas-perseverance-mars-rover/" target="_self">The Extraordinary Sample-Gathering System of NASA's Perseverance Mars Rover</a></div>

In [7]:
# Use the parent element to find the first `a` tag and save it as `news_title`
news_title = slide_elem.find("div", class_='content_title').get_text()

In [8]:
news_title

"The Extraordinary Sample-Gathering System of NASA's Perseverance Mars Rover"

In [9]:
# if we were to use .find_all() instead of .find() when pulling the summary, 
# we would retrieve all of the summaries on the page instead of just the first one.
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

'Two astronauts collected Moon rocks on Apollo 11. It will take three robotic systems working together to gather up the first Mars rock samples for return to Earth.'

"### Featured Images"

In [10]:
# Visit URL
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)

In [12]:
# Find and click the full image button
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [13]:
# Find the more info button and click that
browser.is_element_present_by_text('more info', wait_time=1)
more_info_elem = browser.find_link_by_partial_text('more info')
more_info_elem.click()
# Use browser.links.find_by_partial_text instead



In [14]:
# Parse the resulting html with soup
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

In [15]:
# Find the relative image url
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA16565_hires.jpg'

In [16]:
# Use the base URL to create an absolute URL
img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16565_hires.jpg'

In [4]:
df = pd.read_html('http://space-facts.com/mars/')[0]
df.columns=['description', 'value']
df.set_index('description', inplace=True)
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [8]:
try:
    # use 'read_html" to scrape the facts table into a dataframe
    df = pd.read_html('http://space-facts.com/mars/')[0]

except BaseException:
    df=None
    
    
print(df)


                      0                              1
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
3                Moons:            2 (Phobos & Deimos)
4       Orbit Distance:       227,943,824 km (1.38 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                   -87 to -5 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers


In [22]:
# pd.read_html scrapes entire table. automatically parse the table.[0] says to pull the first table it sees.
# pd can covert DF back to html-ready code
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [54]:
# Visit hemispheres image URL
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

In [55]:
# # Parse the resulting html with soup
html = browser.html
HR_img_soup = BeautifulSoup(html, 'html.parser')

In [56]:
# Find and click the full image hemisphere link
HR_image_elem = browser.links.find_by_partial_text('Hemisphere')
HR_image_elem.click()

In [57]:
# Find the relative image url
soup1 = HR_img_soup.find('div','downloads')
HR_img_url_rel = soup1.a['href']
HR_img_url_rel



AttributeError: 'NoneType' object has no attribute 'a'

In [42]:
# Find the more info button and click that
browser.links.find_by_partial_text('Sample')
HR_sample_elem = browser.links.find_by_partial_text('Sample')
HR_sample_elem.click()
# Use browser.links.find_by_partial_text instead

In [19]:
# Use the base URL to create an absolute URL
cerberus_url = f'https://astropedia.astrogeology.usgs.gov{HR_img_url_rel}'
cerberus_url

'https://astropedia.astrogeology.usgs.gov//images/usgs_logo_main_2x.png'

In [66]:
# Visit hemispheres image URL
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

In [67]:
# Find and click the full image hemisphere link
HR2_image_elem = browser.links.find_by_partial_text('Schiaparelli Hemisphere')

HR2_image_elem.click()

In [68]:
x= browser.find_link_by_text('Sample').first
image = x['href']



In [69]:
image

'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'

In [34]:
# Find the more info button and click that
browser.links.find_by_partial_text('Sample')
HR2_sample_elem = browser.links.find_by_partial_text('Sample')
HR2_sample_elem.first.click()
# Use browser.links.find_by_partial_text instead

In [35]:
# Parse the resulting html with soup
html = browser.html
HR2_img_soup = BeautifulSoup(html, 'html.parser')

In [36]:
# Find the relative image url
HR2_img_url_rel = HR2_img_soup.select_one('img').get("src")
HR2_img_url_rel

'/images/usgs_logo_main_2x.png'

In [37]:
# Use the base URL to create an absolute URL
schiaparelli_url = f'https://astropedia.astrogeology.usgs.gov{HR2_img_url_rel}'
schiaparelli_url

'https://astropedia.astrogeology.usgs.gov//images/usgs_logo_main_2x.png'

In [69]:
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser').find_all("a",class_ = "itemLink product-item")
hemi_titles = []
for i in soup:
    title = i.find("h3").text
    link= i["href"]
    # or i.a["href"]
    hemi_titles.append(title)
print(hemi_titles)

['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']


In [70]:
# Visit hemispheres image URL
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

In [62]:
image_dict={}
new_dict={}
# dict.items()
for x in range(len(hemi_titles)):
    
    try:
        HR_image_elem = browser.click_link_by_partial_text(hemi_titles[x])
        HR_page = browser.links.find_by_text('Sample').first
        image_dict['image_url'] = HR_page['href']
        image_dict['title'] = hemi_titles[x]
        
    except:
        browser.links.find_by_text("2").first.click()
        HR_page = browser.links.find_by_text('Sample').first
        image_dict['image_url'] = HR_page['href']
        image_dict['title'] = hemi_titles[x]
        
        try:
            HR_image_elem = browser.click_link_by_partial_text(hemi_titles[x])
            HR_page = browser.links.find_by_text('Sample').first
            image_dict['image_url'] = HR_page['href']
            image_dict['title'] = hemi_titles[x]
            
        except:
            browser.links.find_by_text("2").first.click()
            HR_page = browser.links.find_by_text('Sample').first
            image_dict['image_url'] = HR_page['href']
            image_dict['title'] = hemi_titles[x]
            
    
    image_dict.items()
    for key, value in image_dict.items():
        new_dict.update({key:value})
        print(key,value)

    

image_url https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
title Cerberus Hemisphere Enhanced
image_url https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
title Schiaparelli Hemisphere Enhanced
image_url https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
title Syrtis Major Hemisphere Enhanced
image_url https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
title Valles Marineris Hemisphere Enhanced


In [120]:
# image_dict.items()

dict_items([('image_url', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'), ('title', 'Valles Marineris Hemisphere Enhanced')])

In [125]:
# print(new_dict.items())

dict_items([('image_url', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'), ('title', 'Valles Marineris Hemisphere Enhanced')])


In [8]:
hemisphere_info = pd.DataFrame(columns=["title", "image_url"])

In [7]:


url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser').find_all("a",class_ = "itemLink product-item")
hemi_titles = []
for i in soup:
    title = i.find("h3").text
    link= i["href"]
    # or i.a["href"]
    hemi_titles.append(title)
print(hemi_titles)
# hemi_title_list=[]
# Visit hemispheres image URL
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)
mars_hemis=[]

for x in range(len(hemi_titles)):
    title = i.find("h3").text
    link= i["href"]
    try:
        browser.click_link_by_partial_text(hemi_titles[x])
        
    except:
        browser.find_link_by_text('2').first.click()
        browser.click_link_by_partial_text(hemi_titles[x])
        

    
    html = browser.html
    soup2 = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup2.find('div', 'downloads')
    hemi_url = hemi_soup.a['href']
    
#     hemi_dict={"title": hemi_titles[x], 'img_url': hemi_url}
#     hemisphere_info = hemisphere_info.append({'title' : hemi_titles, 'image_url' : hemi_url}, ignore_index=True)
    mars_hemis.append({'title':title,'img_url':hemi_url})


['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']


In [18]:


# Visit hemispheres image URL
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)

soup = BeautifulSoup(html, 'html.parser')

# start of making list into dict.
mars_hemis=[]

# loop through hemipheres
hemispheres = soup.find_all("div", class_="description")

for x in hemispheres:
    title = i.find("h3").text
    link= i["href"]
    try:
        browser.click_link_by_partial_text(title)
        
    except:
        browser.find_link_by_text('2').first.click()
        browser.click_link_by_partial_text(title)
        

    
    html = browser.html
    soup2 = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup2.find('div', 'downloads')
    hemi_url = hemi_soup.a['href']
    
#     hemi_dict={"title": hemi_titles[x], 'img_url': hemi_url}
#     hemisphere_info = hemisphere_info.append({'title' : hemi_titles, 'image_url' : hemi_url}, ignore_index=True)
    mars_hemis.append({'title':title,'img_url':hemi_url})

ElementDoesNotExist: no elements could be found with link by partial text "Valles Marineris Hemisphere Enhanced"

In [8]:
title

'Valles Marineris Hemisphere Enhanced'

In [14]:
mars_hemis[1]['img_url']


'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'

In [18]:
hemisphere_info

Unnamed: 0,title,image_url
0,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",[https://astropedia.astrogeology.usgs.gov/down...
1,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",[https://astropedia.astrogeology.usgs.gov/down...
2,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",[https://astropedia.astrogeology.usgs.gov/down...
3,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",[https://astropedia.astrogeology.usgs.gov/down...
4,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",https://astropedia.astrogeology.usgs.gov/downl...
5,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",https://astropedia.astrogeology.usgs.gov/downl...
6,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",https://astropedia.astrogeology.usgs.gov/downl...
7,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",https://astropedia.astrogeology.usgs.gov/downl...


In [21]:
img_1=hemisphere_info['image_url'][]
img_1


KeyError: 9

In [39]:
hemi_titles[0]

'Cerberus Hemisphere Enhanced'

In [49]:
# res = {test_keys[i]: test_values[i] for i in range(len(test_keys))} 
new_dict = {hemi_titles[i]:hemi_url_list[i] for i in range(len(hemi_titles))}

In [55]:
# res = {test_keys[i]: test_values[i] for i in range(len(test_keys))} 
new_dict2 = {'title':hemi_titles,'img_url':hemi_url_list}#for i in range(len(hemi_titles))}
print(new_dict2)

{'title': ['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced'], 'img_url': ['https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg']}


In [50]:
new_dict

{'Cerberus Hemisphere Enhanced': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
 'Schiaparelli Hemisphere Enhanced': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
 'Syrtis Major Hemisphere Enhanced': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
 'Valles Marineris Hemisphere Enhanced': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}

In [61]:
new_dict['Cerberus Hemisphere Enhanced']

'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'

In [63]:
new_dict2['title'][2]

'Syrtis Major Hemisphere Enhanced'

In [53]:
hemi_dict

{'title': 'Valles Marineris Hemisphere Enhanced',
 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}

In [81]:
test=[]
d={}
for i in hemi_titles:
    d['title']=i
    test.append(d.copy())
    print(d)


{'title': 'Cerberus Hemisphere Enhanced'}
{'title': 'Schiaparelli Hemisphere Enhanced'}
{'title': 'Syrtis Major Hemisphere Enhanced'}
{'title': 'Valles Marineris Hemisphere Enhanced'}


In [83]:
print(d)

{'title': 'Valles Marineris Hemisphere Enhanced'}


In [44]:
test2=[]
c={}
for i in hemi_url_list:
    c['img_url']=i
    test2.append(c.copy())
    print(c)

{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}
{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}
{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}
{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}


In [48]:
test2[0]

{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}

In [42]:
img2=test2[1]
img2

{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}

In [44]:
hemisphere_info = pd.DataFrame(columns=["title", "image_url"])

In [47]:
 hemisphere_info = hemisphere_info.append({'title' : hemi_titles[x], 'image_url' : hemi_url_list}, ignore_index=True)

In [32]:
hemi_titles[3]

'Valles Marineris Hemisphere Enhanced'

In [48]:
hemisphere_info

Unnamed: 0,title,image_url
0,"[Cerberus Hemisphere Enhanced, Schiaparelli He...",[https://astropedia.astrogeology.usgs.gov/down...
1,Valles Marineris Hemisphere Enhanced,[https://astropedia.astrogeology.usgs.gov/down...


In [None]:
hemi_dict3{
    "url1":
}

In [126]:
browser.quit()