In [2]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

In [2]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [C:\Users\dhanu\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


# Article Scraping

In [73]:
# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay
browser.is_element_present_by_css('dev.list_text',wait_time=1)

False

In [74]:
# Setting up the parser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')

In [75]:
# Extract the title
news_title=slide_elem.find('div',class_="content_title").get_text()
news_title

"Mars InSight Lander to Push on Top of the 'Mole'"

In [76]:
# Extract the summary
news_summary=slide_elem.find('div',class_="article_teaser_body").get_text()
news_summary

'Engineers have a plan for pushing down on the heat probe, which has been stuck at the Martian surface for a year.'

# Image Scraping

In [77]:
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [78]:
full_image_tag=browser.find_by_tag('button')[1]
full_image_tag.click()

In [79]:
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

In [80]:
# Extract the Rel URL
img_rel_url=img_soup.find('img',class_="fancybox-image")#.get('src')
img_rel_url

<img alt="" class="fancybox-image" src="image/featured/mars2.jpg"/>

In [81]:
# Use base URL to create absolute URL
abs_url=f"https://spaceimages-mars.com/{img_rel_url}"
abs_url

'https://spaceimages-mars.com/<img alt="" class="fancybox-image" src="image/featured/mars2.jpg"/>'

# Fact scraping

In [3]:
# Read HTML page to retrieve first table
df=pd.read_html("https://galaxyfacts-mars.com/")[0]
df.columns=["Description","Mars","Earth"]
df.set_index("Description", inplace=True)
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [7]:
# Convert df to HTML code to add to webpage
df.to_html().replace('class="dataframe"','class="table"')

'<table border="1" class="table">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>\n</

# D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

### Hemispheres

In [3]:
# 1. Use browser to visit the URL 
url = 'https://marshemispheres.com/'
browser.visit(url)

In [5]:
# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.

# Parse the code
multi_img_soup=soup(browser.html,'html.parser')

# Get list of first URLs
img_group=multi_img_soup.find("div", class_="collapsible results")
img_list=img_group.find_all("div",class_="description")

# Loop through list
for item in img_list:
    img_url=item.find("a").get("href")
    browser.links.find_by_partial_href(img_url)[1].click()
    ind_img_soup=soup(browser.html,'html.parser')
    img_url=ind_img_soup.find("div",class_="downloads").find("a",text="Sample").get("href")
    hemisphere_image_urls.append({"img_url":f"{url}{img_url}","title":item.find("h3").text})
    browser.back()

In [6]:
# 4. Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [88]:
# 5. Quit the browser
browser.quit()