# Web Scraping Challenge

### By: Carlos Casio

In [1]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import requests
import pandas as pd
import time

In [2]:
# Executing empty browser with ChromeDriveManager
executable_path = {"executable_path": ChromeDriverManager().install()}
browser = Browser("chrome", **executable_path, headless=False)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280


 


[WDM] - Driver [C:\Users\Carlo\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


In [3]:
# Creating the dictionary to be used to store everything
main_dict = {}

## Nasa Mars News

In [4]:
# Visiting the first site and extracting the html code for the webpage. 
# The site was loading too fast so I added a match check for a list element, since all the news are listed on these classes.
# After it loads, we can continue
paragraph = ""
# Repeating process for the paragraph string because the first time it fails in load the complete code
while paragraph == "":
    browser.visit("https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest")
    browser.is_element_present_by_tag('LI', wait_time=10)
    html = browser.html
    soup = bs(html, "html.parser")

    # Storing First image, header and brief from the first News element.
    # We get the second [1] element for the header because the first one with the same class is not related to any News
    try:
        Header = soup.find_all("div",class_="content_title")[1].text.strip()
        paragraph =  soup.find("div",class_="article_teaser_body").text.strip()
        # For the image to work, the prefix is needed so we can join both strings
        mainpage = "mars.nasa.gov"
        img = mainpage + soup.find("div",class_="list_image").find("img")["src"]
    except:
        paragraph = ""

In [5]:
main_dict["news_header"] = Header
Header

'MOXIE Could Help Future Rockets Launch Off Mars'

In [6]:
main_dict["news_p"] = paragraph
paragraph

"NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel."

In [7]:
main_dict["news_img"] = img
img

'mars.nasa.gov/system/news_items/list_view_images/8805_1-MOXIE-PIA24176-320.gif'

## JPL Mars Space Images - Featured Image

In [8]:
# Visiting the second browser and storing the html code
browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
html = browser.html
soup = bs(html, "html.parser")

In [9]:
# Again, we need the main webpage to get the the complete url for the image working.
jpl_mainpage = "jpl.nasa.gov"

# We get the a tag with a unique id, and then we extract the first element containing the href link
featured_image = soup.find_all("a", id="full_image")
featured_image = jpl_mainpage + featured_image[0]["data-fancybox-href"]

In [10]:
main_dict["ft_img"] = featured_image
featured_image


'jpl.nasa.gov/spaceimages/images/mediumsize/PIA19046_ip.jpg'

## Mars Weather

In [11]:
# Visiting webpage and extracting html code, I use the method "is text present" to check if the "Insight sol" string is already
# there, meaning all the tweets have been loaded correctly, because I had troubles running the whole cell at once
latest_tweet = "None"
browser.visit("https://twitter.com/marswxreport?lang=en")
browser.execute_script("window.scrollTo(1, document.body.scrollHeight);")

# Checking if latest_tweet already has the content need, or retrying
while latest_tweet == "None":
    time.sleep(2)
    # Scrolling down a bit because the latest tweet with weather data is on the middle of the webpage. So the html code is not 
    # parsed for that tweet because it has not been loaded.
    browser.execute_script("window.scrollTo(1, document.body.scrollHeight);")
    html = browser.html
    soup = bs(html, "html.parser")
    # print(soup.prettify())

    # I create a list to iterate from all the tags and get the text of them
    article_list = []

    # Every tweet that contains text has a span tag with a class "css-901oao", so first I find all the tags with this specifications
    articles = soup.find_all("span", class_="css-901oao")

    # Then I loop to obtain the text of each class with that name
    for x in articles:
        article_list.append(x.text)

    for y in article_list:
        if y[:11] == "InSight sol":
            latest_tweet = y
            main_dict["last_tweet"] = latest_tweet
            print(latest_tweet)
            break
        else:
            latest_tweet = "None"



InSight sol 681 (2020-10-25) low -95.4ºC (-139.8ºF) high -4.4ºC (24.0ºF)
winds from the WNW at 5.6 m/s (12.6 mph) gusting to 18.6 m/s (41.6 mph)
pressure at 7.40 hPa


## Mars facts

In [12]:
url = "https://space-facts.com/mars/"
# Obtaining the first table from the website
table = pd.read_html(url)[0]
# passing the table to an HTML table
html_table = table.to_html()
# replace the \n text with nothing so it can be stored on the dictionary
html_table.replace("\n","")
main_dict["Html_table"] = html_table

## Mars Hemispheres

In [13]:
#  Visiting the webpage and getting the HTML code
browser.visit("https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")
html = browser.html
soup = bs(html, "html.parser")

In [14]:
# Getting all the links for the webpage
all_links = soup.find_all("a", class_="itemLink product-item")

# Since every hemisphere has two equal links (one for the image and one for the title), I iterate to get only the even results: 2, 4, 6 and 8
# On every iteration, I go into the browser and get all the results needed.

# Enumerating to get iteration number
for index, link in enumerate(all_links):
    # Checking if iteration is even
    if (index % 2) == 0:
        varnum = int(index / 2)
        # Entering the first link
        browser.visit("https://astrogeology.usgs.gov" + link["href"])
        
        # Saving the html code in soup
        html = browser.html
        soup = bs(html, "html.parser")
        
        # I use the prefix from the webpage and add it to the find method to get the source and title
        main_dict[f"image{varnum}"] = "astrogeology.usgs.gov" + soup.find("img", class_="wide-image")["src"]
        main_dict[f"title{varnum}"] = soup.find("h2", class_="title").text
#         image = "astrogeology.usgs.gov" + soup.find("img", class_="wide-image")["src"]
#         title = soup.find("h2", class_="title").text
        
        # Finally, the values are appended as a dictionary into the list
#       hemisphere_image_urls.append({"title":title,"img_url":image})
        browser.visit("https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")

In [15]:
# Checking the results
main_dict

{'news_header': 'MOXIE Could Help Future Rockets Launch Off Mars',
 'news_p': "NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel.",
 'news_img': 'mars.nasa.gov/system/news_items/list_view_images/8805_1-MOXIE-PIA24176-320.gif',
 'ft_img': 'jpl.nasa.gov/spaceimages/images/mediumsize/PIA19046_ip.jpg',
 'last_tweet': 'InSight sol 681 (2020-10-25) low -95.4ºC (-139.8ºF) high -4.4ºC (24.0ºF)\nwinds from the WNW at 5.6 m/s (12.6 mph) gusting to 18.6 m/s (41.6 mph)\npressure at 7.40 hPa',
 'Html_table': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr

In [16]:
# Exiting the browser
browser.quit()