# Web Scraping Challenge

### By: Carlos Casio

In [1]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import requests
import pandas as pd

In [None]:
# Start a visible (non-headless) Chrome window through splinter.
# The value returned by ChromeDriverManager().install() is handed over as the driver's executable_path.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser("chrome", headless=False, **executable_path)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\Carlo\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


## NASA Mars News

In [None]:
# Open the NASA Mars news listing and keep a parsed copy of its HTML.
# The HTML was being read before the news items were there, so after the visit we wait
# (up to 10 seconds) for an <li> tag to be present, since the news entries are list elements.
browser.visit(
    "https://mars.nasa.gov/news/"
    "?page=0&per_page=40"
    "&order=publish_date+desc%2Ccreated_at+desc"
    "&search=&category=19%2C165%2C184%2C204"
    "&blank_scope=Latest"
)
browser.is_element_present_by_tag("LI", wait_time=10)
html = browser.html
soup = bs(markup=html, features="html.parser")

In [None]:
# Store the headline, teaser text and thumbnail path of the first news element.
# The first "content_title" div on the page is not related to any news item,
# so the headline is taken from the second match ([1]).
Header = soup.find_all("div", class_="content_title")[1].text.strip()
paragraph = soup.find("div", class_="article_teaser_body").text.strip()

# The image src needs the site prefix joined in front of it to be usable.
# The prefix carries the scheme so that `mainpage + img` is an absolute URL
# instead of the scheme-less "mars.nasa.gov/...".
# NOTE(review): assumes the src is a site-relative path starting with "/" - confirm.
mainpage = "https://mars.nasa.gov"
img = soup.find("div", class_="list_image").find("img")["src"]

In [None]:
# Show the scraped headline text
Header

In [None]:
# Show the scraped teaser text
paragraph

In [None]:
# Show the site prefix joined with the thumbnail's src value
mainpage + img

## JPL Mars Space Images - Featured Image

In [None]:
# Load the second site (JPL space images, Mars category), parse its HTML
# and print the parsed document for inspection
browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
html = browser.html
soup = bs(markup=html, features="html.parser")
print(soup)

In [None]:
# Again, the site prefix is needed to turn the link into a complete, working image URL.
# It includes the scheme and the same "www" host that was visited, so the result is an
# absolute URL rather than the scheme-less "jpl.nasa.gov/...".
jpl_mainpage = "https://www.jpl.nasa.gov"

# The featured image link is the <a> tag with the unique id "full_image";
# its "data-fancybox-href" attribute holds the image path.
# NOTE(review): assumes that attribute is a site-relative path starting with "/" - confirm.
full_image_link = soup.find("a", id="full_image")
featured_image = jpl_mainpage + full_image_link["data-fancybox-href"]

In [None]:
# Show the assembled featured-image address
featured_image


## Mars Weather

In [None]:
# Open the Mars weather Twitter feed and parse its HTML.
# Running the whole cell at once was unreliable, so before reading the HTML we wait
# (up to 10 seconds) for the text "InSight sol" to be present, which signals that
# the tweets have been loaded.
browser.visit("https://twitter.com/marswxreport?lang=en")
browser.is_text_present(text="InSight sol", wait_time=10)
html = browser.html
soup = bs(markup=html, features="html.parser")
print(soup.prettify())

In [None]:
# Every tweet that contains text has a <span> with the class "css-901oao",
# so first collect all the tags matching that specification
articles = soup.find_all("span", class_="css-901oao")

# Then keep only the text of each of those tags, in page order
article_list = [span.text for span in articles]
article_list


In [None]:
# Only the latest weather tweet is needed, and weather tweets all start with the
# "InSight sol" string, so take the first text with that prefix.
# str.startswith replaces the hard-coded 11-character slice, and the None default
# keeps latest_tweet defined even when no text matched.
latest_tweet = next(
    (text for text in article_list if text.startswith("InSight sol")),
    None,
)
if latest_tweet is not None:
    print(latest_tweet)

## Mars Facts

In [None]:
# Load the Mars facts page, parse its HTML and print it in indented form
browser.visit("https://space-facts.com/mars/")
html = browser.html
soup = bs(markup=html, features="html.parser")
print(soup.prettify())

In [None]:
# Locate the facts table through its id attribute, then display it
table = soup.find(name="table", attrs={"id": "tablepress-p-mars"})
table

In [None]:
# Gather the text of every table cell tagged with the "column-1" class into a list
col1 = table.find_all("td", class_="column-1")
Description = [cell.text for cell in col1]

Description
    

In [None]:
# Same table, this time gathering the text of the cells tagged with the "column-2" class
col2 = table.find_all("td", class_="column-2")
Value = [cell.text for cell in col2]

Value

In [None]:
# Build a two-column DataFrame from the lists and write it out as an HTML table
# (index column omitted) so it can be used later
mars_table_df = pd.DataFrame(data=dict(Description=Description, Value=Value))
mars_table_df.to_html(buf="table.html", index=False)

## Mars Hemispheres

In [None]:
# Load the USGS search results for the enhanced Mars hemisphere images,
# parse the HTML and print it in indented form
browser.visit("https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars")
html = browser.html
soup = bs(markup=html, features="html.parser")
print(soup.prettify())

In [None]:
# Getting all the hemisphere links from the results page
all_links = soup.find_all("a", class_="itemLink product-item")

# Site prefix, including the scheme, so that both the visited pages and the stored
# image addresses are absolute URLs
usgs_mainpage = "https://astrogeology.usgs.gov"

# Every hemisphere has two equal links (one for the image and one for the title).
# De-duplicate by href, keeping page order, instead of relying on index parity.
hemisphere_paths = list(dict.fromkeys(link["href"] for link in all_links))

# List of {"title": ..., "img_url": ...} dictionaries, one per hemisphere
hemisphere_image_urls = []

for path in hemisphere_paths:
    # Open the hemisphere's own page and parse it. A separate name is used so the
    # results-page `soup` is not overwritten.
    browser.visit(usgs_mainpage + path)
    hemisphere_soup = bs(browser.html, "html.parser")

    # NOTE(review): assumes the wide image src is a site-relative path starting with "/" - confirm.
    image = usgs_mainpage + hemisphere_soup.find("img", class_="wide-image")["src"]
    title = hemisphere_soup.find("h2", class_="title").text

    # The links were all collected up front, so there is no need to navigate back to
    # the results page between iterations.
    hemisphere_image_urls.append({"title": title, "img_url": image})

In [None]:
# Checking the resulting list
hemisphere_image_urls

In [None]:
# Exiting the browser (the Chrome session opened by Browser(...) at the top of the notebook)
browser.quit()