# Library of Sites

In [None]:
#implement library here

# Library of Key Terms

In [None]:
#implement library here

# Webscraper

In [None]:
# general 

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import unicodedata


class Article:
    """Simple record describing one scraped article.

    The constructor arguments are stored unchanged as attributes:
        title:  the article's title.
        link:   the article's link.
        author: the article's author; defaults to None when not given.
    """

    def __init__(self, title, link, author=None):
        self.title, self.link, self.author = title, link, author


# Default search terms used when filtering scraped article titles.
keywords = [
    "ocean", "polar", "electric", "nature", "iceberg",
    "biodiversity", "green", "warm", "biology", "plant",
    "living", "carbon", "coronavirus",
]

In [None]:
#Primary Webscraping Functions



def normalize(title):
    """Return `title` in the canonical form used for comparisons.

    For now the canonical form is simply the lowercased text, so that
    strings such as "polar" and "Polar" compare as equal. Further steps
    (e.g. whitespace removal) can be added here later if needed.
    """
    lowered = title.lower()
    return lowered




def filter_for_keywords(articles, keywords):
    """Return the articles whose titles contain at least one keyword.

    Matching is a substring test performed on normalized text (see
    `normalize`), so "polar" and "Polar" are treated as the same.

    Args:
        articles: a dictionary of {"article title": "article url"}.
        keywords: an iterable (typically a list) of keyword strings.

    Returns:
        A new dictionary holding only the matching {title: url} pairs.
        Keys are the original titles, not their normalized versions.
    """
    # Normalize the keywords a single time, before looking at any title.
    # Doing it up front avoids redoing the same work for every article and
    # also materializes one-shot iterables (e.g. generators), which would
    # otherwise be exhausted after the first title was checked.
    normalized_keywords = [normalize(keyword) for keyword in keywords]

    filtered_articles = {}
    for title, url in articles.items():
        # normalize each title once, not once per keyword
        normalized_title = normalize(title)

        # any() stops at the first keyword that matches
        if any(keyword in normalized_title for keyword in normalized_keywords):
            # store the actual title, not the normalized one
            filtered_articles[title] = url

    return filtered_articles


def get_scraped_dictionary_from_bbc():
    """Scrape BBC science & environment headlines into a dictionary.

    Modified bbc web scraper code from the colab, returning a dictionary
    instead of a list so it can provide test values for the filtering
    function.

    Returns:
        A dictionary of {"article headline": "absolute article url"}.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched (including when the request times out).
    """
    # Imported locally so this function does not rely on the import cell.
    from urllib.parse import urljoin

    website = "https://www.bbc.com/news/science_and_environment"
    # pattern is a regular expression for something that the links we
    # actually want will contain
    pattern = re.compile(r"/news/science-environment")

    # grab html from website; the timeout keeps a stalled connection from
    # hanging the notebook forever
    html = requests.get(website, timeout=10).text

    # parse out the link tags (<a></a>) from the html
    soup = BeautifulSoup(html, features="html.parser")
    all_link_tags = soup.find_all("a", href=pattern)

    articles = {}
    for tag in all_link_tags:
        # article headline is inside <h3> tag inside <a> tag; links that
        # are not article headlines have no <h3> inside, so ignore them
        headline_tag = tag.find("h3")
        if headline_tag is None:
            continue

        # get_text() returns a plain str covering the whole <h3>, even when
        # the headline is wrapped in nested tags; an empty <h3> is skipped
        headline = headline_tag.get_text(" ", strip=True)
        if not headline:
            continue

        # href may be site-relative ("/news/...") or already absolute;
        # urljoin yields a correct absolute url in both cases
        articles[headline] = urljoin(website, tag["href"])

    return articles


def get_scraped_dictionary_from_mlive():
    """Scrape article links from the mlive.com front page.

    Returns:
        A dictionary of {"article title": "article url"} built from the
        <a> tags whose data-ga-content-type attribute is "article". The
        url is the tag's href value, stored as found in the page.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched (including when the request times out).
    """
    website = "https://www.mlive.com/"

    # grab html from website; the timeout keeps a stalled connection from
    # hanging the notebook forever
    html = requests.get(website, timeout=10).text

    # parse out the link tags (<a></a>) from the html
    # only select tags with attribute data-ga-content-type = article
    soup = BeautifulSoup(html, features="html.parser")
    all_link_tags = soup.find_all("a", {"data-ga-content-type": "article"})

    # create dictionary mapping article titles to article links
    articles = {}
    for tag in all_link_tags:
        # ignore tags without a usable title (tag.string is None) or
        # without a link to store
        if tag.string is None or not tag.has_attr("href"):
            continue

        # str() gives a plain string key instead of a NavigableString,
        # which would keep a reference to the whole parse tree
        articles[str(tag.string)] = tag["href"]

    return articles


def get_scraped_dictionary_from_detroit_news():
    """Scrape article links from the Detroit News news page.

    Returns:
        A dictionary of {"article title": "absolute article url"}. Titles
        come from each link's data-c-br attribute, NFKD-normalized.
        A dictionary is always returned; links missing a required
        attribute are skipped.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched (including when the request times out).
    """
    # Imported locally so this function does not rely on the import cell.
    from urllib.parse import urljoin

    website = 'https://www.detroitnews.com/news/'

    # get html from website; the timeout keeps a stalled connection from
    # hanging the notebook forever
    html = requests.get(website, timeout=10).text

    # parse html to find the links
    soup = BeautifulSoup(html, features="html.parser")
    all_link_tags = soup.find_all("a", class_="gnt_m_flm_a")

    articles = {}
    for tag in all_link_tags:
        # only keep links whose sole css class is the one searched for
        if len(tag["class"]) > 1:
            continue

        # skip (rather than abort on) links lacking the url or the title
        if not (tag.has_attr("href") and tag.has_attr("data-c-br")):
            continue

        title = unicodedata.normalize("NFKD", tag["data-c-br"])
        # href may be site-relative or already absolute; urljoin handles both
        articles[title] = urljoin(website, tag["href"])

    return articles








In [None]:
# Attempt at creating a generalized scraper (common set of functions for all websites)


def createSoupObject(url, pageType):
    """Fetch `url` and return its content parsed as a soup object.

    Args:
        url: address of the page to download.
        pageType: "html" or "xml"; selects the parser that is used.

    Returns:
        A BeautifulSoup object for the downloaded text, or False when
        pageType is neither "html" nor "xml". A non-200 response only
        prints a warning; the response body is still parsed and returned.

    Raises:
        requests.exceptions.RequestException: if the request itself fails
            (including when it times out).
    """
    # Validate pageType first so an invalid call never touches the network.
    if pageType == "html":
        parser = "html.parser"
    elif pageType == "xml":
        parser = "xml"
    else:
        print("Invalid pageType: please select either html or xml")
        return False

    # the timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print('Failed to access ' + url)

    return BeautifulSoup(response.text, features=parser)

def extractAllWithTag(soup, tag, attrs, urlPattern=None):
    """Search a soup object for every element with a given tag and attributes.

    Args:
        soup: object to search; its find_all method is called.
        tag: tag name to look for (e.g. "a").
        attrs: dictionary of attribute filters passed on to find_all.
        urlPattern: optional regular expression; when given, it is compiled
            and passed to find_all as an additional href filter.

    Returns:
        Whatever soup.find_all returns for these filters.
    """
    extra_filters = {}
    # compare against None by identity so an empty pattern still counts
    if urlPattern is not None:
        extra_filters["href"] = re.compile(urlPattern)
    return soup.find_all(tag, attrs, **extra_filters)



In [None]:
#Run and test code in this section


# Scrape the mlive front page, then print only the articles whose titles
# contain at least one of the entries in `keywords`.
articles = get_scraped_dictionary_from_mlive() 
print(filter_for_keywords(articles, keywords))

# Disabled duplicate of the run above, kept for quick re-enabling.
#articles = get_scraped_dictionary_from_mlive()
# print(filter_for_keywords(articles, keywords))

# Disabled: the same mlive tag lookup written with the generalized helpers.
#soup = createSoupObject("https://www.mlive.com/", "html")
#tags = extractAllWithTag(soup, "a", {"data-ga-content-type":"article"})

<!DOCTYPE html><html lang="en"><head><script async="" src="https://polyfill.io/v3/polyfill.min.js"></script><link id="fusion-template-styles" rel="stylesheet" type="text/css" href="/pf/dist/components/combinations/default.css?d=469"/><link rel="stylesheet" href="/pf/resources/dist/mlive/css/style.css?d=469"/><meta name="pageType" value="homepage"/><script type="application/javascript">if(!Array.prototype.includes||!(window.Object && window.Object.assign)||!window.Promise||!window.Symbol||!window.fetch){document.write('<script type="application/javascript" src="/pf/dist/engine/polyfill.js?d=469" defer=""><\/script>')}</script><script id="fusion-engine-script" type="application/javascript" src="/pf/dist/engine/react.js?d=469" defer=""></script><script id="fusion-engine-script" type="application/javascript" src="/pf/dist/components/combinations/default.js?d=469" defer=""></script><link rel="icon" type="image/x-icon" href="/pf/resources/images/mlive/favicon.ico?d=469"/><link rel="manifest"

# Collecting Images

In [None]:
#put code here

# Identifying Tone

In [None]:
#put code here

# Article Location

In [None]:
#Put code here

# Website Layout

In [None]:
#put code here