# DROM B9122
# Homework Assignment 2
Yumeng Du, yd2662  
Oct 4, 2023

In [1]:
# Destination for the generated .txt files; the file-writing cells below use this
# value as the prefix of every output file name.
# Set it to your own directory, including the trailing path separator (e.g. "output/"),
# or leave it empty to create the txt files in the current working directory.
output_path = ""

## Question 1 (80 points)
Write web crawlers for the following two tasks:

1. Extract at least 10 United Nations press releases containing the word “crisis”. Start with the following seed url: 
<a>https://press.un.org/en</a>.
Notice how press release pages have the “PRESS RELEASE” relative link in the top left corner. Here is an example press release: 
<a>https://press.un.org/en/2023/sc15431.doc.htm </a>
where the “PRESS RELEASE” has the following relative anchor tag:  
<span style="font-family:Courier New;">\<a href="/en/press-release" hreflang="en">Press Release\</a></span>  
Use this information to determine whether the web page is a press release.

In [2]:
# initialization
import os
import urllib.request

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

# define functions
def check_crisis(childpage):
    '''
    Decide whether a UN web page is a press release that mentions the word "crisis".

    Input:
        * childpage: bs4.BeautifulSoup, parsed HTML of the page to examine
    Output:
        * True  -> press release whose headline or body contains "crisis" (case-insensitive)
        * False -> press release without the word, or whose headline/body cannot be located
        * None  -> the page is not a press release
    ---------------------------------------------------------------------------------
    A page is treated as a press release when it contains the anchor
    <a href="/en/press-release" hreflang="en">Press Release</a>.
    Only the headline and the body of the article are searched, e.g. a sidebar link to
    another article titled "crisis" does not make the page a crisis article.
        * the headline is the tag 'h1' with class="page-header"
        * the body is the tag 'div' with
          class="field field--name-body field--type-text-with-summary field--label-hidden field__item"

    Side effect: when headline and body are both found, the module-level globals
    `header` and `body` are overwritten with their text; examine_links reads them
    right after calling this function.
    '''
    global header, body

    # `string=` is the current name of the filter formerly spelled `text=`
    press_release_link = childpage.find('a', href="/en/press-release",
                                        hreflang="en",
                                        string='Press Release')
    if press_release_link is None:
        return None  # not a press release

    headline_tag = childpage.find('h1', {'class': "page-header"})
    body_tag = childpage.find('div', {'class': "field field--name-body field--type-text-with-summary field--label-hidden field__item"})
    if headline_tag is None or body_tag is None:
        # unexpected layout: report "no match" instead of failing with AttributeError
        return False

    header = headline_tag.get_text()
    body = body_tag.get_text()
    return ("crisis" in header.lower()) or ("crisis" in body.lower())

def examine_links(results_page_a, seen_a, crisis_release_a, check_fun, timeout=30):
    '''
    Follow every not-yet-seen link of a page and collect the pages accepted by check_fun.

    Input:
        * results_page_a: bs4.BeautifulSoup, the webpage whose links are examined
        * seen_a: list collecting the urls already examined (extended in place)
        * crisis_release_a: list collecting [header, url, body, parsed page] for every
          page that satisfies the criteria defined by check_fun (extended in place)
        * check_fun: function taking the parsed child page and returning a truthy value
          when the page satisfies the criteria
        * timeout: seconds to wait for each child page before giving up on it
    Output:
        * updated (seen_a, crisis_release_a)
    ---------------------------------------------------------------------------------
    Only links found in tags 'a' with an href and hreflang="en" are examined, and only
    when the resolved url contains the module-level `seed_url`.
    The stored header/body are read from the module-level globals `header` and `body`,
    which check_fun is expected to set when it accepts a page.
    A child page that cannot be downloaded is reported and skipped; it stays in seen_a.
    '''
    for tag in tqdm.tqdm(results_page_a.find_all('a', href=True, hreflang="en")):  # find tags with links
        # resolve the (possibly relative) link against the seed url
        childUrl = urllib.parse.urljoin(seed_url, tag['href'])

        # only examine urls under the seed url that we haven't examined yet
        if seed_url not in childUrl or childUrl in seen_a:
            continue
        seen_a.append(childUrl)

        # download the child page; without a timeout a stalled server blocks the crawl forever
        try:
            response = requests.get(childUrl, timeout=timeout)
        except requests.RequestException as err:
            print(f"Skipping {childUrl}: {err}")
            continue

        childpage = BeautifulSoup(response.content)
        # check and put it into crisis_release_a if it satisfies the criteria
        if check_fun(childpage):
            crisis_release_a.append([header, childUrl, body, childpage])
    return seen_a, crisis_release_a

In [3]:
seed_url = "https://press.un.org/en"
response = requests.get(seed_url)
print("Success" if response.status_code == 200 else "Failure")

results_page = BeautifulSoup(response.content)

# check whether the pages linked from our seed url are press releases with word "crisis"
seen = [seed_url]    # urls seen so far
crisis_release = []  # [header, url, body, parsed page] of every match

seen, crisis_release = examine_links(results_page, seen, crisis_release, check_crisis)

# locate the "Latest Press Releases" listing, where more press releases can be found
morelinks = results_page.find_all('div', {'class': 'more-link'})
press_r_latest = [x for x in morelinks if x.get_text() == "Latest Press Releases"][0].find("a")['href']
# urljoin leaves an absolute href untouched and resolves a relative one against the seed
press_r_latest = urllib.parse.urljoin(seed_url, press_r_latest)

# walk through the listing pages until we have at least 10 press releases with word "crisis"
if len(crisis_release) < 10:
    print(f"Only get {len(crisis_release)} press release with word 'crisis' using the crrent seed url\n{seed_url}\nwill go to {press_r_latest}")

    new_url = press_r_latest
    while len(crisis_release) < 10:
        print("\n\nexamining:" + new_url)
        response = requests.get(new_url)
        if response.status_code != 200:
            print("Failure to connect")
            break

        results_page = BeautifulSoup(response.content)
        seen, crisis_release = examine_links(results_page, seen, crisis_release, check_crisis)
        print(f'{len(crisis_release)} crisis news found\nHave already examined {len(seen)} pieces of news')

        # find the button for the next page; stop when the listing has no further page
        next_link = results_page.find('a', {'title': 'Go to next page',
                                            'rel': "next",
                                            'class': "page-link"})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            print("No further page to explore")
            break
        new_url = urllib.parse.urljoin(press_r_latest, next_href)

# last expression of the cell: show the collected releases as a table
pd.DataFrame(crisis_release, columns=['header', 'url', 'body', 'html source code'])


Success


100%|██████████| 20/20 [00:02<00:00,  8.54it/s]


Only get 2 press release with word 'crisis' using the crrent seed url
https://press.un.org/en
will go to https://press.un.org/en/content/press-release


examining:https://press.un.org/en/content/press-release


100%|██████████| 11/11 [00:00<00:00, 66289.29it/s]


2 crisis news found
Have already examined 21 pieces of news


examining:https://press.un.org/en/content/press-release?page=1


100%|██████████| 11/11 [00:01<00:00,  9.08it/s]


3 crisis news found
Have already examined 31 pieces of news


examining:https://press.un.org/en/content/press-release?page=2


100%|██████████| 11/11 [00:01<00:00,  8.92it/s]


3 crisis news found
Have already examined 41 pieces of news


examining:https://press.un.org/en/content/press-release?page=3


100%|██████████| 11/11 [00:01<00:00,  9.06it/s]


4 crisis news found
Have already examined 51 pieces of news


examining:https://press.un.org/en/content/press-release?page=4


100%|██████████| 11/11 [00:01<00:00,  9.37it/s]


6 crisis news found
Have already examined 61 pieces of news


examining:https://press.un.org/en/content/press-release?page=5


100%|██████████| 11/11 [00:01<00:00,  8.68it/s]


11 crisis news found
Have already examined 71 pieces of news


Unnamed: 0,header,url,body,html source code
0,‘Outrageous a Person Dies of Hunger Every Few ...,https://press.un.org/en/2023/sgsm21980.doc.htm,Following is UN Secretary-General António Gute...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
1,"Marking International Day, Secretary-General S...",https://press.un.org/en/2023/sgsm21978.doc.htm,Following is UN Secretary-General António Gute...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
2,"Stressing ‘1.5°C Limit Is Possible’, Secretary...",https://press.un.org/en/2023/sgsm21967.doc.htm,Following is the text of UN Secretary-General ...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
3,"‘Let’s Unite, Fight Together to Keep Promise o...",https://press.un.org/en/2023/dsgsm1877.doc.htm,Following are UN Deputy Secretary-General Amin...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
4,"‘Small Island States Do Not Lack Ambition, The...",https://press.un.org/en/2023/sgsm21959.doc.htm,Following is the text of UN Secretary-General ...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
5,"Welcoming Pact, Secretary-General Tells Summit...",https://press.un.org/en/2023/sgsm21956.doc.htm,Following are UN Secretary-General António Gut...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
6,United Nations Charter ‘A How-To Manual’ on Co...,https://press.un.org/en/2023/sgsm21952.doc.htm,Following are UN Secretary-General António Gut...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
7,Global Leaders Must Take Action to Reduce Emis...,https://press.un.org/en/2023/sgsm21951.doc.htm,Following are UN Secretary-General António Gut...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
8,"Calling for Creative, Practical Financing Solu...",https://press.un.org/en/2023/sgsm21950.doc.htm,Following are UN Secretary-General António Gut...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
9,"World Needs ‘Statesmanship, Not Gamesmanship a...",https://press.un.org/en/2023/sgsm21947.doc.htm,Following is UN Secretary-General António Gute...,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."


In [4]:
# write press release with word "crisis" html source code into txt file
ii=0
while ii<10:
    f= open(f"{output_path}1_{ii+1}.txt","w+")
    f.write(str(crisis_release[ii][3]))
    f.close()
    ii+=1

2. Crawl the press room of the European Parliament and extract at least 10 press releases that cover the plenary sessions and contain the word “crisis”. Start with the following seed url:
https://www.europarl.europa.eu/news/en/press-room  
Notice how press releases related to plenary sessions contain the text “PLENARY SESSIONS” with the following html:  
<span style="font-family:Courier New;">\<span class="ep_name">Plenary session\</span></span>    
Here is an example:
https://www.europarl.europa.eu/news/en/press-room/20220620IPR33417/national-recovery-plans-meps-assess--the-performance-of-crisis-funding

In [5]:
def check_crisis(childpage):
    '''
    Decide whether a European Parliament page is a plenary-session press release that
    mentions the word "crisis".

    Input:
        * childpage: bs4.BeautifulSoup, parsed HTML of the page to examine
    Output:
        * True  -> plenary-session release whose headline or body contains "crisis" (case-insensitive)
        * False -> plenary-session release without the word, or whose headline/body cannot be located
        * None  -> the page does not cover a plenary session
    ---------------------------------------------------------------------------------
    A page is treated as covering plenary sessions when it contains the tag
    <span class="ep_name">Plenary session</span>.
    Only the headline and the body of the article are searched, e.g. a sidebar link to
    another article titled "crisis" does not make the page a crisis article.
        * the headline is the 'span' with class="ep_name" inside the tag 'h1' with class="ep_title"
        * the body is the second tag 'div' with class="ep_gridcolumn" and data-view1200="6"
          (the first such div holds the title) // no better method identified

    Side effect: when headline and body are both found, the module-level globals
    `header` and `body` are overwritten with their text; detect_crisis_one_page reads them.
    '''
    global header, body

    # `string=` is the current name of the filter formerly spelled `text=`
    if childpage.find('span', class_="ep_name", string='Plenary session') is None:
        return None  # not a plenary-session press release

    title_tag = childpage.find('h1', {'class': "ep_title"})
    # the attribute filter must be a dict; the original passed the set {'class', 'ep_name'}
    title_span = title_tag.find('span', {'class': 'ep_name'}) if title_tag is not None else None
    columns = childpage.find_all('div', {'class': "ep_gridcolumn",
                                         'data-view1200': "6"})
    if title_span is None or len(columns) < 2:
        # unexpected layout: report "no match" instead of failing with AttributeError/IndexError
        return False

    header = title_span.get_text()
    body = columns[1].get_text()
    return ("crisis" in header.lower()) or ("crisis" in body.lower())

In [6]:
def detect_crisis_more_page(seen_a, crisis_release_a):
    '''
    Re-read the page currently held by the web driver and examine its links.

    Input:
        * seen_a: list collecting url already examined
        * crisis_release_a: list collecting the releases accepted by check_crisis
    Output:
        * updated (seen_a, crisis_release_a)
    ---------------------------------------------------------------------------------
    The current html of the module-level `driver` is parsed with lxml, stored in the
    module-level global `more_loads`, and handed to detect_crisis_one_page together
    with check_crisis as the criteria.
    '''
    global more_loads
    more_loads = BeautifulSoup(driver.page_source, 'lxml')
    return detect_crisis_one_page(more_loads, seen_a, crisis_release_a, check_crisis)

def detect_crisis_one_page(results_page_a, seen_a, crisis_release_a, check_fun, timeout=30):
    '''
    Follow the "Read more" link of every article teaser on a page and collect the
    pages accepted by check_fun.

    Input:
        * results_page_a: bs4.BeautifulSoup, the webpage with its links to examine
        * seen_a: list collecting url already examined (extended in place)
        * crisis_release_a: list collecting [header, url, body, parsed page] for every
          page that satisfies the criteria defined by check_fun (extended in place)
        * check_fun: function taking the parsed child page and returning a truthy value
          when the page satisfies the criteria
        * timeout: seconds to wait for each child page before giving up on it
    Output:
        * updated (seen_a, crisis_release_a)
    ---------------------------------------------------------------------------------
    The teasers are the tags 'article' with class="ep_gridcolumn ep-m_product ep-layout_linkmode".
    A link is only followed when the resolved url contains the module-level `seed_url`.
    The stored header/body are read from the module-level globals `header` and `body`,
    which check_fun is expected to set when it accepts a page.
    A child page that cannot be downloaded is reported and skipped; it stays in seen_a.
    '''
    for tag in tqdm.tqdm(results_page_a.find_all('article', {'class': 'ep_gridcolumn ep-m_product ep-layout_linkmode'})):  # find tags with links
        link = tag.find("a", {'href': True, 'title': "Read more"})
        if link is None:
            continue  # teaser without a "Read more" link: nothing to follow
        childUrl = urllib.parse.urljoin(seed_url, link['href'])

        # only examine urls under the seed url that we haven't examined yet
        if seed_url not in childUrl or childUrl in seen_a:
            continue
        seen_a.append(childUrl)

        # download the child page; without a timeout a stalled server blocks the crawl forever
        try:
            response = requests.get(childUrl, timeout=timeout)
        except requests.RequestException as err:
            print(f"Skipping {childUrl}: {err}")
            continue

        childpage = BeautifulSoup(response.content)
        # use the criteria passed by the caller (the original ignored check_fun)
        if check_fun(childpage):
            crisis_release_a.append([header, childUrl, body, childpage])
    return seen_a, crisis_release_a

In [7]:
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import urllib.request
import tqdm


seed_url = "https://www.europarl.europa.eu/news/en/press-room"
# This code is written for Google Chrome only, please make sure you have it installed
driver = webdriver.Chrome()
# A browser window showing the webpage for seed_url pops up.
# IMPORTANT: Please do NOT close that window while this cell is running;
# it is closed automatically once the crawl has finished (or failed).
try:
    driver.get(seed_url)

    seen = [seed_url]    # urls seen so far
    crisis_release = []  # [header, url, body, parsed page] of every match

    # get source code
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')

    # check whether seed_url have desired releases
    seen, crisis_release = detect_crisis_one_page(soup, seen, crisis_release, check_crisis)

    ii = 0
    while len(crisis_release) < 10:  # continue the process until we find at least 10 desired releases
        print(f'\n\nloading more *{ii+1}')
        # find the "load more" button on the page
        element = driver.find_element("id", 'continuesLoading_button')
        articles_before = len(driver.find_elements("tag name", "article"))
        # click the "load more" button through JavaScript
        driver.execute_script("arguments[0].click();", element)
        # wait until new articles show up in the page (at most 10 s) instead of a fixed
        # 0.5 s pause, which was sometimes too short and wasted a whole round
        deadline = time.monotonic() + 10
        while time.monotonic() < deadline:
            time.sleep(0.5)
            if len(driver.find_elements("tag name", "article")) > articles_before:
                break
        # perform check
        seen, crisis_release = detect_crisis_more_page(seen, crisis_release)

        print(f'found {len(crisis_release)} crisis related press release\nAlready examine {len(seen)} pages')
        ii += 1
finally:
    # always release the browser process, even when the crawl fails half-way
    driver.quit()

100%|██████████| 15/15 [00:08<00:00,  1.77it/s]




loading more *1


100%|██████████| 30/30 [00:12<00:00,  2.31it/s]


found 2 crisis related press release
Already examine 31 pages


loading more *2


100%|██████████| 45/45 [00:14<00:00,  3.08it/s]


found 3 crisis related press release
Already examine 46 pages


loading more *3


100%|██████████| 60/60 [00:14<00:00,  4.24it/s]


found 5 crisis related press release
Already examine 61 pages


loading more *4


100%|██████████| 75/75 [00:13<00:00,  5.46it/s]


found 5 crisis related press release
Already examine 76 pages


loading more *5


100%|██████████| 75/75 [00:00<00:00, 71705.68it/s]


found 5 crisis related press release
Already examine 76 pages


loading more *6


100%|██████████| 105/105 [00:14<00:00,  7.30it/s]


found 6 crisis related press release
Already examine 91 pages


loading more *7


100%|██████████| 120/120 [00:14<00:00,  8.31it/s] 


found 6 crisis related press release
Already examine 106 pages


loading more *8


100%|██████████| 135/135 [00:13<00:00,  9.77it/s] 


found 8 crisis related press release
Already examine 121 pages


loading more *9


100%|██████████| 150/150 [00:13<00:00, 11.11it/s] 


found 9 crisis related press release
Already examine 136 pages


loading more *10


100%|██████████| 165/165 [00:14<00:00, 11.72it/s] 

found 10 crisis related press release
Already examine 151 pages





In [8]:
import pandas as pd
# Show the collected European Parliament releases as a table; every entry of
# crisis_release is a list [header, url, body, parsed page], matching the four columns.
pd.DataFrame(crisis_release,columns=['header','url','body','html source code'])

Unnamed: 0,header,url,body,html source code
0,Nagorno-Karabakh: MEPs demand review of EU rel...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\nParliament says current situ...,"[html, [\n, [\n, <title>Nagorno-Karabakh: MEPs..."
1,Parliament argues for a top-up to multi-annual...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\nRevision of EU multiannual f...,"[html, [\n, [\n, <title>Parliament argues for ..."
2,Reduce demand and protect people in prostituti...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\nDifferent regulations across...,"[html, [\n, [\n, <title>Reduce demand and prot..."
3,Svietlana Tsikhanouskaya to MEPs: support Bela...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\n\n\n\n \n \n \n \n\n\n\n\n\n...,"[html, [\n, [\n, <title>Svietlana Tsikhanouska..."
4,MEPs vote to strengthen EU defence industry th...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\nA €300 million budget until ...,"[html, [\n, [\n, <title>MEPs vote to strengthe..."
5,COVID-19: Parliament adopts roadmap to better ...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\nEvaluation of the effectiven...,"[html, [\n, [\n, <title>COVID-19: Parliament a..."
6,MEPs want to create a European Day for the vic...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\nParliament calls for an annual ‘EU...,"[html, [\n, [\n, <title>MEPs want to create a ..."
7,EP TODAY,https://www.europarl.europa.eu/news/en/press-r...,"\n\n\n\n\n\nThursday, 15 June\n\n\n\n\n\n\nPar...","[html, [\n, [\n, <title>EP TODAY | News | Euro..."
8,President Christodoulides: “no border changes ...,https://www.europarl.europa.eu/news/en/press-r...,\n\n\n\n\n\n\n\n\n\n\n\n \n \n \n \n\n\n\n\n\n...,"[html, [\n, [\n, <title>President Christodouli..."
9,EP Today,https://www.europarl.europa.eu/news/en/press-r...,"\n\n\n\n\n\nMonday, 12 June\n\n\n\n\n\n\nLast-...","[html, [\n, [\n, <title>EP Today | News | Eur..."


In [9]:
# output html sources code into txt file
ii=0
while ii<10:
    f= open(f"{output_path}2_{ii+1}.txt","w+")
    f.write(str(crisis_release[ii][3]))
    f.close()
    ii+=1

## Question 2
Create a Git repository on the GitHub platform. Name the repository <span style="font-family:Courier New;">b9122_homework2</span>, make it publicly available and perform the following:
* Populate the repository with the webcrawler code that we covered in class and the webcrawler code files that you created in Question 1.
* Create a <span style="font-family:Courier New;">README.md</span> file where you will provide information about the repository, including author information and a description of the code files
* Make changes to at least one of the added files (whatever changes you prefer). 
* Update the repository with the edited file/s.
* For those of you who will be doing the interaction with the github repository using git commands, perform the following:
    * The <span style="font-family:Courier New;">git log</span> command displays the commit logs. Use output redirection (“>”) to store the output of this command in a file named <span style="font-family:Courier New;">gitlog.txt</span>. Submit the <span style="font-family:Courier New;">gitlog.txt</span> file and the <span style="font-family:Courier New;">url</span> of your repository.
* For those of you who will be using the GitHub Desktop application, perform the following:
    * The “History” tab displays the repository activities. Open this tab and take a screenshot.
    Submit the screenshot image and the url of your repository.

**Answer**  
https://github.com/EthelYMDu/b9122_homework2

<img src="gitdesktop.png">