In [57]:
# import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import bs4
import os
import re # regex to extract hrefs and names
from urllib.parse import urljoin # for joining url segments

In [17]:
# Base address of the site being scraped; page paths are joined onto it with urljoin.
core_url = "https://www.mgmt.ucl.ac.uk"

# Fetch the /study page. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() surfaces HTTP errors (404, 500, ...)
# here instead of letting an error page be parsed as if it were real content.
response = requests.get(urljoin(core_url, '/study'), timeout=30)
response.raise_for_status()

# Parse the returned HTML so later cells can query it.
soup = BeautifulSoup(response.text, 'html.parser')

In [18]:
# locate the unordered list carrying the 'menu nav' class in the parsed HTML,
# then collect every list item found inside it
nav_menu = soup.find('ul', class_='menu nav')
study_pages = nav_menu.find_all('li')

In [19]:
# turn each list item into its string form so it can be searched as plain text
study_pages = list(map(str, study_pages))

In [20]:
# keywords that mark a menu entry as belonging to a degree programme
degree_types = ['BSc', 'MSc', 'MRes', 'MBA', 'PhD']

# keep only the menu entries whose text mentions at least one degree type
degree_pages = list(
    filter(lambda page: any(word in page for word in degree_types), study_pages)
)

# patterns applied to each menu entry's string form
_HREF_PATTERN = re.compile(r'href="/\b[a-zA-Z-]+')  # href attribute up to the end of the first path segment
_NAME_PATTERN = re.compile(r'">[A-Za-z/ ]+')        # text that follows the first '">' in the entry


def _url_ending(page):
    """Return the url ending (e.g. '/imb') found in one menu entry string."""
    matched = _HREF_PATTERN.search(page).group()
    return matched.replace('href="', '')


def _programme_name(page):
    """Return the programme name found in one menu entry string."""
    matched = _NAME_PATTERN.search(page).group()
    return matched.replace('">', '')


# map each degree's url ending to its programme name
degree_pages_url = dict(
    (_url_ending(page), _programme_name(page))
    for page in degree_pages
)

In [21]:
# url endings (the dictionary keys)
print([*degree_pages_url])

# page names (the dictionary values)
print([*degree_pages_url.values()])

['/imb', '/management-science', '/business-analytics', '/entrepreneurship', '/finance', '/management', '/ucl-mba', '/mba-peking-university', '/phd-financial-economics', '/phd']
['BSc Information Management For Business', 'BSc/MSci Management Science', 'MSc Business Analytics', 'MSc Entrepreneurship', 'MSc Finance', 'MSc Management', 'The UCL MBA', 'The UCL MBA with Peking University', 'MRes and PhD in Financial Economics', 'MRes and PhD in Management']


In [38]:
# Create the nested 'test/within' folder. Unlike os.mkdir, os.makedirs also creates
# the missing 'test' parent, and exist_ok=True makes the cell safe to re-run.
os.makedirs('test/within', exist_ok=True)

In [66]:
# Save each degree's scraped text twice: once as a single corpus file, and once as
# one numbered file per paragraph ("context") inside a per-degree subfolder.
def _clean_paragraph(text):
    """Normalise whitespace in one scraped paragraph.

    Applied in order: remove tab characters, remove the single space that directly
    follows a newline, remove zero-width spaces (U+200B), then collapse each run of
    spaces into one space.
    """
    text = text.replace('\t', '')
    text = text.replace('\n ', '\n')
    text = text.replace('\u200b', '')
    return re.sub(' +', ' ', text)


# Make sure the output root exists before anything is written into it; without this
# the first open() below fails on a fresh checkout. exist_ok keeps re-runs safe.
os.makedirs('corpus', exist_ok=True)

# NOTE(review): `response` and `soup` are rebound inside this loop, so after this cell
# runs they no longer refer to the /study page fetched in the earlier cell.
for url_ending, degree_name in degree_pages_url.items():
    # fetch HTML; fail loudly on timeouts and HTTP errors instead of parsing an error page
    url = urljoin(core_url, url_ending)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.find('div', class_='field-item even') # this div contains the degree text content

    # soup.find returns None when the div is absent; skip that page with a message
    # rather than crashing on tags.select and aborting the remaining degrees
    if tags is None:
        print('Skipping ' + url + ': no "field-item even" div found')
        continue

    # create the file name, replacing any illegal / characters with -
    file_name = os.path.join('corpus', degree_name.replace('/', '-') + ' corpus' + '.txt')

    # extract paragraphs from website HTML: non-breaking spaces become ordinary spaces,
    # elements whose text is empty or a lone nbsp/newline are dropped, whitespace is cleaned
    paragraphs = [
        _clean_paragraph(tag.get_text(separator=' ').replace('\xa0', ' '))
        for tag in tags.select('p,ul,div.msi-faq-answer')
        if tag.get_text() not in ['\xa0', '', '\n']
    ]

    # save whole corpus, paragraphs separated by a blank line
    scraped_text = '\n\n'.join(paragraphs)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(scraped_text)

    # make folder to store contexts, named after the url ending without its slash
    context_folder = os.path.join('corpus', url_ending.replace('/', ''))
    os.makedirs(context_folder, exist_ok=True)

    # save each paragraph as a separate file named n.txt, numbered from 1
    for counter, para in enumerate(paragraphs, start=1):
        context_file_name = str(counter) + '.txt'
        with open(os.path.join(context_folder, context_file_name), 'w', encoding='utf-8') as f:
            f.write(para)