# Citation Scrape
This notebook looks at obtaining the citations and their relations from UK Public General Acts (UKPGA).
<br><br>
This is achieved by loading a csv file containing all current acts under the category (as of 2022-06-27),
then using the id of each document to access its official document hosted on [legislation.gov.uk](https://www.legislation.gov.uk).
Each document is stored in an .xhtml file format that follows the LegalDocML schema. Therefore, scraping the hosted
webpage and searching for `a` tags that contain `class="LegCitation"` will get every citation found within the document.
<br><br>
We then label each citation with its relation to the current document. By looking at the
parent text surrounding the citation (the text before and after the citation), for the majority of citations, we can
identify the relationship. This is achieved by looking for relationship keywords within the text; where a match is found,
the citation is labelled with that relation.
<br><br>
Finally, all the cited documents and their citation relation graph are then stored in a csv file.

## Imports

In [None]:
import requests
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
from re import search, IGNORECASE

## Opening the files

### File Paths

In [None]:
# Placeholder values: replace each one with a real path before running the
# cells below.
# CSV loaded with pd.read_csv; its 'type', 'year' and 'number' columns are used.
LEGISLATION_LIST_FILE_PATH = '# ENTER LEGISLATION LIST FILE PATH HERE'
# UTF-8 text file read by get_labels() and split on newlines.
LABEL_FILE_PATH = '# ENTER LABEL FILE PATH HERE'
# CSV created by create_graph() and appended to by save_graph().
CITATION_NETWORK_FILE_PATH = '# ENTER CITATION NETWORK FILE PATH HERE'

### List of Legislation

In [None]:
# Load the acts to scrape; the CSV must provide 'type', 'year' and 'number'
# columns.
legislation_df = pd.read_csv(LEGISLATION_LIST_FILE_PATH)
# Materialise the (type, year, number) triples. A bare zip() is a one-shot
# iterator, so re-running the main cell would silently loop over nothing.
legislation_list = list(zip(
    legislation_df['type'],
    legislation_df['year'],
    legislation_df['number'],
))

### List of Keywords / Labels

In [None]:
def get_labels(path=None):
    """Read the relation keywords used to label citations.

    Args:
        path: Optional path of the label file. When omitted, the module-level
            LABEL_FILE_PATH is used.

    Returns:
        list[str]: The non-blank lines of the file, in file order.
    """
    if path is None:
        path = LABEL_FILE_PATH
    with open(path, encoding='utf-8') as f:
        lines = f.read().split('\n')
    # Skip blank lines (e.g. the one produced by a trailing newline): an empty
    # pattern matches every passage in re.search, which would label every
    # citation with ''.
    return [line for line in lines if line.strip()]

## Scraping

### Opening URL

In [None]:
def get_from_url(doc_id, timeout=30):
    """Download and parse the XHTML snippet of one legislation document.

    Args:
        doc_id: Document path of the form 'type/year/number', as built by the
            main loop (e.g. 'ukpga/2020/1').
        timeout: Seconds to wait for the server before requests gives up and
            raises. Defaults to 30.

    Returns:
        BeautifulSoup: The parsed response body.
    """
    url = f'https://www.legislation.gov.uk/{doc_id}/data.xht?view=snippet&wrap=true'
    # Without a timeout a stalled connection would block the scrape forever.
    r = requests.get(url, timeout=timeout)
    # NOTE(review): the HTTP status is not checked, so an error page is parsed
    # like any other document and simply yields no citations.
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    return BeautifulSoup(r.content, 'html.parser')

### Extraction of Citations

In [None]:
def get_citations(doc_soup):
    """Collect every citation link of a parsed document.

    Citations are the ``a`` elements that carry the ``LegCitation`` class.

    Args:
        doc_soup: Parsed document exposing ``find_all``.

    Returns:
        Whatever ``find_all`` returns for that tag/class pair.
    """
    tag_name, css_class = 'a', 'LegCitation'
    citation_links = doc_soup.find_all(tag_name, css_class)
    return citation_links

### Extraction of Parent Text

In [None]:
def get_parent_text(citation_list):
    """Return the surrounding passage of each citation.

    The passage is the ``text`` of the citation's grandparent element
    (``citation.parent.parent``).

    Args:
        citation_list: Citation elements, e.g. from get_citations().

    Returns:
        list: One passage per citation, in the same order.
    """
    return [link.parent.parent.text for link in citation_list]

## Labelling of Citations

### Keyword Identification

In [None]:
def label_citations(citations, parent_text):
    """Label each citation with the first keyword found in its passage.

    Every label from get_labels() is used as a case-insensitive regular
    expression; the first one (in label order) that matches the passage wins.
    Citations whose passage matches no label are left out of the result.

    Args:
        citations: Citation elements exposing ``get('href')``.
        parent_text: Passages aligned by index with ``citations``.

    Returns:
        list: ``[label, href]`` pairs for the citations that were labelled.
    """
    known_labels = get_labels()
    labelled = []
    for index, citation in enumerate(citations):
        surrounding = str(parent_text[index])
        first_match = next(
            (label for label in known_labels
             if search(label, surrounding, IGNORECASE)),
            None,
        )
        if first_match is not None:
            labelled.append([first_match, citation.get('href')])
    return labelled

### Generation of Valid URLs

In [None]:
def get_data_urls(citations):
    """Turn citation hrefs into legislation.gov.uk data-snippet URLs.

    Hrefs that do not start with ``http:``/``https:`` are prefixed with the
    legislation.gov.uk host. The path is then normalised:

    * ``www.opsi.gov.uk`` hosts are rewritten to ``www.legislation.gov.uk``,
      dropping a leading ``legislation`` segment (with ``european/directive``
      collapsed to ``eudr``);
    * a leading ``id`` segment is dropped;
    * ``european/<kind>`` becomes ``eur``, ``eudn`` or ``eudr``;
    * ``uksi``, ``ukpga``, ``ukcm``, ``ssi`` and ``nisr`` are kept as they are.

    Hrefs with fewer than four ``/``-separated parts, and ``ukci`` hrefs whose
    next segment contains ``-``, are skipped, so the result can be shorter
    than the input.

    Args:
        citations: Iterable of href strings.

    Returns:
        list[str]: Data URLs for the hrefs that were kept, in input order.

    Raises:
        ValueError: For a document type (or european kind) not handled above.
        IndexError: When a branch needs a path segment the href does not have.
    """
    site_root = 'https://www.legislation.gov.uk'
    data_suffix = '/data.xht?view=snippet&wrap=true'
    european_codes = {'regulation': 'eur', 'decision': 'eudn', 'directive': 'eudr'}
    passthrough_types = ('uksi', 'ukpga', 'ukcm', 'ssi', 'nisr')

    urls = []
    for href in citations:
        citation_elements = href.split('/')
        if citation_elements[0] not in ('https:', 'http:'):
            citation_elements = (site_root + href).split('/')

        # Too short to contain a document type: skip.
        if len(citation_elements) < 4:
            continue

        if citation_elements[2] == 'www.opsi.gov.uk':
            citation_elements[2] = 'www.legislation.gov.uk'
            if citation_elements[3] == 'legislation':
                if citation_elements[4] == 'european' and citation_elements[5] == 'directive':
                    # Collapse 'european/directive' into the single 'eudr' segment.
                    citation_elements[4:6] = ['eudr']
                del citation_elements[3]
        elif citation_elements[3] == 'id':
            del citation_elements[3]
        elif citation_elements[3] == 'european':
            european_kind = citation_elements[4]
            if european_kind not in european_codes:
                raise ValueError(f'Not sure what to do for {"/".join(citation_elements)}')
            # Replace 'european/<kind>' with its short type code.
            citation_elements[3:5] = [european_codes[european_kind]]
        elif citation_elements[3] in passthrough_types:
            pass
        elif citation_elements[3] == 'ukci' and '-' in citation_elements[4]:
            continue
        else:
            raise ValueError(f'Not sure what to do for {"/".join(citation_elements)}')

        urls.append('/'.join(citation_elements) + data_suffix)
    return urls

## Graph Generation

### Creation of CSV File

In [None]:
def create_graph(header=None):
    """Create the citation-network CSV with its header row, if it is missing.

    An existing file at CITATION_NETWORK_FILE_PATH is left untouched.

    Args:
        header: Column names to write. Defaults to
            ``['source', 'target', 'relation']``.
    """
    if os.path.exists(CITATION_NETWORK_FILE_PATH):
        return
    if header is None:
        header = ['source', 'target', 'relation']
    # newline='' lets the csv module control line endings; without it, extra
    # blank rows appear on platforms that translate '\n' on write.
    with open(CITATION_NETWORK_FILE_PATH, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(header)

### Amending to CSV File

In [None]:
def save_graph(doc_id, citation_urls, labels):
    """Append one row per citation to the citation-network CSV.

    Each row is ``[source, target, relation]``, where the target is built from
    path segments 3-5 of the URL split on ``/`` (type, year and number for the
    URLs produced by get_data_urls) joined with underscores.

    Args:
        doc_id: Identifier of the citing document; commas are removed.
        citation_urls: Data URLs of the cited documents.
        labels: Relation labels, paired with ``citation_urls`` by position.
    """
    # newline='' lets the csv module control line endings; without it, extra
    # blank rows appear on platforms that translate '\n' on write.
    with open(CITATION_NETWORK_FILE_PATH, 'a', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        doc_id = doc_id.replace(',', '')
        for url, label in zip(citation_urls, labels):
            url_list = url.replace(',', '').split('/')
            target_url = f'{url_list[3]}_{url_list[4]}_{url_list[5]}'
            csv_writer.writerow([doc_id, target_url, label])

## Main Program

In [None]:
# Make sure the output CSV exists with its header row before rows are appended;
# create_graph() does nothing when the file is already there.
create_graph()

# Scrape every listed document, label its citations and append them to the CSV.
for file in legislation_list:
    doc_id = f'{file[0].lower()}/{file[1]}/{file[2]}'
    try:
        print(f'Opening: {doc_id}')
        soup = get_from_url(doc_id)
        citations = get_citations(soup)
        print(f'    Found {len(citations)} citations in document')
        if citations:
            parent_text = get_parent_text(citations)
            labelled_citations = label_citations(citations, parent_text)
            print(f'    Labelled {len(labelled_citations)} citations in document')
            if len(labelled_citations) != len(citations):
                print(f'    {len(citations) - len(labelled_citations)} citations not labelled')

            if not labelled_citations:
                # Nothing matched: show the passages so missing keywords can be spotted.
                print(parent_text)
            else:
                # get_data_urls() silently drops citations it cannot resolve, so
                # convert one href at a time to keep each URL paired with its
                # own label instead of shifting the labels out of step.
                citation_urls = []
                labels = []
                for label, href in labelled_citations:
                    for url in get_data_urls([href]):
                        citation_urls.append(url)
                        labels.append(label)
                save_graph(doc_id.replace('/', '_'), citation_urls, labels)
                print('    Successfully saved citations to CSV file')
        print()

    except Exception:
        print(f'\n"{doc_id}" failed\n')
        # Stop at the first failure and surface the original traceback
        # (with_traceback() needs an argument and would itself raise TypeError).
        raise