In [35]:
import ast
import csv
import os
import pickle
import re
import string
import time

import pandas as pd
from bs4 import BeautifulSoup as bs
#from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [4]:
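# One-time setup: uncomment and run once to download the NLTK data used below
# ('stopwords' for stopwords.words, 'punkt' for sent_tokenize).
#import nltk

#nltk.download('stopwords')
#nltk.download('punkt')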

To clean the texts, here are some useful functions

In [5]:
class BasePreProcessor:
    """Clean a converted ``.txt`` document (plain text or HTML) into one lower-cased string.

    Typical use is ``BasePreProcessor().preprocess(path)``. The instance also keeps
    a few bookkeeping attributes (see ``__init__``) that can be inspected afterwards.
    """
    # Language code -> language name. Not used inside this class.
    language_map = {'en': 'english', 'es': 'spanish', 'fr': 'french', 'ru': 'russian', 'ar': 'arabic'}
    # English stop words, built once when the class is defined. Not used inside this class.
    stop_words = set(stopwords.words('english'))

    def __init__(self):
        # Not updated by any method of this class.
        self.language_count = {}
        # (file_path, exception) pairs for pages where the expected header/footer line was not found
        self.to_check = []
        # Counters for texts dropped for having too few / too many sentences.
        # Only referenced by the disabled filter in preprocess().
        self.too_few = 0
        self.too_many = 0

    def preprocess(self, file_path, html = 'auto'):
        """Read a UTF-8 ``.txt`` file and return its cleaned text.

        Parameters
        ----------
        file_path : str
            Path to the file; must have a ``.txt`` extension.
        html : bool or 'auto'
            ``True`` always runs ``text_from_html`` first; ``'auto'`` runs it only
            when BeautifulSoup finds at least one tag; any other value skips it.

        Returns
        -------
        str or None
            The output of ``process_string``, or ``None`` when the file is empty.

        Raises
        ------
        AssertionError
            If ``file_path`` does not end in ``.txt``.
        """
        assert os.path.splitext(file_path)[1] == '.txt', f"{file_path}: pre-processing assumes .txt input"

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Skip empty files (often scans or images only)
        if not text:
            return None

        # If the text contains HTML (or html is set to True), strip it down to the main content.
        # NOTE(review): lxml wraps bare text in <html><body><p>, so the 'auto' check is presumably
        # true for any text with non-whitespace content -- confirm; "html.parser" would not add tags.
        if html == True or (html == 'auto' and bool(bs(text, "lxml").find())):
            text = self.text_from_html(text, file_path)

        cleaned_text = self.process_string(text)

        # A minimum/maximum number of sentences per page could be enforced here; disabled for now:
        # if SENTENCES_PER_PAGE_MIN <= len(sentence_list) < SENTENCES_PER_PAGE_MAX:
        #     page_dict[page] = sentence_list
        # elif len(sentence_list) < SENTENCES_PER_PAGE_MIN:
        #     self.too_few += 1
        # elif len(sentence_list) >= SENTENCES_PER_PAGE_MAX:
        #     self.too_many += 1

        return cleaned_text

    def text_from_html(self, html_text, file_path):
        """Extract the main text of an HTML page as a single string.

        Returns the joined text of all ``div[data-module=govspeak]`` elements when
        that is longer than 30 characters. Otherwise falls back to the text of all
        paragraph, heading and list elements, cutting off everything up to the
        standard copyright header line and everything from the "Is this page useful?"
        footer line. A missing header or footer is recorded in ``self.to_check`` as
        ``(file_path, exception)`` and the text is left uncut on that side.
        """
        # Same parser as the detection step in preprocess(); being explicit also avoids
        # BeautifulSoup's "no parser was explicitly specified" warning.
        soup = bs(html_text, "lxml")

        # Many pages (incl. most news items) have their main text in one div
        govspeak_text = " ".join(t.text for t in soup.find_all("div", {"data-module": "govspeak"}))
        if len(govspeak_text) > 30:
            return govspeak_text

        # Otherwise, take paragraphs, headings, lists on page.
        # NOTE(review): selecting 'li' as well as 'ol'/'ul' presumably yields list text twice -- confirm.
        texts = [x.text for x in soup.find_all(['p', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'])]

        # Remove header. Some pages have a non-standard layout but many end their header with the same line.
        header_end = '\n  Where we have identified any third party copyright information you will need to obtain permission from the copyright holders concerned.\n'
        availability_prefix = '\n  This publication is available at'
        try:
            head_index = texts.index(header_end)
        except ValueError as e:
            self.to_check.append((file_path, e))
        else:
            # The header line may be the very last element, so check bounds before peeking ahead.
            followed_by_availability = (head_index + 1 < len(texts)
                                        and texts[head_index + 1].startswith(availability_prefix))
            texts = texts[head_index + (2 if followed_by_availability else 1):]

        # Remove footer. Pages by default end with an "Is this page useful?" button.
        try:
            foot_index = texts.index('Is this page useful?')
        except ValueError as e:
            self.to_check.append((file_path, e))
        else:
            texts = texts[:foot_index]

        return " ".join(texts)

    def process_string(self, messy_text, 
                       MULTIPLE_PUNCTUATION = 4, 
                      SENTENCE_MIN_CHARACTERS = 3, SENTENCE_MIN_WORDS = 1,
                       SENTENCE_MAX_WORDS = False
                      ):
        """Tokenize on sentence level and do some basic filtering.

        Steps, in order:
            - Re-join words that were split after a hyphen ("pre- processing" -> "pre-processing").
            - Remove URLs and e-mail addresses; replace newline characters with a space.
            - Split into sentences with NLTK (English is assumed).
            - Drop sentences containing MULTIPLE_PUNCTUATION consecutive dots or spaces
              (an attempt to remove table-of-contents lines).
            - Drop sentences in which at least half of the characters are digits.
            - Drop sentences shorter than SENTENCE_MIN_CHARACTERS characters or
              SENTENCE_MIN_WORDS words, and, if SENTENCE_MAX_WORDS is truthy, sentences
              with SENTENCE_MAX_WORDS words or more.
            - Lowercase everything.

        Returns the remaining sentences joined by single spaces, as one string.

        This function can be sped up, but let's keep it uncomplicated.
        The downsides of such simple pre-processing are:
            - Sentences may be cut in two, or joined by sentences from captions.
            - Title pages, tables of contents, lists of tables and figures are generally difficult to filter out.
            - The sentence tokenizer is not able to deal with errors from the PDF parser.
            - Cannot filter out addresses.
        """
        # == General text-based pre-processing ==
        # Join words broken after a hyphen by dropping the space that follows the hyphen.
        messy_text = messy_text.replace('- ', '-')
        # Remove urls, e-mail, newline characters (both windows & mac format)
        messy_text = re.sub(r'http\S+', '', messy_text)
        messy_text = re.sub(r'\S+@\S+(?:\.\S+)+', '', messy_text)
        messy_text = re.sub(r'\r\n|\r|\n', ' ', messy_text)

        # == Sentence-based filtering ==
        # Use NLTK to tokenize sentences. Assumes all texts are English
        sentences = sent_tokenize(messy_text, language='english')

        # Attempt to remove tables of contents: runs of dots or white space of length MULTIPLE_PUNCTUATION.
        dot_str, space_str = '.' * MULTIPLE_PUNCTUATION, ' ' * MULTIPLE_PUNCTUATION
        sentences = [s for s in sentences if dot_str not in s and space_str not in s]

        # Remove sentences in which at least half of the characters are digits
        # (empty strings are dropped first so the ratio is always defined).
        sentences = [s for s in sentences if len(s) > 0]
        sentences = [s for s in sentences
                     if sum(len(digits) for digits in re.findall(r'\d+', s)) / len(s) < 0.5]

        # Remove short and long sentences (long can become important if we want to do embeddings)
        sentences = [s for s in sentences
                     if len(s) >= SENTENCE_MIN_CHARACTERS and len(s.split()) >= SENTENCE_MIN_WORDS]
        if SENTENCE_MAX_WORDS:
            sentences = [s for s in sentences if len(s.split()) < SENTENCE_MAX_WORDS]

        # # Remove all caps sentences.
        # sentences = [s for s in sentences if not s.isupper()]

        # For creating the initial database, return the lower-cased text as one long string.
        # Depending on whether we later select by sentence or by word count, this may change.
        return " ".join(s.lower() for s in sentences)

    def tokenized_preprocess(self, page_dict, language): #OLD- processes on word level; I don't think we'll need this
        """Word-level clean-up of ``page_dict``, modified in place and returned.

        Each value (an iterable of strings) is joined with spaces, stripped of digits
        and ASCII punctuation, and reduced to the words that are longer than two
        characters and not in the NLTK stop-word list for ``language``.
        """
        # The stop-word list does not depend on the page, so build it once.
        # It is skipped for an empty dict, so no corpus lookup happens in that case.
        language_stopwords = set(stopwords.words(language)) if page_dict else set()
        punctuation_table = str.maketrans('', '', string.punctuation)

        # Only values of existing keys are replaced, so mutating while iterating is safe here.
        for page_id, text in page_dict.items():
            joined = ' '.join(text)
            without_digits = ''.join(ch for ch in joined if not ch.isdigit())
            without_punctuation = without_digits.translate(punctuation_table)
            kept_words = [word for word in without_punctuation.split()
                          if word not in language_stopwords and len(word) > 2]
            page_dict[page_id] = ' '.join(kept_words)
        return page_dict

In [7]:
# Smoke test: run the pre-processor over every .txt file in the test folder and time it
t0 = time.time()
#file_path = r'D:\OneDrive - Wageningen University & Research\Policy scraper UK NL\Data\Test\test_html.txt'
file_dir = r'C:\Users\siets009\OneDrive - Wageningen University & Research\Policy scraper UK NL\Data\TestOut'

processor = BasePreProcessor()

out = []
for i, file in enumerate(os.listdir(file_dir)):
    # Only the converted .txt files are pre-processed; i is the position in the directory listing
    if not file.endswith('.txt'):
        continue
    out.append(processor.preprocess(os.path.join(file_dir, file)))
    print(i, file)

elapsed_seconds = int(time.time() - t0)
print('\n', f"Processed {len(out)} texts in {elapsed_seconds} seconds")
out[:5]

0 002.21_-_Land_at_Pines_Hill_-_Design_and_Access_Statement-compressed_Redacted.txt
1 03335-CO-Integrated-Review-Foreward-and-Overview-FINAL-RUSSIAN-WEB-DISPLAYABLE-PDF.txt
2 03335-CO-Integrated-Review-Foreword-and-Overview-FINAL-ARABIC-WEB-DISPLAYABLE-PDF.txt
3 03335-CO-Integrated-Review-Foreword-and-Overview-FINAL-CHINESE-WEB-DISPLAYABLE-PDF.txt
4 03335-CO-Integrated-Review-Foreword-and-Overview-FINAL-FRENCH-WEB-DISPLAYABLE-PDF.txt
5 03335-CO-Integrated-Review-Foreword-and-Overview-FINAL-SPANISH-WEB-DISPLAYABLE-PDF.txt
6 0342.txt
7 1-public-evidence-session-9-july-2013.txt
8 1-s2.0-S1876610214023558-main.txt
9 13-809-future-manufacturing-project-report.txt
10 13_1574.txt
11 143-foi-12-0449-0450--environmental-information-regul.txt
12 14_0483.txt
13 14_0639.txt
14 151209_publication_v1_4.txt
15 1943-nps-nuclear-power-annex-volII.txt
16 2014_15241.txt
17 2014_15314.txt
18 2018-final-emissions-statistics-summary.txt
19 2018_EASO_COI_Nigeria_TargetingIndividuals.txt
20 2018_Final_greenho

["l a n d e a s t o f p i n e s h i l l stansted mountfitchet design and access statement february 2023 land east of pines hill, stansted mountfitchet:design and access statement this design and access statement has been prepared by on architecture on behalf of luxus homes ltd. this document has been designed to be printed double sided at a3 (landscape). logan house, first published by on architecture ltd, august 2021. st andrews close, canterbury, © 2021 on architecture ltd (unless otherwise stated within this document) prepared by elm / tw ct1 2rp all rights reserved. no part of this publication may be reproduced, stored in retrieval systems, or transmitted, in any form or by any checked by jr means electronic, mechanical, photocopying, recording or otherwise without prior permission of on architecture ltd. t:01227 634 334 date issued february 2023  please note: revision - w: unless otherwise stated all drawings, maps, images and diagrams contained within this document are not to sca

This seems to work decently, though some long PDFs take a while.
We can therefore run it on the whole dataset, and this is probably a good time to build a single dataframe that includes all the metadata. Pandas is useful for this: basically, we do full outer joins of the metadata. We can then add the processed text, using the mapping table where necessary.

In [27]:
# Load the metadata.txt written into each download folder and stack them into one dataframe
download_dirs = [ 
    r'C:\Users\siets009\OneDrive - Wageningen University & Research\Policy documents UK NL\Data\230524_GlobalWarming',
    r'C:\Users\siets009\OneDrive - Wageningen University & Research\Policy documents UK NL\Data\230524_ClimateChange',
    r'C:\Users\siets009\OneDrive - Wageningen University & Research\Policy documents UK NL\Data\230524_Climate'
]

meta_frames = []
for prior_dir in download_dirs:
    with open(os.path.join(prior_dir, 'metadata.txt'), 'r') as f:
        # literal_eval only accepts Python literals (dict/list/str/number/None), so a
        # tampered metadata file cannot execute code the way eval() would allow.
        meta = ast.literal_eval(f.read())
    # One row per key of the metadata dict; the key becomes the index
    meta_frames.append(pd.DataFrame.from_dict(meta, orient='index',
                                              columns = ['date', 'department', 'category', 'source_link', 'download_type']))

# Concatenate once (instead of growing a frame inside the loop) and turn the index into a column
df = pd.concat(meta_frames).reset_index(names = 'original_file')
print(df.shape)
df.head()

(25922, 6)


Unnamed: 0,original_file,date,department,category,source_link,download_type
0,C:\Users\ajsie\OneDrive - Wageningen Universit...,15 November 2022,Government Property Function,guidance_and_regulation,https://www.gov.uk/government/publications/gov...,linked_pdf
1,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf
2,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf
3,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf
4,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf


In [29]:
# Ten most frequent departments (value_counts sorts by descending count)
df['department'].value_counts().head(10)

Planning Inspectorate                                    2597
                                                         2244
Department of Energy & Climate Change                    2036
Environment Agency                                       1360
Department for Environment, Food & Rural Affairs         1316
Foreign & Commonwealth Office                            1130
Prime Minister's Office, 10 Downing Street               1001
Department for Energy Security and Net Zero               884
Foreign, Commonwealth & Development Office                864
Department for Business, Energy & Industrial Strategy     849
Name: department, dtype: int64

In [39]:
# NOTE: pickle.load can execute arbitrary code -- only load mapping files you created yourself.
with open(r'C:\Users\siets009\OneDrive - Wageningen University & Research\Policy scraper UK NL\Data\230524_ConvertedPDFs\mappingToPDFs.pickle', 'rb') as f:
    mapping = pickle.load(f)

# Each mapping entry holds (original file, converted txt file) in its first two positions.
# A dict gives one lookup per row instead of one full-column comparison per mapping entry;
# as in a sequential assignment loop, the last entry wins for a repeated original file.
txt_by_original = {entry[0]: entry[1] for entry in mapping}

# Rows without a mapping entry get None
df['txt_file'] = [txt_by_original.get(original) for original in df['original_file']]
df.head()

Unnamed: 0,original_file,date,department,category,source_link,download_type,txt_file
0,C:\Users\ajsie\OneDrive - Wageningen Universit...,15 November 2022,Government Property Function,guidance_and_regulation,https://www.gov.uk/government/publications/gov...,linked_pdf,
1,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf,
2,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf,
3,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf,
4,C:\Users\ajsie\OneDrive - Wageningen Universit...,,,guidance_and_regulation,https://www.gov.uk/government/publications/ene...,linked_pdf,


In [42]:
# Number of distinct txt_file values, counting the missing value as one of them
df['txt_file'].nunique(dropna=False)

6560

In [43]:
# Number of entries in the mapping loaded from mappingToPDFs.pickle
print(len(mapping))

11111
