# Load Summarisation Pipeline

Taken from Mitch :)

In [38]:
from transformers import pipeline

class Summariser:
    """Wrapper around a ``facebook/bart-large-*`` summarisation pipeline.

    The generation settings are fixed at construction time and reused for
    every call to ``summarise``.
    """

    def __init__(self, max_length, min_length, type=False, do_sample=False):
        """Load the summarisation pipeline and store the generation settings.

        Args:
            max_length (int): Maximum length of the generated summary.
            min_length (int): Minimum length of the generated summary.
            type (str | bool): Suffix of the ``facebook/bart-large-<type>``
                model to load; any falsy value selects ``'xsum'``. The name
                shadows the builtin but is kept so keyword callers still work.
            do_sample (bool): Whether to sample when generating summaries
                instead of using greedy decoding.
        """
        trained = type if type else 'xsum'
        self.summarizer = pipeline('summarization', model=f'facebook/bart-large-{trained}')

        self.max_length = max_length
        self.min_length = min_length
        self.do_sample = do_sample

    def summarise(self, text):
        """Summarise ``text`` using the settings given to the constructor.

        Returns:
            The pipeline's result for ``text``, or ``None`` if the pipeline
            raised an exception (best-effort: a warning is printed instead).
        """
        try:
            return self.summarizer(
                text,
                max_length=self.max_length,
                min_length=self.min_length,
                do_sample=self.do_sample,
            )
        except Exception as err:
            # Only swallow ordinary errors; a bare ``except`` would also trap
            # KeyboardInterrupt/SystemExit and make long runs uninterruptible.
            print(f"Warning: summarisation failed: {err!r}")
            return None

# Summarise Documents

In [39]:
import pickle
import re
import os

class Tender:
    """A tender reference together with the cleaned text of its files.

    ``file_map`` maps each file name to its cleaned content, or to ``None``
    when the file was added without content. The map is persisted with
    pickle as ``<reference>.pickle``.
    """

    def __init__(self, reference):
        self.reference = reference
        self.file_map = {}

    def clean_text(self, text):
        """Return ``text`` reduced to ASCII letters, digits, ``.``, ``,`` and spaces.

        Bytes input is decoded as UTF-8 first. Every other character becomes
        a space, whitespace runs collapse to a single space and runs of dots
        collapse to a single dot.
        """
        # convert from binary string if needed
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        # ``A-Z`` (not ``A-z``): the wider range also matched [ \ ] ^ _ and `,
        # letting them through. Backslashes are now covered by this pattern,
        # so no separate backslash substitution is needed.
        text = re.sub(r"[^a-zA-Z0-9.,]", " ", text)
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"\.+", ".", text)
        return text

    def add(self, file_name, content):
        """Store the cleaned ``content`` under ``file_name``.

        ``None`` content is stored as ``None`` after printing a warning. A
        file name that is already present is not overwritten; a warning is
        printed instead.
        """
        if content is None:
            print(f"Warning: None content for ref:{self.reference}, fname:{file_name}")
        else:
            content = self.clean_text(content)

        if file_name in self.file_map:
            # hopefully wont happen
            print(f"Warning: duplicate file name added for ref:{self.reference} fname:{file_name}")
        else:
            self.file_map[file_name] = content

    def save(self):
        """Pickle ``file_map`` to the path derived from ``reference``.

        The path goes through the same helper as ``exists``/``load`` so a
        tender loaded from ``X.pickle`` is written back to ``X.pickle``
        rather than to ``X.pickle.pickle``.
        """
        with open(Tender.__correct_handle(self.reference), 'wb') as file_handle:
            pickle.dump(self.file_map, file_handle, protocol=pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def __correct_handle(reference):
        """Map a reference or pickle path to the pickle file path."""
        if ".pickle" in reference:  # assume its a fpath, dont change
            return reference
        # otherwise try ref.pickle
        return f"{reference}.pickle"

    @staticmethod
    def exists(reference):
        """Return whether the pickle file for ``reference`` exists."""
        return os.path.exists(Tender.__correct_handle(reference))

    @staticmethod
    def load(reference):
        """Load a saved tender, or return ``None`` if its pickle is missing.

        The returned tender's ``reference`` is the argument exactly as given.
        """
        if not Tender.exists(reference):
            return None
        with open(Tender.__correct_handle(reference), 'rb') as file_handle:
            t = Tender(reference)
            # SECURITY: pickle.load can execute arbitrary code; only load
            # pickle files that come from a trusted source.
            t.file_map = pickle.load(file_handle)
            return t

In [45]:
import itertools
import os
import pickle

# Exploratory driver: find the first saved tender that has a title, a
# description and at least one extra file, summarise the extra text, print
# the inputs, then stop.
summariser = Summariser(max_length=30, min_length=10, do_sample=False)

path = r"/home/ucc/maxichat/Capstone/data/tender_raw"
# Tender.load resolves bare file names relative to the working directory.
os.chdir(path)
for file in os.listdir(path):
    if not file.endswith(".pickle"):
        continue
    t = Tender.load(file)
    if t is None:
        continue
    if "TITLE" not in t.file_map or "DESCRIPTION" not in t.file_map:
        continue
    # More than two entries means there is content besides title/description.
    if len(t.file_map) <= 2:
        continue

    title = t.file_map["TITLE"]
    description = t.file_map["DESCRIPTION"]
    # Tender.add stores None for files without content; skip those so the
    # concatenation cannot raise TypeError.
    extra_text = "".join(
        content + "\n"
        for key, content in t.file_map.items()
        if key not in ("TITLE", "DESCRIPTION") and content is not None
    )
    # Only the first 3000 characters are passed to the summariser.
    extra_text = extra_text[0:3000]
    # NOTE(review): the summary is computed but never printed or stored --
    # confirm whether print(summary) was intended.
    summary = summariser.summarise(extra_text)
    print(title)
    print(description)
    print(extra_text)
    break

Department of Education Kalamunda Senior High School Stem Upgrade
Department of Education Kalamunda Senior High School Stem Upgrade A non mandatory site inspection and or tender briefing will be held on Tuesday 29th November 2022 at 10.00 am. To confirm attendance at this briefing, you must contact Michael Spight, Director, TAG Architects Ph 9227 0511 or via email on michael tagarchitects.com.au
0 25Government of Finance Department of Buildings and ContractsWestern Australia Q.OmaHDA Hydraulics Design Australia1 300 Fitzgerald Street Perth WA 6000 Telephone 08 9225 9300 Facsimile 08 9388 6117 www.hdawa.com.auHydraulics Design AustraliaGarden BedFootpathStaff Carpark CourtyardWalkway
0 25Government of Finance Department of Buildings and ContractsWestern Australia Q.OmaHDA Hydraulics Design Australia1 300 Fitzgerald Street Perth WA 6000 Telephone 08 9225 9300 Facsimile 08 9388 6117 www.hdawa.com.auHydraulics Design Australia
3327 BusCANNING ROAD 0 25Government of Finance Department of Bu