# Load Summarisation Pipeline

In [None]:
from transformers import pipeline

class Summariser:
    """Summarise text documents with a Hugging Face summarisation pipeline.

    Summaries are accumulated in ``summarised_docs`` (a dict keyed by ref), so
    results from successive ``summarise_docs`` calls are kept together.
    """

    def __init__(self, max_length, min_length, do_sample=False):
        """Load the summarisation pipeline and store the generation settings.

        Args:
            max_length (int): Maximum length of the generated summary.
            min_length (int): Minimum length of the generated summary.
            do_sample (bool): Whether to use sampling when generating
                summaries; forwarded unchanged to the pipeline call.
        """
        self.summarizer = pipeline('summarization', model='facebook/bart-large-xsum')
        self.max_length = max_length
        self.min_length = min_length
        self.do_sample = do_sample
        self.summarised_docs = {}

    def summarise_docs(self, docs):
        """Summarise every document in ``docs`` and store the results.

        Args:
            docs (dict): Mapping of ref -> text to summarise.

        Returns:
            dict: ``self.summarised_docs`` -- summaries keyed by ref. Refs
            whose summarisation failed are reported on stdout and skipped.
        """
        for ref, text in docs.items():
            try:
                result = self.summarizer(
                    text,
                    max_length=self.max_length,
                    min_length=self.min_length,
                    do_sample=self.do_sample,
                )
                self.summarised_docs[ref] = result[0]['summary_text']
            except Exception as exc:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still stop a long run, and
                # report the real error instead of only guessing at the cause.
                print(
                    f"Document for '{ref}' is most likely too long "
                    f"({type(exc).__name__}: {exc})"
                )
                continue

        return self.summarised_docs

    def summary(self, ref):
        """Print the stored summary for ``ref``.

        Returns None. Raises KeyError if no summary is stored for ``ref``.
        """
        print(self.summarised_docs[ref])

In [None]:
# Sample input for the summariser: maps each reference id to a dict of
# document id -> document text.
fake_dictionary = {
    "ref1": {
        "doc1": "Motsugo is a minnow sized fish, also known as the Topmouth Gudgeon. It was introduced in the 1960s into ponds in Nucet, Dâmboviţa County, Romania and it made its way into Danube, then spreading throughout Europe. They pose danger to another species such as the sunbleaks (Leucaspius delineatus), as they are the carrier of a parasite (Sphaerothecum destruens) which while not damaging to the topmouth gudgeon, attacks other fishes like the sunbleaks, which are unable to spawn and have a higher mortality when infected.",
        "doc4": "Motsugo was purchased new by the club, with money from donations and club funds. From the outset, it was designed and built as a file server to replace martello (and do the job much more reliably). 12G more RAM added 20110617.",
        "doc5": "Host of secure user home directories, main user shell server. Took over Martello's tasks. Still used for a couple of Member VMs because people are scared of Medico. Frames loves medico more than everyone else, combined, and is running many big jobs on it for his final year project (2014).",
    },
    "ref2": {
        "doc1": "It was originally purchased by the club in an auction for approximately $600 in 2017. Maltair was then a 1RU IBM System x3550 M4 server with dual 8-core Xeon E5-2680 CPUs and 192GB DDR3 ECC RAM (1600 MHz). Originally labelled maltair, this became maltair (dead) and later old maltair (dead). Following RCD testing one day in August 2018, it very suddenly became completely dead. As it turns out, an IMM2 firmware bug affecting that particular model of server caused a current surge through the built-in voltage regulators every time it was powered on, which could vastly decrease the lifetime of the components and eventually would result in a system board failure requiring a replacement of the entire motherboard. ([BOB] attempted to locate and replace the failed component itself, however in the end it was unsuccessful).",
        "doc4": "A new server of the same model but vastly inferior specifications was then purchased from eBay for $400 to provide a replacement motherboard which was compatible with the components from the original server. This was nicknamed New Maltair for some time, until it died and then became New Dead Maltair. Components were transferred and the firmware was upgraded, which left a functional system almost identical to the original M4.",
        "doc5": "Curiously enough, in February 2019, the replacement x3550 M4 hardware for Maltair died in the exact same fashion as its predecessor, apparently without regard for the firmware upgrade that was intended to fix the issue. Instead, the current HP DL380p G8 hardware was purchased as a replacement. This latest revision is often referred to as New Undead Maltair.",
    },
    "ref3": {
        "doc1": "The University Computer Club is a student club located in Perth, Western Australia at the University of Western Australia with the objective of advancing computer science and technology in the University community. Many students call the UCC home and spend numerous hours hanging out, working on personal projects, gaming, coding and trying to finish assignments in the clubroom.",
        "doc4": "If you want to see things the club has done, have a look at our networked coke machine and Internet enabled doors. The clubroom is generally open from around 9am till 10pm on weekdays and from around 1pm till 10pm on weekends. Opening times may vary wildly depending on weather, schedules and holidays. The easiest way to check if someone is around is by using the webcams. We have directions for getting to the clubroom from Stirling Hwy and the Guild Village.",
        "doc5": "The UCC would be absolutely nothing without the generous sponsorship it receives from companies, organisations and individuals, both in Perth and around the world. Through them, the UCC has been able to get hardware, software and other requirements for its members that it would otherwise be unable to afford.",
    },
}

# Summarise Documents

In [None]:
def combine(d):
    """Join each ref's documents into a single space-separated string.

    ``d`` maps ref -> {doc id -> text}; the result maps ref -> combined text,
    with the texts joined in the inner dict's iteration order.
    """
    return {key: ' '.join(texts.values()) for key, texts in d.items()}

# Merge each ref's documents into one text per ref, then print the merged
# texts so they can be inspected before summarising.
combined_docs = combine(fake_dictionary)
for ref, text in combined_docs.items():
    print(f"Reference '{ref}':\n{text}\n")

In [None]:
# Constructing the Summariser loads the 'facebook/bart-large-xsum' pipeline.
summariser = Summariser(max_length=50, min_length=10, do_sample=False)

# Summarise the combined text of every ref; failed refs are skipped.
summarised_docs = summariser.summarise_docs(combined_docs)

# Show each generated summary next to its ref.
for ref, summary in summarised_docs.items():
    print(f"Summary for {ref}:\n{summary}")

In [None]:
# Print the stored summary for a single ref.
summariser.summary("ref1")