# Load Summarisation Pipeline

In [None]:
from transformers import pipeline

class Summariser:
    """Summarise documents with a pretrained BART summarisation pipeline.

    Summaries are kept in ``summarised_docs`` (keyed by document reference)
    and accumulate across successive calls to ``summarise_docs``.
    """

    def __init__(self, max_length, min_length, type=False, do_sample=False):
        """Load the summarisation pipeline and store the generation settings.

        Args:
            max_length (int): Maximum length of each generated summary;
                forwarded unchanged to the pipeline.
            min_length (int): Minimum length of each generated summary;
                forwarded unchanged to the pipeline.
            type (str | bool): Suffix of the ``facebook/bart-large-*``
                checkpoint to load. Any falsy value selects ``'xsum'``.
                The name shadows the builtin but is kept so existing
                keyword callers continue to work.
            do_sample (bool): Forwarded to the pipeline. ``False`` asks for
                deterministic decoding; ``True`` enables sampling.
        """
        trained = type or 'xsum'
        self.summarizer = pipeline('summarization', model=f'facebook/bart-large-{trained}')

        self.max_length = max_length
        self.min_length = min_length
        self.do_sample = do_sample
        self.summarised_docs = {}

    def summarise_docs(self, docs):
        """Summarise every document in *docs* and cache the results.

        Args:
            docs (dict): Mapping of reference -> text to summarise.

        Returns:
            dict: ``self.summarised_docs``, i.e. every summary produced so
            far by this instance (including earlier calls), keyed by
            reference. Documents whose summarisation failed are skipped.
        """
        for ref, text in docs.items():
            try:
                result = self.summarizer(
                    text,
                    max_length=self.max_length,
                    min_length=self.min_length,
                    do_sample=self.do_sample,
                )
                self.summarised_docs[ref] = result[0]['summary_text']
            except Exception as exc:
                # Best effort: skip the failing document but report the real
                # error. ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt/SystemExit stop a long-running loop.
                print(
                    f"Document for '{ref}' is most likely too long "
                    f"({exc.__class__.__name__}: {exc})"
                )
                continue

        return self.summarised_docs

    def summary(self, ref):
        """Print the stored summary for *ref*.

        Returns None. Raises KeyError if *ref* has not been summarised.
        """
        print(self.summarised_docs[ref])

# Read Pickles

In [None]:
import os
import pickle

def pickler(path):
    """Load every ``.pickle`` file found directly inside *path*.

    Args:
        path (str): Directory to scan (not recursive).

    Returns:
        tuple: ``(pickles_read, pickles_unread)`` where ``pickles_read``
        maps each file's name without extension to its unpickled object and
        ``pickles_unread`` lists the file names that could not be loaded.

    Raises:
        OSError: If *path* itself cannot be listed.

    NOTE(security): ``pickle.load`` can execute arbitrary code; only point
    this at directories whose contents you trust.
    """
    pickles_read = {}
    pickles_unread = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting dict order (and any "first n" slice of it) reproducible.
    for file in sorted(os.listdir(path)):
        if not file.endswith('.pickle'):
            continue

        ref = os.path.splitext(file)[0]
        try:
            with open(os.path.join(path, file), "rb") as data:
                pickles_read[ref] = pickle.load(data)
        except Exception:
            # Best effort: unreadable, empty or corrupt files are recorded
            # rather than aborting the scan. ``Exception`` (not a bare
            # ``except``) keeps Ctrl-C / SystemExit working.
            pickles_unread.append(file)

    return pickles_read, pickles_unread
                
# Directory scanned for '.pickle' files (machine-specific absolute path).
path = "C:/Users/Mitch/pickles/"

# pickler returns the loaded objects keyed by file name without extension,
# plus the list of file names that failed to load.
pickles, empty_pickles = pickler(path)

# Summarise Documents

In [None]:
import itertools

def combine(d):
    """Join each reference's documents into one space-separated string.

    Args:
        d (dict): Mapping of reference -> dict whose values are strings.

    Returns:
        dict: Mapping of reference -> all of its document texts joined with
        single spaces, in the inner dict's iteration order.
    """
    return {
        reference: ' '.join(texts.values())
        for reference, texts in d.items()
    }

# Collapse each reference's documents into a single string per reference.
combined_docs = combine(pickles)

# Keep only the first n references (in dict iteration order) as a smaller
# working set for the summariser.
n = 10
test_docs = dict(itertools.islice(combined_docs.items(), n))

# Debugging aid: uncomment to print the combined text of every reference.
#for ref, text in combined_docs.items():
#    print(f"Reference '{ref}':\n{text}\n")

In [None]:
# No `type` argument is given, so the Summariser default checkpoint is used;
# summary length is bounded to 10-30 and sampling is disabled.
summariser = Summariser(max_length=30, min_length=10, do_sample=False)

# Summarise only the reduced working set; the result maps each reference to
# its summary text (references that failed are absent).
summarised_docs = summariser.summarise_docs(test_docs)

# Print every summary that was produced.
for ref, summary in summarised_docs.items():
    print(f"Summary for {ref}:\n{summary}")