# Load Summarisation Pipeline

# Method 1: Chunking

In [None]:
from transformers import pipeline

class Summariser:
    """ 
    Summarises documents with a BART summarisation pipeline, splitting
    over-long inputs into chunks that are summarised separately.

    max_length (int): Maximum length of the generated summary (forwarded to the pipeline).
    min_length (int): Minimum length of the generated summary (forwarded to the pipeline).
    chunk_size (int): Max number of characters per chunk when the combined text is too
                      large to summarise in one go. A falsy value disables chunking.
    type (str): Suffix of the 'facebook/bart-large-*' checkpoint to load (e.g. 'cnn').
                Falsy values fall back to 'xsum'.
    do_sample (bool): Forwarded to the pipeline; False disables sampling during generation.
    """

    def __init__(self, max_length, min_length, chunk_size, type=False, do_sample=False):
        """ Load the summarisation pipeline and store the generation settings """

        # `type` shadows the builtin, but it is part of the public signature
        # (callers pass type='cnn'), so the name is kept.
        trained = type or 'xsum'
        self.summarizer = pipeline('summarization', model=f'facebook/bart-large-{trained}')

        self.max_length = max_length
        self.min_length = min_length
        self.do_sample = do_sample
        self.chunk_size = chunk_size
        self.summarised_docs = {}

    def summarise_docs(self, docs):
        """ 
        Summarise every document in docs and cache the results on the instance.

        docs (dict): Maps each ref to a dict of its text fields. Documents with at most
                     two fields are stored unsummarised.

        Returns: dict of summaries per ref (the same dict that accumulates across calls)
        """

        for ref, keys in docs.items():
            # combine text
            text = self.combine(keys)

            # don't summarise title and short desc
            if len(keys) <= 2:
                self.summarised_docs[ref] = text
                print(f"{ref} not summarised")
                continue

            # If chunking is enabled and text is too long
            if self.chunk_size and len(text) > self.chunk_size:
                chunk_summaries = []
                for chunk in self.chunk_text(text, self.chunk_size):
                    try:
                        chunk_summaries.append(self._summarise(chunk))
                    except Exception as exc:
                        # Best effort: a failing chunk is skipped, the rest are kept.
                        print(f"Chunked document for '{ref}' is too long ({exc})")
                self.summarised_docs[ref] = "\n".join(chunk_summaries).strip()
                print(f"{ref} summarised in chunks")
                continue

            # summarise as normal if extra information not long
            try:
                self.summarised_docs[ref] = self._summarise(text)
                print(f"{ref} summarised")
            except Exception as exc:
                # The ref is left out of the results when summarisation fails.
                print(f"Error occured while processing {ref}!!! ({exc})")

        return self.summarised_docs

    def _summarise(self, text):
        """ Run the pipeline on one piece of text and return the summary string """
        summary = self.summarizer(
            text,
            max_length=self.max_length,
            min_length=self.min_length,
            do_sample=self.do_sample,
        )
        return summary[0]['summary_text']

    def summary(self, ref):
        """ Print the stored summary for ref (KeyError if ref has not been processed) """
        print(self.summarised_docs[ref])

    def combine(self, keys):
        """ Concatenate all non-None field values of a document, in dict order """
        # A separate accumulator is essential here: reusing one name for both the
        # running total and the loop variable returns only the last field, doubled.
        return "".join(value for value in keys.values() if value is not None)

    def chunk_text(self, text, chunk_size):
        """ Split long text into chunks of specified size to summarise separately """
        if chunk_size <= 0:
            # A non-positive step would never advance through the text.
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Method 2: Extract relevant text

In [None]:
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize

# Fetch NLTK's 'punkt' resource at import time; presumably this is the sentence
# tokenizer data that sent_tokenize (imported above) needs.
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead —
# confirm against the installed version.
nltk.download('punkt')

class Summariser:
    """ 
    Summarises documents with a BART summarisation pipeline, feeding it only the
    sentences most similar to each document's short description.

    max_length (int): Maximum length of the generated summary (forwarded to the pipeline).
    min_length (int): Minimum length of the generated summary (forwarded to the pipeline).
    type (str): Suffix of the 'facebook/bart-large-*' checkpoint to load (e.g. 'cnn').
                Falsy values fall back to 'xsum'.
    do_sample (bool): Forwarded to the pipeline; False disables sampling during generation.
    """

    def __init__(self, max_length, min_length, type=False, do_sample=False):
        """ Load the summarisation pipeline and store the generation settings """

        # `type` shadows the builtin, but it is part of the public signature
        # (callers pass type='cnn'), so the name is kept.
        trained = type or 'xsum'
        self.summarizer = pipeline('summarization', model=f'facebook/bart-large-{trained}')

        self.max_length = max_length
        self.min_length = min_length
        self.do_sample = do_sample
        self.summarised_docs = {}

    def summarise_docs(self, docs):
        """ 
        Summarise every document in docs and cache the results on the instance.

        docs (dict): Maps each ref to a dict of its text fields. The "DESCRIPTION" field
                     is used as the query for selecting relevant sentences. Documents with
                     at most two fields are stored unsummarised.

        Returns: dict of summaries per ref (the same dict that accumulates across calls)
        """

        for ref, keys in docs.items():
            # combine text
            text = self.combine(keys)

            # don't summarise title and short desc
            if len(keys) <= 2:
                self.summarised_docs[ref] = text
                print(f"{ref} not summarised")
                continue

            # TF-IDF and cosine similarity to find relevant sentences.
            # A missing/empty description or empty text leaves nothing to rank,
            # so tokenising and vectorising are skipped entirely.
            short_desc = keys.get("DESCRIPTION")
            relevant_text = ""
            if short_desc and text.strip():
                relevant_sentences = self.extract_relevant_sentences(short_desc, sent_tokenize(text))
                relevant_text = ' '.join(relevant_sentences)

            print(f"SHORT DESCRIPTION = {short_desc}")
            print(f"RELEVANT EXTRACT = {relevant_text}")

            # don't summarise if no relevant text is found: keep the combined text
            # so the ref is not silently dropped from the results
            if not relevant_text:
                self.summarised_docs[ref] = text
                print(f"{ref} not summarised")
                continue

            try:
                summary = self.summarizer(
                    relevant_text,
                    max_length=self.max_length,
                    min_length=self.min_length,
                    do_sample=self.do_sample,
                )
                self.summarised_docs[ref] = summary[0]['summary_text']
                print(f"{ref} summarised")
            except Exception as exc:
                # The ref is left out of the results when summarisation fails.
                print(f"Document for '{ref}' is most likely too long ({exc})")

        return self.summarised_docs

    def summary(self, ref):
        """ Print the stored summary for ref (KeyError if ref has not been processed) """
        print(self.summarised_docs[ref])

    def extract_relevant_sentences(self, short_desc, text_list, top_n=10):
        """ 
        Pick the sentences of text_list most similar to short_desc.

        short_desc (str): Query text; the TF-IDF vocabulary is fitted on it alone, so only
                          words that occur in it contribute to the similarity.
        text_list (list): Candidate sentences.
        top_n (int): Maximum number of sentences to return.

        Returns: list of up to top_n sentences, in their original order
        """
        if not short_desc or not text_list or top_n <= 0:
            return []

        vectorizer = TfidfVectorizer()
        try:
            short_desc_vectors = vectorizer.fit_transform([short_desc])
        except ValueError:
            # Raised when the description yields an empty vocabulary
            # (e.g. only stop words or single-character tokens).
            return []
        full_text_vectors = vectorizer.transform(text_list)

        cosine_similarities = cosine_similarity(short_desc_vectors, full_text_vectors)

        # argsort is ascending, so the tail holds the most similar sentences
        num_sentences_to_select = min(top_n, len(text_list))
        selected_indices = cosine_similarities.argsort()[0][-num_sentences_to_select:]

        # Restore document order so the summariser receives a coherent passage
        # rather than sentences shuffled by similarity score.
        return [text_list[i] for i in sorted(selected_indices)]

    def combine(self, keys):
        """ Concatenate all non-None field values of a document, in dict order """
        return "".join(value for value in keys.values() if value is not None)

# Read Pickles

In [None]:
import os
import pickle

def pickler(path):
    """
    Load every '.pickle' file found directly inside path.

    path (str): Directory to scan (not recursive).

    Returns: tuple (pickles_read, pickles_unread) where pickles_read maps each file's
             name without extension to its unpickled object, and pickles_unread lists
             the file names that could not be opened or unpickled.
    """
    pickles_read = {}
    pickles_unread = []

    # sorted() makes the result order reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(path)):
        if not file.endswith('.pickle'):
            continue

        file_path = os.path.join(path, file)
        ref = os.path.splitext(file)[0]
        try:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at directories whose contents are trusted.
            with open(file_path, "rb") as data:
                pickles_read[ref] = pickle.load(data)
        except Exception:
            # Unpickling can fail with many exception types (EOFError,
            # UnpicklingError, AttributeError, ...), so catch broadly, but not
            # KeyboardInterrupt/SystemExit as a bare except would.
            pickles_unread.append(file)

    return pickles_read, pickles_unread

# Summarise Documents

In [None]:
import random

# Load every pickle from the data directory; empty_pickles holds the file names
# that could not be read.
pickle_path = "C:/Users/Mitch/pickles/"
pickles, empty_pickles = pickler(pickle_path)

# Summarise a random sample of up to 10 documents. random.sample raises
# ValueError when asked for more items than exist, so cap the sample size.
sample_size = min(10, len(pickles))
random_pickle_keys = random.sample(list(pickles.keys()), sample_size)
random_pickles = {key: pickles[key] for key in random_pickle_keys}

summariser = Summariser(max_length=100, min_length=50, do_sample=False) # can add type='cnn' to change what model trained on
summarised_docs = summariser.summarise_docs(random_pickles)

for ref, summary in summarised_docs.items():
    print(f"Summary for {ref}:\n{summary}")
    print("<============================>")