In [1]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this script relies on: the stopword list, the
# punkt tokenizer models and the perceptron POS tagger. The download order
# matches the original three calls.
for _resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

def generate_query(summary):
    """Build a keyword query string from *summary*.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens whose tag starts with ``NN`` or ``JJ`` (the noun
    and adjective tag families) are kept, in their original order, and
    joined with single spaces.

    Args:
        summary: Text to extract query terms from.

    Returns:
        The selected tokens joined by spaces; an empty string when no
        token qualifies.
    """
    tagged_tokens = pos_tag(word_tokenize(summary))

    keywords = []
    for token, tag in tagged_tokens:
        # A tuple argument lets startswith test both prefixes at once.
        if tag.startswith(('NN', 'JJ')):
            keywords.append(token)

    return " ".join(keywords)

def preprocess_text(text):
    """Normalize *text* one sentence at a time.

    The text is split into sentences with ``sent_tokenize``. Within each
    sentence, tokens that are not purely alphanumeric, or whose lowercase
    form is an English stopword, are dropped. The remaining tokens are
    lowercased and Porter-stemmed.

    Args:
        text: Raw document text.

    Returns:
        A list with one string per detected sentence, each being the
        surviving stems joined by single spaces. A sentence with no
        surviving tokens yields an empty string.
    """
    english_stopwords = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        stems = []
        for token in word_tokenize(sentence):
            lowered = token.lower()
            # Skip punctuation/mixed tokens and stopwords.
            if not token.isalnum() or lowered in english_stopwords:
                continue
            stems.append(stemmer.stem(lowered))
        cleaned_sentences.append(" ".join(stems))

    return cleaned_sentences

def generate_summary(document, target_length):
    """Return the leading preprocessed sentences that fit a word budget.

    *document* is passed through ``preprocess_text``; the resulting
    sentences are taken in order for as long as the running word count
    stays at or below *target_length*. Selection stops at the first
    sentence that would exceed the budget, so later (shorter) sentences
    are never considered.

    Args:
        document: Raw text to summarize.
        target_length: Maximum number of whitespace-separated words in
            the result.

    Returns:
        The selected preprocessed sentences joined by single spaces.
    """
    kept = []
    used_words = 0

    for candidate in preprocess_text(document):
        candidate_size = len(candidate.split())
        # Stop as soon as one sentence would overflow the budget.
        if used_words + candidate_size > target_length:
            break
        kept.append(candidate)
        used_words += candidate_size

    return " ".join(kept)


def hierarchical_summarization_with_query(doc1, doc2, max_length=128,
                                          output_path="final_summary.txt"):
    """Summarize *doc2* slice by slice and derive a query from the result.

    The word budget *max_length* is split between the two documents in
    proportion to their word counts. *doc2* is then consumed in word
    slices; each slice is summarized with ``generate_summary`` and the
    partial summaries are collated until doc2's share of the budget is
    reached or the document is exhausted. The collated text is summarized
    once more against the full budget, written to *output_path*, and
    passed to ``generate_query``.

    *doc1* only contributes its length to the budget split; its content
    is not summarized here.

    Args:
        doc1: First document text (used for the proportional budget).
        doc2: Second document text (the one that is summarized).
        max_length: Total word budget shared by both documents.
        output_path: File the final summary is written to (UTF-8).

    Returns:
        A ``(final_summary, generated_query)`` tuple of strings.
    """
    doc1_word_count = len(doc1.split())
    doc2_words = doc2.split()
    total_length = doc1_word_count + len(doc2_words)

    # Two empty documents would otherwise divide by zero; give doc1 no
    # share in that case so the rest of the pipeline runs on empty text.
    if total_length:
        target_length_doc1 = int(doc1_word_count / total_length * max_length)
    else:
        target_length_doc1 = 0
    target_length_doc2 = max_length - target_length_doc1

    partial_summaries = []
    collated_word_count = 0
    position = 0
    while (collated_word_count < target_length_doc2
           and position < len(doc2_words)):
        # The budget is measured in words, so slice by words as well;
        # slicing the raw string would count characters and cut tokens.
        budget = target_length_doc2 - collated_word_count
        slice_text = " ".join(doc2_words[position:position + budget])
        summary = generate_summary(slice_text, target_length_doc2)
        if summary:
            partial_summaries.append(summary)
            collated_word_count += len(summary.split())
        # budget >= 1 here, so the loop always advances.
        position += budget

    # Join with a space so the last word of one partial summary is not
    # fused with the first word of the next.
    collated_summary = " ".join(partial_summaries)

    final_summary = generate_summary(collated_summary, max_length)
    # Explicit encoding: the platform default may not be able to encode
    # non-ASCII tokens that survive preprocessing.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(final_summary)
    generated_query = generate_query(final_summary)
    return final_summary, generated_query


# Interactive entry point. The guard keeps the prompts from firing when
# this file is imported; when run as a script or notebook cell, __name__
# is "__main__" and the same module-level variables are defined.
if __name__ == "__main__":
    document1 = input("Enter your first document text: ")
    document2 = input("Enter your second document text: ")

    # Summarize document2 (budgeted against document1's length) and build
    # a keyword query from the resulting summary.
    result_summary, generated_query = hierarchical_summarization_with_query(document1, document2)

    print("\nFinal Summary saved to 'final_summary.txt'.\n", result_summary)
    print("\nGenerated Query:", generated_query)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Legion\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Legion\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Legion\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Enter your first document text: Large Language Models can be trained for a number of purposes. In this assignment, candidates are either required to (CASE 1) implement an algorithm to generate summarization of an input text following the style of another text given as input in the context window. The implementation is to be either in Python, while using NLTK, or in Java while using OpenNLP.  A text is given "as it is" when the length is below the context window limit (128 Mb or 4000 tokens, or other depending on the chosen LLM). When it exceeds the window, then the algorithm should provide a summary with an hierarchical approach:  The pipeline will be   * Measure the length of the two documents; * Compute the target lengths in a proportional way with respect to the length of the documents; * Slice the second document from start to a point within the context window; * Summarize the slice with no request for size of the target; * Repeat the previous two steps until the end of the documen