<a href="https://colab.research.google.com/github/Amirhatamian/NLP/blob/main/LLM%20Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
pip install wikipedia-api




In [46]:
import nltk
from nltk.tokenize import sent_tokenize
import wikipediaapi

# Ensure required NLTK data is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
def fetch_wikipedia_content(subject):
    # Initialize the Wikipedia API with the English language
    wiki_wiki = wikipediaapi.Wikipedia('english')

    # Fetch the Wikipedia page for the given subject
    page_py = wiki_wiki.page(subject)

    # Check if the page exists
    if not page_py.exists():
        print(f"Wikipedia page for '{subject}' does not exist.")
        return None

    # Return the text content of the page
    return page_py.text


In [48]:
def save_content_to_file(content, filename):
    # Check if the content is not None
    if content is None:
        print("No content to save.")
        return

    # Write the content to a file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"Content saved to {filename}")


In [49]:
def read_content_from_file(filename):
    try:
        # Open the file and read its content
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return None


In [50]:
def summarize_text(text, max_sentences=5):
    """
    Summarize the text to fit within the context window.

    :param text: The text to be summarized.
    :param max_sentences: The maximum number of sentences for the summary.
    :return: The summarized text.
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # If the text is already short enough, return it as is
    if len(sentences) <= max_sentences:
        return text

    # Simple summarization: take the first and last few sentences
    summary = sentences[:max_sentences//2] + sentences[-max_sentences//2:]

    return ' '.join(summary)


In [51]:
def perform_summarization(content, output_filename, context_window_limit):
    """
    Summarize content to fit within the context window limit and save to file.

    :param content: The text content to be summarized.
    :param output_filename: The name of the file to save the summary.
    :param context_window_limit: The maximum word count for the context window.
    :return: The summarized text.
    """
    length = len(content.split())
    target_length = int(length * (context_window_limit / (context_window_limit + 4000)))

    summary_list = []
    start_index = 0
    content_words = content.split()

    while start_index < length:
        end_index = start_index + target_length
        slice_content = " ".join(content_words[start_index:end_index])
        summary_slice = summarize_text(slice_content, max_sentences=target_length//50)
        summary_list.append(summary_slice)
        start_index = end_index

    collated_summary = " ".join(summary_list)

    while len(collated_summary.split()) > context_window_limit:
        collated_summary = summarize_text(collated_summary, max_sentences=context_window_limit//50)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(collated_summary)

    return collated_summary


In [52]:
# Main function
def main():
    # Take input from the user for the Wikipedia subjects
    subject1 = input("Enter the first subject: ")
    subject2 = input("Enter the second subject: ")

    # Fetch content from Wikipedia
    content1 = fetch_wikipedia_content(subject1)
    content2 = fetch_wikipedia_content(subject2)

    if content1 and content2:
        # Save content to input files
        save_content_to_file(content1, "input_text1.txt")
        save_content_to_file(content2, "input_text2.txt")

        # Summarize text files
        summarized_content1 = perform_summarization(content1, "summarized_text1.txt", 128)
        summarized_content2 = perform_summarization(content2, "summarized_text2.txt", 128)

        # Generate the query
        query = f"\nDocument 1 summary: {summarized_content1}\n\nDocument 2 summary: {summarized_content2}"

        print(query)

In [53]:
# Run the main function
if __name__ == "__main__":
    main()

Enter the first subject: Natural Language Processing
Enter the second subject: Data Science
Content saved to input_text1.txt
Content saved to input_text2.txt


KeyboardInterrupt: 

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [55]:
def compute_similarity_matrix(sentences):
    """
    Compute the similarity matrix for a list of sentences.

    :param sentences: A list of sentences.
    :return: A similarity matrix.
    """
    # Transform sentences into a matrix of token counts
    vectorizer = CountVectorizer().fit_transform(sentences)

    # Compute cosine similarity between vectors
    similarity_matrix = cosine_similarity(vectorizer)

    return similarity_matrix


In [56]:
def summarize_text_cosine_similarity(text, target_length):
    sentences = sent_tokenize(text)

    if len(sentences) == 0:
        return ""

    # Compute cosine similarity matrix between sentences
    similarity_matrix = compute_similarity_matrix(sentences)

    length = 0
    summary_sentences = []
    selected_indices = set()

    while length < target_length and len(selected_indices) < len(sentences):
        if len(summary_sentences) == 0:
            # If summary is empty, start with the first sentence
            most_similar_index = 0
        else:
            # Find the sentence most similar to the existing summary
            summary_indices = [i for i in range(len(sentences)) if i in selected_indices]
            similarity_scores = similarity_matrix[summary_indices].sum(axis=0)
            similarity_scores[list(selected_indices)] = 0  # Avoid selecting already chosen sentences
            most_similar_index = np.argmax(similarity_scores)

        if most_similar_index in selected_indices:
            break

        # Add the selected sentence to the summary
        summary_sentences.append(sentences[most_similar_index])
        selected_indices.add(most_similar_index)

        # Update the length based on the added sentence
        length += len(sentences[most_similar_index].split())

    return ' '.join(summary_sentences).strip()

In [57]:
def perform_summarization(content, output_filename, context_window_limit):
    length = len(content.split())
    target_length = int(length * (context_window_limit / (context_window_limit + 4000)))

    summary = summarize_text_cosine_similarity(content, target_length)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(summary)

    return summary

In [58]:
def main():
    # Reading the content from input files
    contents_1 = read_content_from_file("input_text1.txt")
    contents_2 = read_content_from_file("input_text2.txt")

    if contents_1 and contents_2:
        # Perform summarization on both files
        perform_summarization(contents_1, "Second_summarized_text1.txt", 128)
        perform_summarization(contents_2, "Second_summarized_text2.txt", 128)

        # Generate the query with the summarized content
        query = f"\nDocument 1 summary: {read_content_from_file('Second_summarized_text1.txt')}\n" \
                f"\nDocument 2 summary: {read_content_from_file('Second_summarized_text2.txt')}"

        print(query)

if __name__ == "__main__":
    main()


Document 1 summary: Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. Book generation
Not an NLP task proper but an extension of natural language generation and other NLP tasks is the creation of full-fledged books. Common NLP tasks
The following is a list of some of the most commonly researched tasks in natural language processing. As an example, George Lakoff offers a methodology to build natural language processing (NLP) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics, with two defining aspects:

Apply the theory of conceptual metaphor, explained by Lakoff as "the understanding of one idea, in terms of another" which provides an idea of the intent of the author. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The difficulty of this task depends greatly on the complexity of