Notebook for generating summaries

In [None]:
from ipynb.fs.defs.a_preprocess_data import get_documents_from_files 
from ipynb.fs.defs.a_preprocess_data import split_docs 
from ipynb.fs.defs.a_preprocess_data import clean_and_process_chunked_documents 

In [1]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI

def generate_summary(doc: str):
    """
    Produces a one-sentence summary of the given text with GPT 3.5 Turbo.

    A system message and a human message are combined into a chat prompt, the
    text in `doc` is substituted for the {context} placeholder of the human
    message, and the content of the model's reply is returned.
    """
    # Model is created with temperature 0.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo-0613", temperature=0)

    system_template = """You are a helpful assistant that generates a one sentence summary of for the given context."""
    human_template = """Generate a summary related to this context. Output exactly one sentence that is a maximum of 30 words. \n \n Context: {context}"""

    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ]
    )

    # Fill in the placeholder and turn the prompt into the message list the model expects.
    messages = prompt.format_prompt(context=doc).to_messages()
    reply = chat_model(messages)

    return reply.content

In [35]:
from typing import List
import json

def save_summaries(summaries: List[str], path: str):
    """
    Writes the summaries as a JSON array to
    ./../../evaluationInput/retrieval_eval/<path>.json.

    The outcome is reported on stdout. Any exception raised while opening or
    writing the file is caught and printed rather than propagated, so the
    function always returns None.
    """
    file_path = f"./../../evaluationInput/retrieval_eval/{path}.json"

    try:
        # Mode 'w' overwrites whatever the file contained before.
        with open(file_path, mode='w') as target:
            json.dump(summaries, target)
        print(f"Array saved successfully to {file_path}")
    except Exception as e:
        print(f"Error saving the array to {file_path}: {e}")
In [82]:
def create_summaries_of_documents(path: str, file_type, all_docs, chunk_size, chunk_overlap):
    """
    Uses the above methods to generate summaries for all specified document types and chunk sizes and overlaps.

    The documents are loaded, split into chunks and cleaned with the
    preprocessing helpers, and the text of each chunk is summarised with
    generate_summary. The summaries already stored in
    ./../../evaluationInput/retrieval_eval/<path>.json are loaded first and
    generation continues with the first chunk that has no stored summary yet,
    so an interrupted run can be resumed without summarising chunks twice.

    Args:
        path: Name (without the .json extension) of the summaries file. The
            file must already exist and contain a JSON array (e.g. []).
        file_type: Passed through to get_documents_from_files.
        all_docs: Passed through to get_documents_from_files.
        chunk_size: Passed to split_docs as chunk_size.
        chunk_overlap: Passed to split_docs as chunk_overlap.

    Returns:
        None. If the summaries file cannot be read or parsed, the error is
        printed and nothing is generated or written.
    """
    documents = get_documents_from_files(file_type, all_docs)
    chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents, False)

    chunk_texts = [doc.page_content for doc in chunked_cleaned_documents]

    try:
        with open(f"./../../evaluationInput/retrieval_eval/{path}.json", 'r') as json_file:
            summaries = json.load(json_file)
    except Exception as e:
        # Stop here so unreadable previous results are never overwritten.
        print(f"Error loading the array from {path}: {e}")
        return

    # Summaries are stored in chunk order, so the number already saved is the
    # index of the first chunk that still needs one. Restarting at 0 would
    # append duplicates and misalign summaries with their chunks.
    already_done = len(summaries)

    for index, doc in enumerate(chunk_texts[already_done:], start=already_done):
        summaries.append(generate_summary(doc))

        # Periodic checkpoint so a crash loses at most ~100 summaries.
        if index % 100 == 0:
            save_summaries(summaries, path)

    save_summaries(summaries, path)

In [None]:
# Generate summaries for file type "All" using chunks of 1536 with an overlap of 264.
create_summaries_of_documents(
    path="summaries_1536_264",
    file_type="All",
    all_docs=True,
    chunk_size=1536,
    chunk_overlap=264,
)