In [None]:
#pip install -r requirements.txt

In [9]:
# What to summarize: set INPUT_FILE to a path, or leave it empty and fill in INPUT_TEXT.
INPUT_FILE = ""  # Insert file path here
INPUT_TEXT = """Insert text to summarize here."""
# Values substituted for the {style} and {trigger} placeholders of the prompt templates.
STYLE = ""
PROMPT_TRIGGER = ""
# Language the summary should be written in (e.g. Polish, Spanish, ...).
OUTPUT_LANGUAGE = "English"

# When True, the model and the chains are created with verbose output enabled.
VERBOSE = True

In [1]:
# Path to the GGUF model file loaded by llama.cpp.
MODEL_FILE = "./models/mistral-7b-openorca.Q5_K_M.gguf"

# Context window size passed to the model as n_ctx.
MODEL_CONTEXT_WINDOW = 8192

# Maximal length of the model's output, in tokens.
MAX_ANSWER_TOKENS = 2048

# Text splitter parameters, measured in characters (not tokens).
CHUNK_SIZE = 10_000
CHUNK_OVERLAP = 500

In [11]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    # Model weights and context size come from the model configuration constants.
    model_path=MODEL_FILE,
    n_ctx=MODEL_CONTEXT_WINDOW,
    # Upper bound on the length of a single answer, in tokens.
    max_tokens=MAX_ANSWER_TOKENS,
    # Zero temperature: don't be creative.
    temperature=0,
    verbose=VERBOSE,
    # The two settings below are meant for macOS on an M1 processor;
    # remove them when running on anything else.
    n_batch=512,
    n_gpu_layers=1,
)

AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [12]:
from langchain.document_loaders import TextLoader

def load_content():
    """Return the text to summarize.

    When INPUT_FILE is set, the file is read: a path ending in ".pdf"
    (matched case-insensitively) is loaded page by page with PyPDFLoader and
    the pages are joined with newlines; any other path is loaded with
    TextLoader and the content of its first document is returned.
    When INPUT_FILE is empty, INPUT_TEXT is returned unchanged.

    Returns:
        str: The content to summarize.
    """

    if not INPUT_FILE:
        return INPUT_TEXT

    if INPUT_FILE.lower().endswith(".pdf"):
        # Imported here because the notebook's import cell only brings in
        # TextLoader; without this the PDF branch fails with a NameError.
        from langchain.document_loaders import PyPDFLoader

        pages = PyPDFLoader(INPUT_FILE).load()
        print(f"PDF: loaded {len(pages)} pages")
        return "\n".join(page.page_content for page in pages)

    docs = TextLoader(INPUT_FILE).load()
    return docs[0].page_content

In [13]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Prompt used to produce the final summary. Placeholders: {style} (extra
# instructions), {content} (the text to summarize, wrapped in triple
# backquotes), {trigger} and {language} (form the closing cue line).
combine_prompt_template = """
Write a summary of the following text delimited by tripple backquotes.
{style}

```{content}```

{trigger} in {language}:
"""

# Prompt applied to each individual chunk in the map step of map-reduce.
# Placeholders: {text} (the chunk) and {language} (output language).
map_prompt_template = """
Write a concise summary of the following:
{text}

CONCISE SUMMARY in {language}:
"""

def summarize_base(llm, content):
    """Produce a summary with a single model call.

    The whole content is placed into combine_prompt_template, so it has to
    fit into the model's context window.

    Args:
        llm: Language model used to run the chain.
        content: Text to summarize.

    Returns:
        The output of the chain run.
    """
    # Bind everything except {content}, which is supplied when the chain runs.
    template = PromptTemplate.from_template(combine_prompt_template)
    bound_prompt = template.partial(
        style=STYLE,
        trigger=PROMPT_TRIGGER,
        language=OUTPUT_LANGUAGE,
    )

    return LLMChain(llm=llm, prompt=bound_prompt, verbose=VERBOSE).run(content)


def summarize_map_reduce(llm, content):
    """Summarize content potentially larger than the model's context window.

    The content is split into chunks of CHUNK_SIZE characters (overlapping by
    CHUNK_OVERLAP), each chunk is summarized with map_prompt_template, and the
    partial summaries are combined with combine_prompt_template.

    Args:
        llm: Language model used by the summarize chain.
        content: Text to summarize.

    Returns:
        The output of the map-reduce chain run.
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    split_docs = text_splitter.create_documents([content])
    print(f"Map-Reduce content splits ({len(split_docs)} splits): {[len(sd.page_content) for sd in split_docs]}")

    # Values that either template may reference besides its document variable.
    shared_values = {
        "style": STYLE,
        "trigger": PROMPT_TRIGGER,
        "language": OUTPUT_LANGUAGE,
    }

    def _build_prompt(template):
        """Create a prompt with every shared placeholder the template declares pre-filled."""
        prompt = PromptTemplate.from_template(template)
        # Bind only the placeholders that actually occur in the template: a
        # placeholder left unbound (e.g. {trigger} in a customized map
        # template) makes the chain fail on a missing input key.
        bound = {
            name: value
            for name, value in shared_values.items()
            if name in prompt.input_variables
        }
        return prompt.partial(**bound)

    map_prompt = _build_prompt(map_prompt_template)
    combine_prompt = _build_prompt(combine_prompt_template)

    chain = load_summarize_chain(
        llm=llm,
        chain_type="map_reduce",
        map_prompt=map_prompt,
        combine_prompt=combine_prompt,
        # The combine template names its document placeholder {content}.
        combine_document_variable_name="content",
        verbose=VERBOSE,
    )

    output = chain.run(split_docs)
    return output

In [41]:
%%time
# Task-specific replacement for the generic combine prompt: describes the
# per-student assignment data format before asking for a summary.
# Placeholders: {style}, {content}, {trigger}, {language}.
# The summarize_* functions read this global when they are called, so this
# definition is the one they use once this cell has run.
combine_prompt_template = """
The following text delimited by tripple backquotes is about group of students' performance in an assignment, separated by a line of dashes (----). 
Each student's data includes:
1. The student ID
2. The overall score for the assignment of that student
3. A series of questions, including scores and comments for each question.
Score: X / Y means X marks gained in a Y marks question. A question is answered correctly if X = Y. 
Write a summary of it.
{style}

```
{content}```

{trigger} in {language}:
"""

# Task-specific replacement for the generic map prompt (applied per chunk).
# Placeholders: {text}, {trigger}, {language} - note that, unlike the generic
# map prompt, this one also requires {trigger} to be bound.
map_prompt_template = """
The text is about the performance of a student on assignments in a course. Each assignment is separated by a line of dashes (----).
Each assignment's data includes:
1. The assignment name
2. The overall score for the assignment
3. A series of questions, including scores and comments for each question.
Score: X / Y means X marks gained in a Y marks question. A question is answered correctly if X = Y. 

Write a summary of the following text:
{text}

{trigger} in {language}:
"""

# Override the empty STYLE / PROMPT_TRIGGER defaults from the configuration
# cell with the instructions for the student-performance report.
STYLE="Return your response as report which covers the statistic of each question. Highlight any question that most students get wrong."
PROMPT_TRIGGER="REPORT"

def summarize(text):
    """Summarize ``text`` and return the summary with a "Summary:" heading.

    The text is stored in the global INPUT_TEXT and then read back through
    load_content(), so a non-empty INPUT_FILE takes precedence over ``text``.
    Content shorter than 75% of MODEL_CONTEXT_WINDOW (in tokens) is summarized
    in a single pass by summarize_base(); longer content goes through
    summarize_map_reduce().

    Args:
        text: Text to summarize.

    Returns:
        str: The summary, preceded by a "Summary:" line and followed by two
        newlines.
    """
    # load_content() reads the module-level INPUT_TEXT, so publish the
    # argument there before loading.
    global INPUT_TEXT
    INPUT_TEXT = text

    content = load_content()
    content_tokens = llm.get_num_tokens(content)
    print(f"Content length: {len(content)} chars, {content_tokens} tokens.")
    print("Content sample:\n" + content[:200] + "\n\n")

    # Keep part of the context window free for the model's output.
    base_threshold = 0.75 * MODEL_CONTEXT_WINDOW

    if content_tokens < base_threshold:
        print("Using summarizer: base")
        summary = summarize_base(llm, content)
    else:
        print("Using summarizer: map-reduce")
        summary = summarize_map_reduce(llm, content)

    # These figures describe the summary, not the input content, so label
    # them accordingly.
    print(f"Summary length: {len(summary)} chars, {llm.get_num_tokens(summary)} tokens.")

    result = "Summary:\n" + summary + "\n\n"
    print(result)
    return result