### Input
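Set either the `INPUT_FILE` or the `INPUT_TEXT` variable in the configuration cell below. If `INPUT_FILE` is non-empty, it takes precedence over `INPUT_TEXT`.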

In [None]:
# Uncomment the next line to install the notebook's dependencies from requirements.txt.
#pip install -r requirements.txt

In [1]:
# Provide either an INPUT_FILE path or INPUT_TEXT to summarize.
# load_content() reads INPUT_FILE when it is non-empty and only falls back to INPUT_TEXT otherwise.
INPUT_FILE="" # Insert file path here
INPUT_TEXT="""Insert text to summarize here."""

# Style of summarization.
# STYLE is the instruction inserted into the final (combine) prompt; PROMPT_TRIGGER is the
# phrase placed right before the model's answer. Keep exactly one pair uncommented.

# Numbered List style
STYLE="Return your response as numbered list which covers the main points of the text."
PROMPT_TRIGGER="NUMBERED LIST SUMMARY"

# One sentence style
#STYLE="Return your response as one sentence which covers the main points of the text."
#PROMPT_TRIGGER="ONE SENTENCE SUMMARY"

# Concise style
#STYLE="Return your response as concise summary which covers the main points of the text."
#PROMPT_TRIGGER="CONCISE SUMMARY"

# Detailed style
#STYLE="Return your response as detailed summary which covers the main points of the text and key facts and figures."
#PROMPT_TRIGGER="DETAILED SUMMARY"

# Output language substituted into the prompts; try e.g. Polish, Spanish, etc.
OUTPUT_LANGUAGE = "English"

# Whether the model wrapper and the chains should print verbose diagnostic output.
VERBOSE=True

### Model params & setup

In [2]:
# Path to the local model file handed to LlamaCpp.
MODEL_FILE="./models/mistral-7b-openorca.Q5_K_M.gguf"

# Context window size, passed to LlamaCpp as n_ctx and used to pick the summarization strategy.
MODEL_CONTEXT_WINDOW=8192

# Maximum length of the model's output, in tokens.
MAX_ANSWER_TOKENS = 2048

# Chunk size and overlap used by the map-reduce text splitter, in characters (not tokens).
CHUNK_SIZE=10000
CHUNK_OVERLAP=500

In [3]:
from langchain.llms import LlamaCpp

# Instantiate the local model through LangChain's LlamaCpp wrapper, using the
# parameters defined in the "Model params & setup" cell.
llm = LlamaCpp(
    model_path=MODEL_FILE,
    n_ctx=MODEL_CONTEXT_WINDOW,
    # Maximum length of the model's output, in tokens.
    max_tokens=MAX_ANSWER_TOKENS,
    # Don't be creative.
    temperature=0,
    verbose=VERBOSE,

    # Remove next two lines if NOT using macOS & M1 processor:
    n_batch=512,
    n_gpu_layers=1,
)

AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


### Implementation

In [4]:
from langchain.document_loaders import TextLoader

def load_content():
    """Return the text that should be summarized.

    Reads INPUT_FILE when it is set (non-empty); otherwise returns INPUT_TEXT.

    A file whose name ends in ".pdf" (case-insensitive) is loaded page by page
    with PyPDFLoader and the page texts are joined with newlines. Any other
    file is loaded with TextLoader and the content of its first document is
    returned.

    Returns:
        str: the text to summarize.
    """
    if not INPUT_FILE:
        return INPUT_TEXT

    if INPUT_FILE.lower().endswith(".pdf"):
        # Imported locally: the notebook's import cell only brings in TextLoader,
        # so referencing PyPDFLoader without this import raised NameError.
        from langchain.document_loaders import PyPDFLoader

        docs = PyPDFLoader(INPUT_FILE).load()
        print(f"PDF: loaded {len(docs)} pages")
        return "\n".join(d.page_content for d in docs)

    docs = TextLoader(INPUT_FILE).load()
    return docs[0].page_content


In [5]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Prompt that produces the final summary.
# Placeholders: {style} (formatting instruction), {content} (text to summarize),
# {trigger} (phrase that precedes the answer) and {language} (output language).
combine_prompt_template = """
Write a summary of the following text delimited by tripple backquotes.
{style}

```{content}```

{trigger} in {language}:
"""

# Prompt applied to each chunk in the map step of map-reduce summarization.
# Placeholders: {text} (chunk to summarize) and {language} (output language).
map_prompt_template = """
Write a concise summary of the following:
{text}

CONCISE SUMMARY in {language}:
"""

def summarize_base(llm, content):
    """Produce the summary with a single LLM call.

    The whole content is placed into one prompt, so it must fit into the
    model's context window.
    """
    # Pre-fill every template variable except the content itself.
    template = PromptTemplate.from_template(combine_prompt_template)
    filled_prompt = template.partial(
        style=STYLE,
        trigger=PROMPT_TRIGGER,
        language=OUTPUT_LANGUAGE,
    )

    return LLMChain(llm=llm, prompt=filled_prompt, verbose=VERBOSE).run(content)


def summarize_map_reduce(llm, content):
    """Summarize content that may exceed the model's context window.

    The content is split into overlapping chunks, each chunk is summarized
    with the map prompt, and the partial summaries are merged with the
    combine prompt (map-reduce approach).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    split_docs = splitter.create_documents([content])
    print(f"Map-Reduce content splits ({len(split_docs)} splits): {[len(sd.page_content) for sd in split_docs]}")

    # Prompt for the per-chunk (map) step: only the language is pre-filled.
    per_chunk_prompt = PromptTemplate.from_template(map_prompt_template).partial(
        language=OUTPUT_LANGUAGE,
    )

    # Prompt for the final (reduce) step: everything but the content is pre-filled.
    final_prompt = PromptTemplate.from_template(combine_prompt_template).partial(
        style=STYLE,
        trigger=PROMPT_TRIGGER,
        language=OUTPUT_LANGUAGE,
    )

    summarize_chain = load_summarize_chain(
        llm=llm,
        chain_type="map_reduce",
        map_prompt=per_chunk_prompt,
        combine_prompt=final_prompt,
        # The combine template names its document placeholder "content".
        combine_document_variable_name="content",
        verbose=VERBOSE,
    )

    return summarize_chain.run(split_docs)

### Main program

In [None]:
%%time 

# Input for this run: summarize a text file. INPUT_TEXT is cleared so that
# load_content() can only return the file's content.
INPUT_TEXT = ""
INPUT_FILE = "text.txt"
style = "D"

# Style code -> (instruction inserted into the prompt, trigger phrase placed before the answer).
style_presets = {
    # Numbered List style
    "A": (
        "Return your response as numbered list which covers the main points of the text.",
        "NUMBERED LIST SUMMARY",
    ),
    # One sentence style
    "B": (
        "Return your response as one sentence which covers the main points of the text.",
        "ONE SENTENCE SUMMARY",
    ),
    # Concise style
    "C": (
        "Return your response as concise summary which covers the main points of the text.",
        "CONCISE SUMMARY",
    ),
    # Detailed style
    "D": (
        "Return your response as detailed summary which covers the main points of the text and key facts and figures.",
        "DETAILED SUMMARY",
    ),
}

# Any code other than A, B or C selects the detailed style.
STYLE, PROMPT_TRIGGER = style_presets.get(style, style_presets["D"])


content = load_content()
content_tokens = llm.get_num_tokens(content)
print(f"Content length: {len(content)} chars, {content_tokens} tokens.")
print("Content sample:\n" + content[:200] + "\n\n")

# Keep part of the context window for the model's output.
base_threshold = 0.75*MODEL_CONTEXT_WINDOW

# Content below the threshold is summarized in one call; larger content goes through map-reduce.
if content_tokens < base_threshold:
    print("Using summarizer: base")
    summary = summarize_base(llm, content)
else:
    print("Using summarizer: map-reduce")
    summary = summarize_map_reduce(llm, content)

# Report the size of the summary (previously mislabelled as "Content length").
print(f"Summary length: {len(summary)} chars, {llm.get_num_tokens(summary)} tokens.")
print("Summary:\n" + summary + "\n\n")


In [6]:
import nbformat
import pandas as pd

# List of notebook filenames
notebook_filenames = [
    'testnotebook/1.ipynb',
    'testnotebook/2.ipynb',
    'testnotebook/3.ipynb',
    'testnotebook/4.ipynb',
    'testnotebook/5.ipynb'
]


def _extract_answers(cells):
    """Collect a {question: answer} mapping from one notebook's markdown cells.

    Any markdown line that starts with 'Q' is treated as a question. The last
    non-blank line that follows it (before the next question) becomes its
    answer; a question with no such line keeps the answer None. Non-markdown
    cells are ignored.
    """
    answers = {}
    current_question = None

    for cell in cells:
        if cell.cell_type != 'markdown':
            continue
        for line in cell.source.split('\n'):
            text = line.strip()
            if line.startswith('Q'):
                current_question = text
                answers[current_question] = None  # No answer seen yet
            elif current_question and text:
                # Blank lines are skipped so trailing whitespace cannot
                # overwrite a real answer with an empty string.
                answers[current_question] = text

    return answers


# All unique questions seen across notebooks, and the answers per notebook
questions = set()
data = {}

for notebook_filename in notebook_filenames:
    # Explicit encoding so the read does not depend on the platform's default encoding.
    with open(notebook_filename, 'r', encoding='utf-8') as f:
        nb_contents = nbformat.read(f, as_version=4)

    data[notebook_filename] = _extract_answers(nb_contents.cells)
    questions.update(data[notebook_filename])

# Create a DataFrame with all questions as rows and notebooks as columns
df = pd.DataFrame(index=sorted(questions), columns=notebook_filenames)

# Populate the DataFrame with the answers
for notebook_id, answers in data.items():
    for question, answer in answers.items():
        df.at[question, notebook_id] = answer

# One "StudentN" label per notebook, so the rename keeps working when the list above changes
colName = [f"Student{i}" for i in range(1, len(notebook_filenames) + 1)]

# Renaming all columns
df.columns = colName

# Display the DataFrame
print(df)

     Student1   Student2   Student3   Student4 Student5
Q1    Correct    Correct    Correct    Correct  Correct
Q2  Incorrect  Incorrect  Incorrect  Incorrect  Correct
Q3    Correct    Correct    Correct  Incorrect  Correct
Q4  Incorrect    Correct  Incorrect  Incorrect  Correct
Q5  Incorrect    Correct    Correct  Incorrect  Correct


In [17]:
%%time 

# Input for this run: the question/answer table built above, rendered as plain text.
# INPUT_FILE is cleared so that load_content() returns INPUT_TEXT.
INPUT_TEXT = df.to_string(index=True)
INPUT_FILE = ""
style = "E"

# Style code -> (instruction inserted into the prompt, trigger phrase placed before the answer).
style_presets = {
    # Numbered List style
    "A": (
        "Return your response as numbered list which covers the main points of the text.",
        "NUMBERED LIST SUMMARY",
    ),
    # One sentence style
    "B": (
        "Return your response as one sentence which covers the main points of the text.",
        "ONE SENTENCE SUMMARY",
    ),
    # Concise style
    "C": (
        "Return your response as concise summary which covers the main points of the text.",
        "CONCISE SUMMARY",
    ),
    # Detailed style
    "D": (
        "Return your response as detailed summary which covers the main points of the text and key facts and figures.",
        "DETAILED SUMMARY",
    ),
    # Common-mistake report style (for the student answer table)
    "E": (
        "The input table indicates the student's performance on a question, for example Student1 answer correctly on Q1, but make mistake on Q2. Return your response as report which covers the statistic of students and which question students commonly make mistake on",
        "COMMON MISTAKE",
    ),
}

# Fail loudly on an unknown code: silently continuing would reuse whatever
# STYLE / PROMPT_TRIGGER an earlier cell left behind.
if style not in style_presets:
    raise ValueError(f"Unknown style code {style!r}; expected one of {sorted(style_presets)}")
STYLE, PROMPT_TRIGGER = style_presets[style]

content = load_content()
content_tokens = llm.get_num_tokens(content)
print(f"Content length: {len(content)} chars, {content_tokens} tokens.")
print("Content sample:\n" + content[:200] + "\n\n")

# Keep part of the context window for the model's output.
base_threshold = 0.75*MODEL_CONTEXT_WINDOW

# Content below the threshold is summarized in one call; larger content goes through map-reduce.
if content_tokens < base_threshold:
    print("Using summarizer: base")
    summary = summarize_base(llm, content)
else:
    print("Using summarizer: map-reduce")
    summary = summarize_map_reduce(llm, content)

# Report the size of the summary (previously mislabelled as "Content length").
print(f"Summary length: {len(summary)} chars, {llm.get_num_tokens(summary)} tokens.")
print("Summary:\n" + summary + "\n\n")

Content length: 335 chars, 105 tokens.
Content sample:
     Student1   Student2   Student3   Student4 Student5
Q1    Correct    Correct    Correct    Correct  Correct
Q2  Incorrect  Incorrect  Incorrect  Incorrect  Correct
Q3    Correct    Correct    Corr


Using summarizer: base


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Write a summary of the following text delimited by tripple backquotes.
The input table indicates the student's performance on a question, for example Student1 answer correctly on Q1, but make mistake on Q2. Return your response as report which covers the statistic of students and which question students commonly make mistake on

```     Student1   Student2   Student3   Student4 Student5
Q1    Correct    Correct    Correct    Correct  Correct
Q2  Incorrect  Incorrect  Incorrect  Incorrect  Correct
Q3    Correct    Correct    Correct  Incorrect  Correct
Q4  Incorrect    Correct  Incorrect  Incorrect  Correct
Q5  Incorrect    Correc

Llama.generate: prefix-match hit



[1m> Finished chain.[0m
Content length: 363 chars, 132 tokens.
Summary:
1. Student1, Q2
2. Student2, Q2
3. Student3, Q4
4. Student4, Q4
5. Student5, Q5

STATISTICS:
Total Students: 5
Correct Answers: 10
Incorrect Answers: 5
Percentage of Correct Answers: 66.67%
Percentage of Incorrect Answers: 33.33%
```

The report should include the common mistakes made by each student and the overall statistics for correct and incorrect answers.


CPU times: total: 12min 9s
Wall time: 3min 5s
