In [1]:
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM as Ollama
import tiktoken

# QUESTION_TEMPLATE_TXT = """Write a concise summary of the following:

# "{text}"

# CONCISE SUMMARY:"""

# REFINE_TEMPLATE_TXT = """Your job is to produce a final summary.
# We have provided an existing summary up to a certain point: {existing_answer}
# We have the opportunity to refine the existing summary (only if needed) with some more context below.
# ------------
# {text}
# ------------
# Given the new context, refine the original summary.
# If the context isn't useful, return the original summary."""

# MAP_TEMPLATE_TXT = """Write a detail summary of this text section in bullet points. Answer just the bullet points, no other text.
# Text:
# {text}

# SUMMARY:"""
    
# COMBINE_TEMPLATE_TXT = """Combine these summaries into a final summary in bullet points. Answer just the bullet points, no other text.
# Text:
# {text}

# FINAL SUMMARY:"""


# Load a sample text

In [2]:
# Read the whole sample transcript into memory.
# The encoding is given explicitly so the text decodes the same way on every
# platform; without it open() falls back to the locale's preferred encoding.
with open("sample-text.txt", "r", encoding="utf-8") as file:
    transcript = file.read()

In [3]:
from pprint import pprint

In [4]:
# Preview the first 500 characters of the transcript as a quick sanity check.
pprint(transcript[:500])

("think it's possible that physics has exploits and we should be trying to "
 'find them arranging some kind of a crazy quantum mechanical system that '
 'somehow gives you buffer overflow somehow gives you a rounding error in the '
 'floating Point synthetic intelligences are kind of like the next stage of '
 "development and I don't know where it leads to like at some point I suspect "
 'the universe is some kind of a puzzle these synthetic AIS will uncover that '
 'puzzle and solve it the following is a conversation w')


# Helper functions

In [5]:
def get_text_splitter(chunk_size: int, overlap_size: int):
    """Build a recursive character splitter configured through tiktoken.

    Args:
        chunk_size: Forwarded as ``chunk_size`` to the splitter factory.
        overlap_size: Forwarded as ``chunk_overlap`` to the splitter factory.

    Returns:
        The object created by
        ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=overlap_size,
    )
    return splitter

def convert_text_to_tokens(text, encoder="gpt-3.5-turbo"):
    """Encode ``text`` with the tiktoken encoding that belongs to a model.

    Args:
        text: String to tokenize.
        encoder: Model name handed to ``tiktoken.encoding_for_model``
            (despite the parameter name, this is a model name).

    Returns:
        Whatever ``encode(text)`` returns; callers in this notebook take
        ``len()`` of it to count tokens.
    """
    return tiktoken.encoding_for_model(encoder).encode(text)



# Prepare docs 

We only use the first 50,000 characters of the `transcript` variable, so the same chunk size can be used for all docs:
`chunk_size = 2500` (measured in tokens)

In [6]:
transcript_up_to = 50000 # number of leading characters of the transcript to keep

chunk_size = 2500 # this is in tokens
overlap_size = 0 # this is in tokens
# model = "llama3.2"
# base_url = "http://localhost:11434"
# temperature = 0.5
# # max_context = 1024
# num_ctx = 1024*2
# num_predict = 128
# map_prompt_txt = MAP_TEMPLATE_TXT
# combine_prompt_text = COMBINE_TEMPLATE_TXT
# question_prompt_txt = QUESTION_TEMPLATE_TXT
# refine_prompt_txt = REFINE_TEMPLATE_TXT 
# chain_type = "map_reduce"

### CODE

# Wrap the truncated transcript in a single Document so the splitter can consume it.
docs = [Document(page_content=transcript[:transcript_up_to])]

# Split the document into chunks using the sizes configured above.
text_splitter = get_text_splitter(chunk_size=chunk_size, overlap_size=overlap_size)
split_docs = text_splitter.split_documents(docs)
n_docs = len(split_docs)
print("number of docs: ", n_docs)

# Size of every chunk in characters.
char_counts = [len(chunk.page_content) for chunk in split_docs]
print(f"Characters counts per document: {char_counts}")
print(f"Total Characters across {n_docs} documents: {sum(char_counts)}")

# Size of every chunk in tokens.
# The gpt-3.5-turbo tokenizer is used as the closest approximation for llama models.
token_counts = [len(convert_text_to_tokens(chunk.page_content)) for chunk in split_docs]
print(f"Token counts per document: {token_counts}")
print(f"Total tokens across {n_docs} documents: {sum(token_counts)}")

# Size of every chunk in whitespace-separated words.
word_counts = [len(chunk.page_content.split()) for chunk in split_docs]
print(f"Word counts per document: {word_counts}")
print(f"Total words across {n_docs} documents: {sum(word_counts)}")


number of docs:  4
Characters counts per document: [12718, 12370, 12612, 12297]
Total Characters across 4 documents: 49997
Token counts per document: [2478, 2505, 2488, 2395]
Total tokens across 4 documents: 9866
Word counts per document: [2313, 2332, 2314, 2269]
Total words across 4 documents: 9228


# LLM & Functions

In [10]:
# Connection and sampling settings for the Ollama server.
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5

# Context length in tokens (Ollama default: 2048).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_ctx = 1024

# Maximum number of tokens to generate (default: 128, -1 = infinite generation, -2 = fill context).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm
num_predict = 128

llm = Ollama(
    model=model, base_url=base_url, temperature=temperature,
    num_ctx=num_ctx, num_predict=num_predict,
    format='', verbose=True,
)



# if chain_type == "refine":
#     question_prompt = PromptTemplate(
#         template=question_prompt_txt,
#         input_variables=["text"]
#     )
#     refine_prompt = PromptTemplate(
#         template=refine_prompt_txt,
#         input_variables=["existing_answer", "text"]
#     )
#     chain = load_summarize_chain(
#         llm, 
#         chain_type="refine",
#         question_prompt=question_prompt,
#         refine_prompt=refine_prompt,
#         verbose=True
#     )
# if chain_type == "map_reduce":
#     map_prompt = PromptTemplate(
#         template=map_prompt_txt,
#         input_variables=["text"]
#     )

#     combine_prompt = PromptTemplate(
#         template=combine_prompt_text,
#         input_variables=["text"]
#     )
    
#     chain = load_summarize_chain(
#         llm, 
#         chain_type=chain_type,
#         map_prompt=map_prompt,
#         combine_prompt=combine_prompt,
#         verbose=1
#     )

# output = chain.invoke(split_docs)

In [7]:
# Prompt template that asks for a bullet-point summary of one text section.
MAP_TEMPLATE_TXT = """Write a detail summary of this text section in bullet points. Answer just the bullet points, no other text.
Text:
{text}

SUMMARY:"""

# Fill the template with the first chunk to obtain the exact prompt sent to the model.
full_prompt = PromptTemplate(template=MAP_TEMPLATE_TXT, input_variables=["text"]).format(
    text=split_docs[0].page_content
)

# Report the prompt size in tokens, then show the prompt itself.
print(f"number of tokens: {len(convert_text_to_tokens(full_prompt))}\n")
pprint(full_prompt)

number of tokens: 2506

('Write a detail summary of this text section in bullet points. Answer just '
 'the bullet points, no other text.\n'
 'Text:\n'
 "think it's possible that physics has exploits and we should be trying to "
 'find them arranging some kind of a crazy quantum mechanical system that '
 'somehow gives you buffer overflow somehow gives you a rounding error in the '
 'floating Point synthetic intelligences are kind of like the next stage of '
 "development and I don't know where it leads to like at some point I suspect "
 'the universe is some kind of a puzzle these synthetic AIS will uncover that '
 'puzzle and solve it the following is a conversation with Andre capothy '
 'previously the director of AI at Tesla and before that at open Ai and '
 'Stanford he is one of the greatest scientists engineers and Educators in the '
 'history of artificial intelligence this is the Lex Friedman podcast to '
 "support it please check out our sponsors and now dear friends here's A

# Case 1: Overflowing the context

**Observe that Ollama emits a warning saying "truncating input prompt".**

In [None]:
# Send the prompt to the currently configured `llm` and pretty-print the reply.
output = llm.invoke(full_prompt)
pprint(output)

# Case 2: Within the context length

We will increase the context length of Ollama with `num_ctx` to 3072 (1024*3).

In [None]:
# Same settings as before, except for a larger context length.
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5

# Context length in tokens (Ollama default: 2048).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_ctx = 1024*3

# Maximum number of tokens to generate (default: 128, -1 = infinite generation, -2 = fill context).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm
num_predict = 128

llm = Ollama(
    model=model, base_url=base_url, temperature=temperature,
    num_ctx=num_ctx, num_predict=num_predict,
    format='', verbose=True,
)

# Run the prompt with the enlarged context and show the answer.
output = llm.invoke(full_prompt)
pprint(output)

In [None]:
# Send the same prompt five times and keep every answer
# (presumably to compare run-to-run variation).
outputs = []
for i in range(5):
    print(f"\n======================= Output {i+1} ==========================")
    _output = llm.invoke(full_prompt)
    pprint(_output)
    outputs.append(_output)
    

# Case 3: Increase prediction length
To increase the chance of the model completing its answer, let's increase the prediction length by setting `num_predict` to 1024.

In [12]:
# Same settings as Case 2, except for a larger generation limit.
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5

# Context length in tokens (Ollama default: 2048).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_ctx = 1024*3

# Maximum number of tokens to generate (default: 128, -1 = infinite generation, -2 = fill context).
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm
num_predict = 1024

llm = Ollama(
    model=model, base_url=base_url, temperature=temperature,
    num_ctx=num_ctx, num_predict=num_predict,
    format='', verbose=True,
)

In [None]:
# Send the prompt to the currently configured `llm` and pretty-print the reply.
# `output` is reused later by the side-by-side comparison cell.
output = llm.invoke(full_prompt)
pprint(output)

# Compare with GPT-4o-mini

In [10]:
# Load the OpenAI API key from a local file.
# strip() removes surrounding whitespace such as the trailing newline that
# editors usually append; left in place it becomes part of the credential
# and can make the request fail.
with open("secret/openai_api_key.txt", "r") as file:
    openai_api_key = file.read().strip()

# openai.api_key = openai_api_key

from langchain_openai import ChatOpenAI

# Chat model used as the reference for the comparison.
chat_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    api_key=openai_api_key,
)

# Send the same prompt that was sent to Ollama.
output_gpt = chat_llm.invoke(full_prompt)

In [None]:
# Show only the text content of the GPT-4o-mini reply.
pprint(output_gpt.content)

In [None]:
from IPython.display import display, HTML
import markdown

# Render both answers from Markdown to HTML, each under its own heading.
# The "•" characters in the Ollama answer are replaced with "-" first.
html_text1 = markdown.markdown("".join(("# Ollama \n", output.replace("•", "-"))))
html_text2 = markdown.markdown("".join(("# GPT-4o-mini \n", output_gpt.content)))

# Two scrollable boxes placed next to each other with flexbox.
html_content = f"""
<div style="display: flex; justify-content: space-between; width: 100%;">
    <div style="width: 48%; padding: 10px; border: 1px solid black; overflow-y: auto; height: 400px;">
        {html_text1}
    </div>
    <div style="width: 48%; padding: 10px; border: 1px solid black; overflow-y: auto; height: 400px;">
        {html_text2}
    </div>
</div>
"""

# Show the comparison in the notebook output.
display(HTML(html_content))


# "The longer, the better" is not true in this case

I tried using approximately the maximum context length, but the result is suboptimal considering both the text output and the time consumed.

- Relatively the same output quality.
- It took 10 minutes (on M4Pro).

<img src="assets/ollama-max-context.png" width="600"/>

# Test variations of `num_predict`

In [51]:
# Generation limits to try (-1 = infinite generation, -2 = fill context).
num_predicts = [128, 256, 512, 1024, -1, -2]
outputs = []

# Build a fresh OllamaLLM per limit and collect the answer to the same prompt.
for i, num_predict in enumerate(num_predicts):
    llm = Ollama(
        model=model, base_url=base_url,
        temperature=0.5, num_ctx=1024*3,
        num_predict=num_predict,
    )
    output = llm.invoke(full_prompt)
    outputs.append(output)

In [19]:
from langchain_ollama import ChatOllama

# Generation limits to try with the chat model (-1 = infinite generation, -2 = fill context).
num_predicts = [128, 256, 512, -1, -2]
outputs = []

# Build a fresh ChatOllama per limit, send the prompt as a single user message,
# and print each reply's text as soon as it arrives.
for i, num_predict in enumerate(num_predicts):
    llm = ChatOllama(
        model=model, base_url=base_url,
        temperature=0.9, num_ctx=1024*3,
        num_predict=num_predict,
    )
    messages = [{"role": "user", "content": full_prompt}]
    output = llm.invoke(messages)
    outputs.append(output)
    print(f"\n+++++ OUTPUT {i} +++++\n")
    pprint(output.content)


+++++ OUTPUT 0 +++++

('• A neural network is a mathematical abstraction of the brain, originally '
 'developed at the end of the day as a simple mathematical expression with '
 'knobs that can be adjusted to make it learn and perform tasks.\n'
 '\n'
 '• Despite being mathematically simple, neural networks exhibit surprising '
 'emergent behavior, such as next-word prediction in massive data sets.\n'
 '\n'
 '• The optimization process that gives rise to neural networks is different '
 "from the brain's optimization process, which involved multi-agent self-play "
 'and evolution over millions of years.\n'
 '\n'
 '• Biological neural networks are still not fully understood, but their '
 'optimization process is thought to be constrained by survival and '
 'reproduction needs.\n'
 '\n'
 '• The origin of life itself')

+++++ OUTPUT 1 +++++

('• Andre Capathi explains that neural networks are mathematical abstractions '
 'of the brain, originally developed as a simple mathematical expressi

In [26]:
import pandas as pd

# One row per run: its position, the num_predict limit that was requested, and
# the token count of the reply (counted with convert_text_to_tokens).
df = pd.DataFrame({
    'Output Index': range(len(outputs)),
    'Num Predict': num_predicts,
    'Token Count': [len(convert_text_to_tokens(reply.content)) for reply in outputs],
})

# Print the table as plain text.
print(df)

   Output Index  Num Predict  Token Count
0             0          128          128
1             1          256          209
2             2          512          226
3             3           -1          208
4             4           -2          242
