In [1]:
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaLLM as Ollama
import tiktoken

# Load a sample text

In [2]:
with open("sample-text.txt", "r") as file:
    transcript = file.read()

In [3]:
from pprint import pprint

In [4]:
pprint(transcript[:500])

("think it's possible that physics has exploits and we should be trying to "
 'find them arranging some kind of a crazy quantum mechanical system that '
 'somehow gives you buffer overflow somehow gives you a rounding error in the '
 'floating Point synthetic intelligences are kind of like the next stage of '
 "development and I don't know where it leads to like at some point I suspect "
 'the universe is some kind of a puzzle these synthetic AIS will uncover that '
 'puzzle and solve it the following is a conversation w')


# Helper functions

In [5]:
def get_text_splitter(chunk_size: int, overlap_size: int):
    return RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=overlap_size)

def convert_text_to_tokens(text, encoder="gpt-3.5-turbo"):
    enc = tiktoken.encoding_for_model(encoder)
    return enc.encode(text)



# Prepare docs 

We only used 50000 letters from the `transcript` variable, so we can use the same chunk size for all docs
chunk_size = 1000

In [6]:
transcript_up_to = 50000

chunk_size = 2500 # this is in tokens
overlap_size = 0 # this is in tokens

### CODE

docs = [Document(
    page_content=transcript[:transcript_up_to],
    # metadata={"source": url}
)]

text_splitter = get_text_splitter(chunk_size=chunk_size, overlap_size=overlap_size)
split_docs = text_splitter.split_documents(docs)

n_docs = len(split_docs)

print("number of docs: ", n_docs)

# Count characters in each document
char_counts = [len(doc.page_content) for doc in split_docs]
print(f"Characters counts per document: {char_counts}")
print(f"Total Characters across {n_docs} documents: {sum(char_counts)}")
# split_docs
  # Using gpt-3.5-turbo tokenizer as closest approximation for llama models
token_counts = [len(convert_text_to_tokens(doc.page_content)) for doc in split_docs]
print(f"Token counts per document: {token_counts}")
print(f"Total tokens across {n_docs} documents: {sum(token_counts)}")
# Count words in each document
word_counts = [len(doc.page_content.split()) for doc in split_docs]
print(f"Word counts per document: {word_counts}")
print(f"Total words across {n_docs} documents: {sum(word_counts)}")


number of docs:  4
Characters counts per document: [12718, 12370, 12612, 12297]
Total Characters across 4 documents: 49997
Token counts per document: [2478, 2505, 2488, 2395]
Total tokens across 4 documents: 9866
Word counts per document: [2313, 2332, 2314, 2269]
Total words across 4 documents: 9228


# LLM & Functions

In [7]:
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5
num_ctx = 1024 # number of context length, DEFAULT = 2048
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_predict = 128 # number of tokens to predict, Default: 128, -1 = infinite generation, -2 = fill context
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm

llm = Ollama(
        model=model,
        base_url=base_url,
        temperature=temperature,
        num_ctx=num_ctx,
        num_predict=num_predict,
        format='',
        verbose=True
)


In [8]:
MAP_TEMPLATE_TXT = """Write a detail summary of this text section in bullet points. Answer just the bullet points, no other text.
Text:
{text}

SUMMARY:"""

full_prompt = PromptTemplate(
    template=MAP_TEMPLATE_TXT,
    input_variables=["text"]
).format(
    text=split_docs[0].page_content
)

print("number of tokens: {}\n".format(len(convert_text_to_tokens(full_prompt))))
pprint(full_prompt)

number of tokens: 2506

('Write a detail summary of this text section in bullet points. Answer just '
 'the bullet points, no other text.\n'
 'Text:\n'
 "think it's possible that physics has exploits and we should be trying to "
 'find them arranging some kind of a crazy quantum mechanical system that '
 'somehow gives you buffer overflow somehow gives you a rounding error in the '
 'floating Point synthetic intelligences are kind of like the next stage of '
 "development and I don't know where it leads to like at some point I suspect "
 'the universe is some kind of a puzzle these synthetic AIS will uncover that '
 'puzzle and solve it the following is a conversation with Andre capothy '
 'previously the director of AI at Tesla and before that at open Ai and '
 'Stanford he is one of the greatest scientists engineers and Educators in the '
 'history of artificial intelligence this is the Lex Friedman podcast to '
 "support it please check out our sponsors and now dear friends here's A

# Case 1: Overflowing the context

**Observe in Ollama that warning is thrown at us, saying "truncating input prompt"**

In [9]:
output = llm.invoke(full_prompt)
pprint(output)

('The speaker is discussing the origin of life on Earth and its potential '
 'implications for the existence of intelligent alien civilizations. They '
 'suggest that the emergence of complex intelligence may be a rare event, but '
 'not necessarily unique to humans.\n'
 '\n'
 'Key points:\n'
 '\n'
 '* The origin of life on Earth may have been more common than previously '
 'thought.\n'
 '* Basic chemistry can lead to the emergence of complex life forms under the '
 'right conditions.\n'
 '* The speaker is interested in understanding how common intelligent '
 "societies are in the universe and why we haven't encountered any yet.\n"
 '* They believe that what we did on Earth, creating a technological society, '
 'may not be as difficult as it seems.\n'
 '\n'
 'The')


# Case 2: Within the context length

We will increase the context length of Ollama with `num_ctx` to 3072 (1024*3).

In [10]:
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5
num_ctx = 1024*3 # number of context length, DEFAULT = 2048
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_predict = 128 # number of tokens to predict, Default: 128, -1 = infinite generation, -2 = fill context
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm

llm = Ollama(
        model=model,
        base_url=base_url,
        temperature=temperature,
        num_ctx=num_ctx,
        num_predict=num_predict,
        format='',
        verbose=True
)

output = llm.invoke(full_prompt)
pprint(output)

('• A conversation between Andre Capathi, previously director of AI at Tesla '
 'and OpenAI, and Lex Friedman about neural networks.\n'
 '• Neural networks are a mathematical abstraction of the brain and use simple '
 'mathematical expressions with knobs that can be trained to learn complex '
 'behaviors.\n'
 '• Despite being simple mathematically, neural networks produce surprising '
 'emergent behaviors when used for tasks like next word prediction in massive '
 'data sets.\n'
 '• The conversation touches on the idea that optimizing neural networks '
 'allows them to discover interesting solutions to problems.\n'
 '• Capathi mentions GPT-like models and their ability to solve problems by '
 'prompting them with arbitrary inputs.\n'
 '• He also discusses the biological side of')


In [11]:
outputs = []
for i in range(5):
    print("\n======================= Output {} ==========================".format(i+1))
    _output = llm.invoke(full_prompt)
    pprint(_output)
    outputs.append(_output)
    


('• Neural networks are mathematical abstractions of the brain, originally '
 'developed as a simple mathematical expression with knobs (synapses) that can '
 'be trained and modified.\n'
 '• They seem to surprise us with their power and emergent behaviors, despite '
 'being mathematically simple.\n'
 '• The optimization process used to train neural networks is different from '
 'the biological processes that give rise to the brain.\n'
 '• Biological neural networks are part of a multi-agent system that has '
 'evolved over time through evolution and self-play.\n'
 '• The origin of intelligence in humans may be considered a punctuated '
 'equilibrium event, with sparse leaps in progress.\n'
 "• It's difficult to determine if humans are unique")

('• A conversation with Andre Capathi, previously director of AI at Tesla and '
 'OpenAI, Stanford, discusses neural networks.\n'
 '• Neural networks are a mathematical abstraction of the brain, using simple '
 'mathematical expressions with k

# Case 3: Increase prediction length
To increase the chance of AI completing its answer, let's increase the prediction length with `num_pred` to 1024.

In [12]:
model = "llama3.2"
base_url = "http://localhost:11434"
temperature = 0.5
num_ctx = 1024*3 # number of context length, DEFAULT = 2048
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_ctx
num_predict = 1024 # number of tokens to predict, Default: 128, -1 = infinite generation, -2 = fill context
# https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#langchain_ollama.llms.OllamaLLM.num_predict
# more info: https://python.langchain.com/api_reference/ollama/llms/langchain_ollama.llms.OllamaLLM.html#ollamallm

llm = Ollama(
        model=model,
        base_url=base_url,
        temperature=temperature,
        num_ctx=num_ctx,
        num_predict=num_predict,
        format='',
        verbose=True
)

In [13]:
output = llm.invoke(full_prompt)
pprint(output)

('• A conversation with Andre Capathi, previously director of AI at Tesla and '
 'OpenAI, discusses neural networks.\n'
 '• Neural networks are a mathematical abstraction of the brain, using a '
 'sequence of matrix multiplies and nonlinearities to learn and classify '
 'images.\n'
 '• The knobs in neural networks are trainable and modifiable, allowing for '
 'optimization and emergent behavior.\n'
 '• Large-scale training can lead to surprising magical properties in these '
 'networks.\n'
 '• GPT-like models have been trained on massive datasets and can solve '
 'problems with remarkable consistency.\n'
 '• Andre Capathi emphasizes that the optimization process giving rise to '
 "neural networks is different from the brain's optimization process.\n"
 '• He notes that biological neural networks are complex and multi-agent '
 'systems, whereas artificial neural networks are simpler mathematical '
 'expressions.\n'
 '• The origin of life and emergence of intelligence are seen as key ques

# Compare with GPT-4o-mini

In [14]:
with open("../secret/openai_api_key.txt", "r") as file:
    openai_api_key = file.read()

# openai.api_key = openai_api_key

from langchain_openai import ChatOpenAI

chat_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    api_key=openai_api_key,
)

output_gpt = chat_llm.invoke(full_prompt)

In [15]:
pprint(output_gpt.content)

('- The text discusses the potential exploits in physics and the idea of '
 'synthetic intelligences uncovering universal puzzles.\n'
 '- It features a conversation with Andre Karpathy, former director of AI at '
 'Tesla and OpenAI, on the nature of neural networks.\n'
 '- Neural networks are described as mathematical abstractions of the brain, '
 'primarily consisting of matrix multiplications and nonlinearities.\n'
 '- The "knobs" in neural networks are likened to synapses in the brain and '
 'are adjustable to achieve desired outcomes, such as image classification.\n'
 '- Despite their mathematical simplicity, neural networks can exhibit '
 'surprising emergent behaviors when trained on complex problems.\n'
 '- The discussion touches on the generative capabilities of models like GPT, '
 'which predict the next word based on large datasets.\n'
 '- Karpathy expresses skepticism about drawing direct analogies between '
 'artificial neural networks and biological brains due to different

In [16]:
from IPython.display import display, HTML
import markdown

# Convert Markdown to HTML
html_text1 = markdown.markdown("# Ollama \n"+output.replace("•","-"))
html_text2 = markdown.markdown("# GPT-4o-mini \n"+output_gpt.content)

# Combine into side-by-side layout
html_content = f"""
<div style="display: flex; justify-content: space-between; width: 100%;">
    <div style="width: 48%; padding: 10px; border: 1px solid black; overflow-y: auto; height: 400px;">
        {html_text1}
    </div>
    <div style="width: 48%; padding: 10px; border: 1px solid black; overflow-y: auto; height: 400px;">
        {html_text2}
    </div>
</div>
"""

# Display the HTML
display(HTML(html_content))


# "The longer, the better" is not true in this case

I tried approximately maximizing the use of context length, but the result is suboptical considering text output and consumed time.

- Relatively the same output quality.
- It took 10 minutes (on M4Pro).

<img src="assets/ollama-max-context.png" width="600"/>

# Test variations of `num_predict`

In [17]:
num_predicts = [128, 256, 512, 1024, -1, -2]
outputs = []

for i, num_predict in enumerate(num_predicts):
    llm = Ollama(
        model=model,
        base_url=base_url,
        temperature=0.5,
        num_ctx=1024*3,
        num_predict=num_predict
    )
    output = llm.invoke(full_prompt)
    outputs.append(output)

In [18]:
from langchain_ollama import ChatOllama

num_predicts = [128, 256, 512, -1, -2]
outputs = []

for i, num_predict in enumerate(num_predicts):
    llm = ChatOllama(
        model=model,
        base_url=base_url,
        temperature=0.9,
        num_ctx=1024*3,
        num_predict=num_predict
    )
    messages = [
        { "role": "user", "content": full_prompt},
    ]
    output = llm.invoke(messages)
    outputs.append(output)
    print(f"\n+++++ OUTPUT {i} +++++\n")
    pprint(outputs[i].content)


+++++ OUTPUT 0 +++++

('• A conversation is being held with Andre Capathi, previously director of AI '
 'at Tesla and OpenAI, Stanford.\n'
 '• Neural networks are described as a mathematical abstraction of the brain, '
 'using simple math expressions with many knobs (similar to synapses) that can '
 'be trained and modified.\n'
 '• Despite their simplicity, neural networks exhibit surprising emergent '
 'behavior, particularly when trained on complex problems.\n'
 '• Capathi suggests that optimizing these networks allows them to discover '
 'interesting solutions and "magical properties".\n'
 '• The conversation touches on the topic of artificial intelligence '
 'surpassing human capabilities, with Capathi expressing cautious optimism '
 "about the field's potential.\n"
 '• Capathi also discusses')

+++++ OUTPUT 1 +++++

('• Neural networks are mathematical abstractions of the brain, developed from '
 'a simple sequence of matrix multiplies and nonlinearities, with many knobs '
 '(par

In [19]:
import pandas as pd

df = pd.DataFrame({
    'Output Index': range(len(outputs)),
    'Num Predict': num_predicts,
    'Token Count': [len(convert_text_to_tokens(output.content)) for output in outputs]
})

# Display the DataFrame
print(df)

   Output Index  Num Predict  Token Count
0             0          128          128
1             1          256          213
2             2          512          174
3             3           -1          179
4             4           -2          202
