In [1]:
# langchain packages

# pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph

Note: you may need to restart the kernel to use updated packages.


# packages

In [None]:
pip install mistralai numpy faiss-cpu python-dotenv sentence-transformers langchain httpx

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install -U pip setuptools wheel 


Note: you may need to restart the kernel to use updated packages.


In [9]:
# For local development
!pip install -U 'spacy[apple]'

# For colab
#pip install -U spacy

Collecting thinc-apple-ops<2.0.0,>=1.0.0 (from spacy[apple])
  Downloading thinc_apple_ops-1.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting numpy>=1.19.0 (from spacy[apple])
  Using cached numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading thinc_apple_ops-1.0.0-cp310-cp310-macosx_11_0_arm64.whl (156 kB)
Using cached numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl (5.4 MB)
Installing collected packages: numpy, thinc-apple-ops
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.14 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.2.1 which is incompatible.
langchain-community 0.3.14 requires numpy<2,>=1.22.4; python_version < "3.12", bu

In [None]:
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm

# code

In [2]:
from mistralai import Mistral
import requests
import numpy as np
import faiss
import os
from getpass import getpass
from dotenv import load_dotenv
import spacy
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

OPENAI_KEY = os.getenv('OPENAI_KEY')

In [3]:
mistral_api_key= os.getenv('MISTRAL_KEY')
mistral_client = Mistral(api_key=mistral_api_key)

In [4]:
text = ''

# If text is not present, download it. Else use predownloaded text
if not os.path.exists('./sources/paul_graham_essay.txt'):
  response = requests.get('https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt')
  text = response.text
  
  f = open('./sources/graham_essay.txt', 'w')
  f.write(text)
  f.close()
else:
  with open('./sources/paul_graham_essay.txt', 'r') as f:
    text = f.read()

In [8]:
text

'\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch cards, then s

In [48]:
chunk_size = 2048
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
len(chunks)

37

In [92]:
for chunk in chunks:
  print(f'chunk: {chunk}')

chunk: 

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.

The language we used was an early version of Fortran. You had to type programs on punch cards, then stack th

generate embeddings

# My experiments

## General function

In [7]:
nlp_lg = spacy.load("en_core_web_lg")
nlp_sm = spacy.load("en_core_web_sm")

In [135]:
def get_prompt(retrieved_chunk, question):
  return f"""
Context information is below.
---------------------
{retrieved_chunk}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""

In [10]:
question = "What were the two main things the author worked on before college?"

In [141]:
def run_mistral(user_message, model="mistral-large-latest"):
    messages = [
        {
            "role": "user", "content": user_message
        }
    ]
    chat_response = mistral_client.chat.complete(
        model=model,
        messages=messages
    )
    return (chat_response.choices[0].message.content)

In [7]:
doc = nlp_lg(text)

## Experiment with getting embeddings of chunks

In [31]:
def get_text_embedding_with_spacy(input):
    doc = nlp_lg(input)
    embedding = doc.vector
    
    return embedding

In [34]:
text_embeddings = np.array([get_text_embedding_with_spacy(chunk) for chunk in chunks])

In [35]:

d = text_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(text_embeddings)

In [38]:
def get_closest_chunk(question, chunks, index):
  question_embeddings = np.array([get_text_embedding_with_spacy(question)])
  D, I = index.search(question_embeddings, k=2) # distance, index
  retrieved_chunk = [chunks[i] for i in I.tolist()[0]]

  return retrieved_chunk



In [39]:
retrieved_chunk = get_closest_chunk(question, chunks, index)

print(retrieved_chunk)

['n out to like a lot: a woman called Jessica Livingston. A couple days later I asked her out.\n\nJessica was in charge of marketing at a Boston investment bank. This bank thought it understood startups, but over the next year, as she met friends of mine from the startup world, she was surprised how different reality was. And how colorful their stories were. So she decided to compile a book of interviews with startup founders.\n\nWhen the bank had financial problems and she had to fire half her staff, she started looking for a new job. In early 2005 she interviewed for a marketing job at a Boston VC firm. It took them weeks to make up their minds, and during this time I started telling her about all the things that needed to be fixed about venture capital. They should make a larger number of smaller investments instead of a handful of giant ones, they should be funding younger, more technical founders instead of MBAs, they should let the founders remain as CEO, and so on.\n\nOne of my 

In [84]:
question_embeddings = np.array([get_text_embedding(question)])

In [85]:
D, I = index.search(question_embeddings, k=2) # distance, index
retrieved_chunk = [chunks[i] for i in I.tolist()[0]]

In [132]:


run_mistral(prompt)

NameError: name 'prompt' is not defined

## Experiment with embeddings without chunking

In [33]:
doc = nlp_sm(text)

embeddings = np.array([token.vector for token in doc])

In [79]:
embeddings_d = embeddings.shape[1]
embeddings_index = faiss.IndexFlatL2(embeddings_d)
embeddings_index.add(embeddings)

## Experiment with sentences embeddings

In [113]:
sentences_embeddings = np.array([sent.vector for sent in doc.sents])

In [119]:
d = sentences_embeddings.shape[1]
sentences_index = faiss.IndexFlatL2(d)
sentences_index.add(sentences_embeddings)

In [126]:
D, I = sentences_index.search(question_embeddings, k=10)
retrieved_chunk = [list(doc.sents)[i] for i in I.tolist()[0]]

In [127]:
retrieved_chunk

[Well, how had I chosen what to work on in the past?,
 What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths.,
 But for the first few years I was still able to work on other things.
 ,
 One of the most conspicuous patterns I've noticed in my life is how well it has worked, for me at least, to work on things that weren't prestigious.,
 [12]
 
 I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, it was when I started publishing essays online.,
 
 
 What I Worked On
 
 February 2021
 
 Before college the two main things I worked on, outside of school, were writing and programming.,
 One day in late 1994 as I was stretching one of these monsters there was something on the radio about a famous fund manager.,
 So I gave this talk, in the course of which I told them that the best sources of seed funding were succ

In [129]:
prompt = get_prompt(retrieved_chunk, question)

run_mistral(prompt)

'The two main things the author worked on before college were **writing** and **programming**.'

In [105]:
sentences = list(doc.sents)

# Experiments

## Comparison of relevance of spacy and openai embeddings

I'm experimenting with different libraries for embeddings: SPAcy, OpenAi 

### Retrieval with spacy embeddings

In [97]:
def get_embedding_with_spacy(input):
    doc = nlp_lg(input)
    embedding = doc.vector
    
    return embedding

In [43]:
spacy_sentence_embeddings = np.array([sent.vector for sent in doc.sents])

d = spacy_sentence_embeddings.shape[1]
spacy_sentence_embeddings_index = faiss.IndexFlatL2(d)
spacy_sentence_embeddings_index.add(spacy_sentence_embeddings)



In [123]:
question_embeddings = get_embedding_with_spacy(question)

D, I = spacy_sentence_embeddings_index.search(question_embeddings, k=15)
spacy_retrieved_chunk = [list(doc.sents)[i] for i in I.tolist()[0]]

In [125]:
for i, item in enumerate(spacy_retrieved_chunk):
  print(f'{i}: {item}')

0: Well, how had I chosen what to work on in the past?
1: What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths.
2: But for the first few years I was still able to work on other things.


3: One of the most conspicuous patterns I've noticed in my life is how well it has worked, for me at least, to work on things that weren't prestigious.
4: [12]

I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, it was when I started publishing essays online.
5: 

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming.
6: One day in late 1994 as I was stretching one of these monsters there was something on the radio about a famous fund manager.
7: So I gave this talk, in the course of which I told them that the best sources of seed funding were suc

In [130]:
def run_mistral_with_spacy(question):
  question_embeddings = get_embedding_with_spacy(question)

  D, I = spacy_sentence_embeddings_index.search(question_embeddings, k=15)
  spacy_retrieved_chunk = [list(doc.sents)[i] for i in I.tolist()[0]]
  
  for i, item in enumerate(spacy_retrieved_chunk):
    print(f'{i}: {item}')
  
  prompt = get_prompt(spacy_retrieved_chunk, question)
  
  return run_mistral(prompt)

In [145]:
run_mistral_with_spacy('When author got his first computer?')

0: He could see I worked hard, and gave me a good grade, which he wrote down in a sort of passport each student had.
1: I remember when my friend Robert Morris got kicked out of Cornell for writing the internet worm of 1988, I was envious that he'd found such a spectacular way to get out of grad school.


2: When the Harvard Computer Society, the undergrad computer club, asked me to give a talk, I decided I would tell them how to start a startup.
3: I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book.
4: Robert was now a postdoc at MIT, and though he'd made a lot of money the last time I'd lured him into working on one of my schemes, it had also been a huge time sink.
5: When I said I was leaving, my boss at Yahoo had a long conversation with me about my plans.
6: [12]

I've worked on several different things, but to the extent there was a turning point where I figured out what to work on, i

'The context information provided does not specify when the author got his first computer.'

### Retrieval with OpenAi embeddings

In [111]:
sentences = list(doc.sents)
sentences_100 = sentences[:100]

In [8]:
def get_embedding_with_openai(input):
  api_key = os.getenv('OPENAI_KEY')
  url = "https://api.openai.com/v1/embeddings"
  
  if type(input) != str:
    input = str(input)
  
  headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
  }
  
  payload = {
      "input": input,
      "model": "text-embedding-3-small"
  }
  
  try:  
    response = requests.post(url, headers=headers, json=payload)

    embedding = response.json()['data'][0]['embedding']    
    embedding = np.array(embedding)
  except Exception as exception:
    print(exception)
    
    return None
    
  return embedding
  


In [11]:
openai_question_embeddings = get_embedding_with_openai(question)
openai_question_embeddings = np.array([openai_question_embeddings])

In [112]:
openai_sentence_embeddings = np.array([get_embedding_with_openai(sent.text) for sent in sentences_100])

In [113]:
d = openai_sentence_embeddings.shape[1]
openai_sentence_embeddings_index = faiss.IndexFlatL2(d)
openai_sentence_embeddings_index.add(openai_sentence_embeddings)

In [117]:
print(openai_question_embeddings.shape)
print(openai_sentence_embeddings.shape)
len(openai_sentence_embeddings)

(1, 1536)
(100, 1536)


100

In [121]:
D, I = openai_sentence_embeddings_index.search(openai_question_embeddings, k=15)
openai_retrieved_chunk = [list(doc.sents)[i] for i in I.tolist()[0]]

In [122]:
for i, item in enumerate(openai_retrieved_chunk):
  print(f'{i}: {item}')

0: 

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming.
1: In college I was going to study philosophy, which sounded much more powerful.
2: I wrote simple games, a program to predict how high my model rockets would fly, and a word processor that my father used to write at least one book.
3: What I discovered when I got to college was that the other fields took up so much of the space of ideas that there wasn't much left for these supposed ultimate truths.
4: The book, On Lisp, wasn't published till 1993, but I wrote much of it in grad school.


5: Though I liked programming, I didn't plan to study it in college.
6: This was more like it; this was what I had expected college to do.
7: I had gotten into a program at Cornell that didn't make you choose a major.
8: I wrote what beginning writers were supposed to write then, and probably still are: short stories.
9: I didn't write essays.
10: It seemed, to my nai

### Precision and recall 

In [127]:
for i, item in enumerate(sentences):
  print(f'{i}: {item}')

0: 

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming.
1: I didn't write essays.
2: I wrote what beginning writers were supposed to write then, and probably still are: short stories.
3: My stories were awful.
4: They had hardly any plot, just characters with strong feelings, which I imagined made them deep.


5: The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing."
6: This was in 9th grade, so I was 13 or 14.
7: The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it.
8: It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.


9: The language we used was an early version of Fortran.
10: You had to type programs 

## Sentence transformer test

In [14]:
sentenceTransformer = SentenceTransformer("all-MiniLM-L6-v2")

In [35]:
sentences = list(doc.sents)

In [36]:
sentences

[
 
 What I Worked On
 
 February 2021
 
 Before college the two main things I worked on, outside of school, were writing and programming.,
 I didn't write essays.,
 I wrote what beginning writers were supposed to write then, and probably still are: short stories.,
 My stories were awful.,
 They had hardly any plot, just characters with strong feelings, which I imagined made them deep.
 ,
 The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing.",
 This was in 9th grade, so I was 13 or 14.,
 The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it.,
 It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.
 ,
 The language we used was an early version of Fortran.,
 You had to type programs on punc

In [37]:
sentenceTransformerEmbeddings = sentenceTransformer.encode(sentences)

In [38]:
print(sentenceTransformerEmbeddings.shape)

d = sentenceTransformerEmbeddings.shape[1]
sentenceTransformerEmbeddings_index = faiss.IndexFlatL2(d)
sentenceTransformerEmbeddings_index.add(sentenceTransformerEmbeddings)

(756, 384)


In [39]:
print(question)
sentenceTransformerQuestionEmbedding = sentenceTransformer.encode([question])

What were the two main activities the author worked on before college?


In [43]:
D, I = sentenceTransformerEmbeddings_index.search(sentenceTransformerQuestionEmbedding, k=3)
sentenceTransformer_retrieved_chunk = [list(doc.sents)[i] for i in I.tolist()[0]]

In [45]:
I.tolist()[0]

[720, 266, 109]

In [44]:
sentenceTransformer_retrieved_chunk

[An essay must tell readers things they don't already know, and some people dislike being told such things.
 ,
 Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.
 ,
 Grad students could take classes in any department, and my advisor, Tom Cheatham, was very easy going.]

In [None]:
# TODO: ask chatgtp about results with examples. Why it worked bad?; check params of sentece transformer 
# also use on building precision/recall metrics. How to better formulate it
# need to build argumentation for the chosen model
# use more examples; check for existing becnhmarks; use more examples (+-1000 or more)

## Sentence transformers with chunking instead of sentence based 

In [17]:
sentenceTransformerEmbeddingsWithChunks = sentenceTransformer.encode(chunks)

NameError: name 'sentenceTransformer' is not defined

In [51]:
d = sentenceTransformerEmbeddingsWithChunks.shape[1]
sentenceTransformerEmbeddingsWithChunks_index = faiss.IndexFlatL2(d)
sentenceTransformerEmbeddingsWithChunks_index.add(sentenceTransformerEmbeddingsWithChunks)

In [63]:
D, I = sentenceTransformerEmbeddingsWithChunks_index.search(sentenceTransformerQuestionEmbedding, k=5)
sentenceTransformerEmbeddingsWithChunks_retrieved_chunk = [chunks[i] for i in I.tolist()[0]]

In [67]:
for i, chunk in enumerate(sentenceTransformerEmbeddingsWithChunks_retrieved_chunk):
  print(f'{i}: {chunk}')

0: g art classes at Harvard. Grad students could take classes in any department, and my advisor, Tom Cheatham, was very easy going. If he even knew about the strange classes I was taking, he never said anything.

So now I was in a PhD program in computer science, yet planning to be an artist, yet also genuinely in love with Lisp hacking and working away at On Lisp. In other words, like many a grad student, I was working energetically on multiple projects that were not my thesis.

I didn't see a way out of this situation. I didn't want to drop out of grad school, but how else was I going to get out? I remember when my friend Robert Morris got kicked out of Cornell for writing the internet worm of 1988, I was envious that he'd found such a spectacular way to get out of grad school.

Then one day in April 1990 a crack appeared in the wall. I ran into professor Cheatham and he asked if I was far enough along to graduate that June. I didn't have a word of my dissertation written, but in wha

## Experiment with overlapping chunking

In [6]:
questions = [
"What were the first two things the author worked on outside of school before college?",
"What challenges did the author face when programming on the IBM 1401 in junior high school?",
"What impact did microcomputers have on the author's programming experience compared to earlier computing methods?",
"Why did the author initially choose philosophy as a field of study in college, and why did they switch to AI?",
"What motivated the author to reverse-engineer SHRDLU and how did it influence their undergraduate thesis?",
"Why did the author decide to leave the Accademia di Belli Arti and return to the U.S. after one year?",
"What realization did the author have about the 'low end eating the high end' while working at Interleaf, and how did it influence their future decisions?",
"What led to the creation of Viaweb, and how did the author and their team approach developing the software?",
"How did the idea for Y Combinator emerge and what were its founding principles?",
"Why did the author stop painting in 2014 after focusing on it for almost a year?",
"What role did punch cards play in the author's early programming experiences?",
"What specific influence did 'The Moon is a Harsh Mistress' have on the author's interest in AI?",
"What were the limitations of SHRDLU that the author discovered in grad school?",
"Why did the author decide to write a book about Lisp hacking, and what did they learn from the process?",
"How did the author describe the contrast between theory and systems in computer science?",
"What prompted the author to start taking art classes while pursuing a PhD in computer science?",
"How did the author's visit to the Carnegie Institute influence their decision to pursue painting?",
"What were the author's initial challenges when painting still lifes at the Accademia?",
"What lessons about the software industry did the author learn while working at Interleaf?",
"Why did the author choose to work on low-end software and how did it prove beneficial?",
"How did the author's idea for a web app influence the development of Viaweb?",
"What were the three main parts of Viaweb's software, and who was responsible for each?",
"What mistake did the author make when judging Viaweb's growth rate in its early stages?",
"What realization led the author to sell Viaweb to Yahoo, and how did it impact their lifestyle?",
"How did the author reflect on their decision to focus on painting after leaving Yahoo?",
"What was the initial reception to the author's first essay posted on Slashdot?",
"How did the author describe the shift from print to online publishing, and how did it affect their writing?",
"What is the significance of the 'Y Combinator' name, and why was it chosen?",
"How did the batch model become a defining feature of Y Combinator?",
"What advantages did the author find in funding startups in batches?",
"What inspired the creation of Hacker News, and how did it evolve over time?",
"Why did the author stop working on Arc, and what role did it play in their projects?",
"How did the author handle disputes between cofounders and other challenging situations at Y Combinator?",
"What advice did Robert Morris give the author about not letting Y Combinator be their last cool thing?",
"What factors led the author to step back from running Y Combinator?",
"What was the author's reasoning behind creating the programming language Bel?",
"What challenges did the author face while writing an interpreter for Bel?",
"How did moving to England influence the author's work on Bel?",
"What realization did the author have about discoveredness while working on Bel?",
"How did the author approach choosing projects after finishing work on Bel?",
"What role did the author's background in Lisp play in shaping their projects?",
"How did the author describe the relationship between still life painting and visual perception?",
"What lessons did the author learn from their experience at RISD and the Accademia?",
"Why did the author describe the painting department at RISD as 'post-rigorous'?",
"What influence did Idelle Weber have on the author's painting career?",
"How did the author fund their second stint at RISD, and what challenges did they face?",
"What motivated the author to experiment with still life painting techniques in New York?",
"How did the World Wide Web influence the author's decision to start Viaweb?",
"What role did Julian Weber play in the creation of Viaweb?",
"What were the challenges of selling ecommerce software in the mid-1990s?",
"Why did the author choose to write essays and publish them online instead of following traditional publishing routes?",
"What is the author's perspective on working on unprestigious projects, and why do they find value in them?",
"How did the author describe their transition from software development to essay writing?",
"What impact did the author believe Y Combinator had on the startup ecosystem?",
"What lessons did the author learn from the initial mistakes they made at Y Combinator?",
"What is the relationship between the growth of Y Combinator and the creation of the YC alumni community?",
"Why did the author feel Hacker News was both a success and a source of stress?",
"What parallels did the author draw between parenting and working with startups?",
"What were the author's reflections on balancing personal and professional life at Y Combinator?",
"What were the defining characteristics of the Summer Founders Program?",
"How did the author describe the evolution of essays in the digital age?",
"What role did Slashdot and other online forums play in the author's early publishing experience?",
"How did the author define 'discoveredness' in the context of programming languages?",
"What lessons did the author learn from their experiments with painting and visual perception?",
"What lessons about startup funding did the author implement at Y Combinator?",
"What is the significance of the author's essay 'Hackers & Painters,' and what themes does it explore?",
"How did the author describe their approach to cooking for groups, and how did it reflect their broader philosophies?",
"What motivated the author to continue writing essays after stepping back from Y Combinator?",
"What is the author's perspective on balancing exploration and focus in their work?",
"What challenges did the author face when creating a Lisp interpreter written in itself?",
"How did the author describe the process of debugging while working on Bel?",
"What impact did Robert Morris's advice have on the author's decision-making process?",
"Why did the author believe 'low end' software had more potential than 'high end' software?",
"What role did consulting work play in the author's transition back to art school?",
"How did the author fund their early painting career in New York?",
"What challenges did the author face while living in Yorkville as a painter?",
"Why did the author reflect on the cultural differences between Silicon Valley and New York?",
"What parallels did the author draw between the evolution of web apps and their own projects?",
"What lessons about artistic independence did the author learn from their time at the Accademia?",
"How did the author define success for their painting and writing projects?",
"What challenges did the author face while balancing artistic and technical pursuits?",
"Why did the author choose to work on Lisp despite its lack of mainstream popularity?",
"What motivated the author to experiment with creating a new programming language?",
"How did the author define the relationship between essays and intellectual exploration?",
"What challenges did the author face during the development of Viaweb's web-based software?",
"What impact did early web technologies have on the author's entrepreneurial journey?",
"Why did the author believe that online essays would evolve as a medium?",
"What lessons did the author learn about human behavior from their experiences with startups?",
"How did the author describe the importance of community in the context of startups?",
"What parallels did the author draw between parenting and startup mentorship?",
"How did the author define the role of independence in their career choices?",
"What challenges did the author face when working on the Bel programming language?",
"Why did the author stop painting, and how did they describe the decision?",
"What lessons did the author learn about focus and productivity from their work on Bel?",
"How did the author describe their move to England and its impact on their work?",
"What was the author's perspective on the relationship between technical work and creativity?",
"What role did curiosity play in the author's approach to painting and writing?",
"How did the author describe the influence of Moore's Law on the software industry?",
"What challenges did the author face when creating the Summer Founders Program?",
"Why did the author describe art school as a 'civilized joke' in some contexts?",
"What lessons about simplicity did the author learn from their work on Viaweb?",
"What parallels did the author draw between essay writing and software development?"
]

ground_truth_indexes = [
0,
2,
3,
5,
8,
23,
27,
36,
66,
81,
1,
6,
10,
11,
12,
14,
13,
21,
26,
27,
35,
40,
44,
46,
50,
57,
58,
94,
68,
69,
73,
75,
76,
77,
79,
82,
86,
87,
88,
84,
22,
28,
29,
31,
24,
23,
51,
32,
37,
40,
61,
60,
64,
66,
68,
96,
65,
92,
84,
67,
71,
62,
74,
63,
66,
41,
81,
76,
59,
31,
25,
21,
30,
32,
54,
64,
66,
86,
87,
75,
66,
89,
94,
95,
60,
20,
50,
24,
36,
73,
95,
42,
33,
52,
31,
84,
21,
87,
79,
20,
67,
65,
95
]

In [7]:
text_splitter = CharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap  = 128
)
docs = text_splitter.create_documents([text])

Created a chunk of size 1203, which is longer than the specified 1024
Created a chunk of size 1025, which is longer than the specified 1024


In [8]:
docs

[Document(metadata={}, page_content='What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.'),
 Document(metadata={}, page_content='The language we used was an earl

In [9]:
text_chunks = []
for doc in docs:
  text_chunks.append(doc.page_content)

In [12]:
for i, text_chunk in enumerate(text_chunks):
  print(f'TEXT_INDEX_{i}: {text_chunk}')

TEXT_INDEX_0: What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.
TEXT_INDEX_1: The language we used was an early version of Fortran. You had to type programs on punch ca

In [13]:
texts_embeddings = sentenceTransformer.encode(text_chunks)
  

NameError: name 'sentenceTransformer' is not defined

In [78]:
d = texts_embeddings.shape[1]
texts_embeddings_index = faiss.IndexFlatL2(d)
texts_embeddings_index.add(texts_embeddings)

In [87]:
def get_closest_texts(question, index, ground_truth):
  question_embedding = sentenceTransformer.encode([question])
  D, I = index.search(question_embedding, k=3)
  texts_embeddings_retrieved_chunk = [text_chunks[i] for i in I.tolist()[0]]
  
  print(f'QUESTION: {question}')
  print(f'CORRECT CHUNK: {ground_truth}')
  print(f'CLOSEST TEXTS:')
  
  for i, chunk in enumerate(texts_embeddings_retrieved_chunk):
    print(f'{i}: {chunk}')
  

  
  print('#################################')

In [88]:
for i, question in enumerate(questions):
  get_closest_texts(question, texts_embeddings_index, text_chunks[ground_truth_indexes[i]])

QUESTION: What were the first two things the author worked on outside of school before college?
CORRECT CHUNK: What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.
CLOSEST

In [24]:
def precision_check(question_embedding, index, ground_truth_index):
  question_embedding = np.array([question_embedding])
  D, I = index.search(question_embedding, k=5)
  texts_embeddings_retrieved_indexes = [i for i in I.tolist()[0]]
  
  if ground_truth_index in texts_embeddings_retrieved_indexes:
    return 1
  
  return 0

def mean_precision_check(question_embeddings, index, ground_truth_indexes):
  recalls = [precision_check(question_embeddings[i], index, ground_truth_indexes[i]) for i in range(len(question_embeddings))]
    
  return np.mean(recalls)


In [118]:
# Precision test for sentence transformer
question_embeddings = sentenceTransformer.encode(questions)
mean_precision = mean_precision_check(question_embeddings, texts_embeddings_index, ground_truth_indexes)
print(mean_precision)

0.3627450980392157


In [121]:
# Precision test for spacy
spacy_embedded_chunks = []
for text_chunk in text_chunks:
  spacy_embedded_chunks.append(get_embedding_with_spacy(text_chunk))

spacy_embedded_chunks = np.array(spacy_embedded_chunks)

print(spacy_embedded_chunks.shape)

d = spacy_embedded_chunks.shape[1]
spacy_embedded_chunks_index = faiss.IndexFlatL2(d)
spacy_embedded_chunks_index.add(spacy_embedded_chunks)

spacy_question_embeddings = np.array([get_embedding_with_spacy(question) for question in questions])

mean_recall = mean_precision_check(spacy_question_embeddings, spacy_embedded_chunks_index, ground_truth_indexes)
print(mean_recall)

(99, 300)
0.18627450980392157


In [18]:
print(len(text_chunks))

99


In [19]:
# Precision test for openai
openai_embedded_chunks = []
for i, text_chunk in enumerate(text_chunks):
  embedded_text_chunk = get_embedding_with_openai(text_chunk)
  openai_embedded_chunks.append(embedded_text_chunk)
  
  print(f'Chunk {i} embedded')

Chunk 0 embedded
Chunk 1 embedded
Chunk 2 embedded
Chunk 3 embedded
Chunk 4 embedded
Chunk 5 embedded
Chunk 6 embedded
Chunk 7 embedded
Chunk 8 embedded
Chunk 9 embedded
Chunk 10 embedded
Chunk 11 embedded
Chunk 12 embedded
Chunk 13 embedded
Chunk 14 embedded
Chunk 15 embedded
Chunk 16 embedded
Chunk 17 embedded
Chunk 18 embedded
Chunk 19 embedded
Chunk 20 embedded
Chunk 21 embedded
Chunk 22 embedded
Chunk 23 embedded
Chunk 24 embedded
Chunk 25 embedded
Chunk 26 embedded
Chunk 27 embedded
Chunk 28 embedded
Chunk 29 embedded
Chunk 30 embedded
Chunk 31 embedded
Chunk 32 embedded
Chunk 33 embedded
Chunk 34 embedded
Chunk 35 embedded
Chunk 36 embedded
Chunk 37 embedded
Chunk 38 embedded
Chunk 39 embedded
Chunk 40 embedded
Chunk 41 embedded
Chunk 42 embedded
Chunk 43 embedded
Chunk 44 embedded
Chunk 45 embedded
Chunk 46 embedded
Chunk 47 embedded
Chunk 48 embedded
Chunk 49 embedded
Chunk 50 embedded
Chunk 51 embedded
Chunk 52 embedded
Chunk 53 embedded
Chunk 54 embedded
Chunk 55 embedded
Ch

In [20]:
openai_embedded_chunks = np.array(openai_embedded_chunks)

openai_embedded_chunks.shape

(99, 1536)

In [23]:
d = openai_embedded_chunks.shape[1]
openai_embedded_chunks_index = faiss.IndexFlatL2(d)
openai_embedded_chunks_index.add(openai_embedded_chunks)

openai_question_embeddings = np.array([get_embedding_with_openai(question) for question in questions])

NameError: name 'mean_precision_check' is not defined

In [25]:
mean_recall = mean_precision_check(openai_question_embeddings, openai_embedded_chunks_index, ground_truth_indexes)
print(mean_recall)

0.4117647058823529


So the results of precision test are
1. OpenAi (text-embedding-3-small): 0.412
2. Sentence transformer (all-MiniLM-L6-v2): 0.323
3. SpaCy (en_core_web_lg): 0.186

# Finalized experiments

In [6]:
from bs4 import BeautifulSoup
import requests
import re
import copy



## Scrapping functions

In [2]:
def scape_the_page(url, use_saved=False):
    # Define headers with a common browser User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
    }

    # If url already saved to saved_url folder, read from file
    if use_saved:
        try:
            with open(f'saved_url/{url}', 'r') as file:
                html = file.read()
                soup = BeautifulSoup(html, 'html.parser')
            
            print('Read from file')
        except FileNotFoundError:
            response = requests.get(f'{url}', headers=headers)

            if response.status_code == 200:
                html = response.text
                soup = BeautifulSoup(html, 'html.parser')
                
                # Remove domain from url
                pattern = r'^(?:https?:\/\/)?(?:www\.)?([^\/]+)'
                match = re.search(pattern, url)
                url_without_domain = match.group(1) if match else url
                
                # Save page to a file
                with open(f'saved_url/{url_without_domain}', 'w') as file:
                    file.write(soup.prettify())
                
                print('Read from url')
            else:
                print('Error:', response.status_code)
    else:
        response = requests.get(f'{url}', headers=headers)
        
        if response.status_code == 200:
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            # Remove domain from url
            pattern = r'^(?:https?:\/\/)?(?:www\.)?([^\/]+)'
            match = re.search(pattern, url)
            url_without_domain = match.group(1) if match else url
            
            # Save page to a file
            with open(f'saved_url/{url_without_domain}', 'w') as file:
                file.write(soup.prettify())
            
            print(f'Success on scrapping {url}')
                        
            return soup 
        else:
            print('Error:', response)
            print('Error:', response.status_code)

In [35]:
def clean_soup(soup_to_clean):
  # Remove scripts and styles
  soup_to_clean_copy = copy.copy(soup_to_clean)
  
  for tag in soup_to_clean_copy(['script', 'style', 'img', 'iframe', 'noscript', 'svg', 'video', 'button', 'form', 'header', 'footer', 'nav', 'aside']):
    tag.decompose()
  
  return soup_to_clean_copy
  

In [3]:
def clean_scrapped_page(soup):
  soup_without_footer = copy.deepcopy(soup)

  # Remove the footer tag
  footer = soup_without_footer.find("footer")
  if footer:
      footer.decompose()
  
  scrapped_text = soup_without_footer.get_text()
  # Replace any sequence of 3 or more newlines with 2 newlines
  scrapped_text = re.sub(r'\n{3,}', '\n\n', scrapped_text)
  
  return scrapped_text


In [70]:
def normalize_whitespace(text: str) -> str:
    # Replace all newlines and tabs with spaces
    text = text.replace('\n', ' ').replace('\t', ' ')

    # Replace non-breaking spaces with normal spaces
    text = text.replace('\xa0', ' ')  # common for &nbsp;
    
    # Collapse ALL types of whitespace (incl. multiple spaces) to a single space
    text = re.sub(r'[ \u00A0]{3,}', '  ', text)  # keep max 2 spaces
    text = re.sub(r' {2,}', '  ', text)  # enforce 2-space max again for safety

    # Strip leading/trailing spaces
    return text.strip()

In [4]:
def scrape_clean_and_transform_to_text(url):
  scrapped_soup = scape_the_page(url)
  scrapped_text = clean_scrapped_page(scrapped_soup)
  
  return scrapped_text

## Chunking strategies

In [None]:
chunk_size = 256

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLSectionSplitter

def recursive_text_splitter(text_to_split):
  text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=256,
    chunk_overlap=25,
    length_function=len,
    is_separator_regex=False,
  )
  
  splitted_text = text_splitter.split_text(text_to_split)
  
  return splitted_text

def html_text_splitter(html_to_split):
  headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
    ("h4", "Header 4")
  ]

  html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

  html_header_splits = html_splitter.split_text(html_to_split)
  
  # Remove sequential whitespaces and new lines
  for doc in html_header_splits:
    doc.page_content = normalize_whitespace(doc.page_content)

  chunk_size = 500
  chunk_overlap = 30
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size, chunk_overlap=chunk_overlap
  )
  
  splitted_html = text_splitter.split_documents(html_header_splits)
  
  return splitted_html

## Experiments

In [9]:
urls_to_scrape = [
  'https://aisdr.com/',
  'https://ucu.edu.ua/en/',
  'https://paulgraham.com/greatwork.html',
  'https://www.hubspot.com/',
  'https://www.salesforce.com/',
  'https://kormotech.com/',
  'https://www.notion.com/'
]

url_and_scrapped_texts = {}
for url in urls_to_scrape:
  scrapped_text = scrape_clean_and_transform_to_text(url)
  url_and_scrapped_texts[url] = scrapped_text

test2
test3
Read from url
test2
test3
Read from url
test2
test3
Read from url
test2
test3
Read from url
test2
test3
Read from url
test2
test3
Read from url
test2
test3
Read from url


In [10]:
url_and_scrapped_texts

{'https://aisdr.com/': "  Book more, stress less with AI-powered sales automation - AI SDR                                  Platform      Use cases     Pricing     Case studies     Why AiSDR?     Resources      Book a demo     Log In             Platform     Use cases     Pricing     Case studies     Why AiSDR?     Resources     Book a demo     Log In              What is AiSDR?        Features Every tool you need for AI sales outreach        Independent AI sales assistant An extra pair of hands for your sales growth        Prospecting with AI Find leads with an appetite for your offer        Our best AI emails Clients' favorite emails generated by AiSDR            End-to-end AI Sales Outreach All your bases covered within one solution        AI for HubSpot sales Make the best of your CRM data        Speak with our AI Let AiSDR try and convince you to book a meeting with us      Human or AI? See if you can spot emails that were AI-generated  Play the game            Inbound        Lead

In [11]:
url_and_scrapped_texts['https://kormotech.com/']

'\n\n\n\n\n\n\n\n\n\nPet Food Manufacturer | 5 Brands in 40 Countries | Kormotech\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAbout us\n\n\nBrands & PL\n\n\nNews\n\n\nManufacture\n\n\nOpenings\n\n\nContacts\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nEN\n\n\n\n\n\nUK\nLT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nValues and story\n\n\nSocial responsibility\n\n\nTeam\n\n\nKormotech in the media\n\n\n\n\nDelickcious\n\n\nOptimeal\n\n\nClub 4 Paws\n\n\nMy love\n\n\nMy Love\n\n\nMaster\n\n\nPrivate Label\n\n\n\n\nDelickcious\n\nPremium food for cats and dogs\n\n\n\n\nOptimeal\n\nSuper premium food for cats and dogs\n\n\n\n\nClub 4 Paws\n\nPremium food for cats and dogs\n\n\n\n\nMy love\n\nStandard food for dogs\n\n\n\n\nMy Love\n\nStandard food for cats\n\n\n\n\nMaster\n\nStandard food for cat and dog\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nEN\n\n\n\n\n\nUK\nLT\n\n\n\n\n

In [23]:
test_updated_scrapping = scrape_clean_and_transform_to_text('https://kormotech.com/')

Success on scrapping https://kormotech.com/


In [25]:
test_splitted_text = recursive_text_splitter(test_updated_scrapping)

In [26]:
test_splitted_text

['Pet Food Manufacturer | 5 Brands in 40 Countries | Kormotech\n\nAbout us\n\nBrands & PL\n\nNews\n\nManufacture\n\nOpenings\n\nContacts\n\nEN\n\nUK\nLT\n\nValues and story\n\nSocial responsibility\n\nTeam\n\nKormotech in the media\n\nDelickcious\n\nOptimeal\n\nClub 4 Paws\n\nMy love',
 'Club 4 Paws\n\nMy love\n\nMy Love\n\nMaster\n\nPrivate Label\n\nDelickcious\n\nPremium food for cats and dogs\n\nOptimeal\n\nSuper premium food for cats and dogs\n\nClub 4 Paws\n\nPremium food for cats and dogs\n\nMy love\n\nStandard food for dogs\n\nMy Love',
 'My Love\n\nStandard food for cats\n\nMaster\n\nStandard food for cat and dog\n\nEN\n\nUK\nLT\n\nAbout us\n\nValues and story\nSocial responsibility\nTeam\nKormotech in the media\n\nBrands & PL\n\nDelickcious\nOptimeal\nClub 4 Paws\nMy love\nMy Love\nMaster\nPrivate Label',
 'News\nManufacture\nOpenings\nContacts\n\nDream\nThink\nCare\n\n                                We dream of seeing \nhealthy pets in happy \nfamilies',
 'Think of our four-p

In [26]:
kormotech_soup = scape_the_page('https://kormotech.com/')


Success on scrapping https://kormotech.com/


In [36]:
cleaned_kormotech_soup = clean_soup(kormotech_soup)

In [37]:
cleaned_kormotech_soup


<!DOCTYPE html>

<html lang="en">
<head><!-- Google Tag Manager -->

<!-- End Google Tag Manager -->
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
<link href="/build/website/img/app/favicon.png" rel="shortcut icon" type="image/x-icon"/>
<title>Pet Food Manufacturer | 5 Brands in 40 Countries | Kormotech</title><meta content="High-quality pet food from the Ukrainian pet food manufacturer - Kormotech. Taking care of your pets is a part of every Kormotech food. Join us!" name="description"><meta content="index,follow" name="robots"><link href="https://kormotech.com/uk" hreflang="uk" rel="alternate"><link href="https://kormotech.com/uk" hreflang="x-default" rel="alternate"/><link href="https://kormotech.com/ru" hreflang="ru" rel="alternate"/><link href="https://kormotech.com/" hreflang="en" rel="alternate"/><link href="https://kormotech.com/lt" hreflang="lt" rel="alterna

In [None]:
kormotech_html = cleaned_kormotech_soup.prettify()


'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <!-- Google Tag Manager -->\n  <!-- End Google Tag Manager -->\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>\n  <link href="/build/website/img/app/favicon.png" rel="shortcut icon" type="image/x-icon"/>\n  <title>\n   Pet Food Manufacturer | 5 Brands in 40 Countries | Kormotech\n  </title>\n  <meta content="High-quality pet food from the Ukrainian pet food manufacturer - Kormotech. Taking care of your pets is a part of every Kormotech food. Join us!" name="description">\n   <meta content="index,follow" name="robots">\n    <link href="https://kormotech.com/uk" hreflang="uk" rel="alternate">\n     <link href="https://kormotech.com/uk" hreflang="x-default" rel="alternate"/>\n     <link href="https://kormotech.com/ru" hreflang="ru" rel="alternate"/>\n     <link href="https://kormotech.com/" hreflang="en" rel="alternat

In [75]:
normalized_kormotech_html = normalize_whitespace(kormotech_html)

In [76]:
normalized_kormotech_html

'<!DOCTYPE html> <html lang="en">  <head>  <!-- Google Tag Manager -->  <!-- End Google Tag Manager -->  <meta charset="utf-8"/>  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>  <link href="/build/website/img/app/favicon.png" rel="shortcut icon" type="image/x-icon"/>  <title>  Pet Food Manufacturer | 5 Brands in 40 Countries | Kormotech  </title>  <meta content="High-quality pet food from the Ukrainian pet food manufacturer - Kormotech. Taking care of your pets is a part of every Kormotech food. Join us!" name="description">  <meta content="index,follow" name="robots">  <link href="https://kormotech.com/uk" hreflang="uk" rel="alternate">  <link href="https://kormotech.com/uk" hreflang="x-default" rel="alternate"/>  <link href="https://kormotech.com/ru" hreflang="ru" rel="alternate"/>  <link href="https://kormotech.com/" hreflang="en" rel="alternate"/>  <link href="https://kormotech.com/lt" h

In [78]:
splitted_kormotech_html = html_text_splitter(normalized_kormotech_html)

In [79]:
splitted_kormotech_html

[Document(metadata={'Header 1': '#TITLE#'}, page_content='Google Tag Manager (noscript)  End Google Tag Manager (noscript)  EN  UK  LT  About us  Values and story  Social responsibility  Team  Kormotech in the media  Brands & PL  Delickcious  Optimeal  Club 4 Paws  My love  My Love  Master  Private Label  News  Manufacture  Openings  Contacts  Use this tag for showing background without video  <div class="hero__bg hero__shadow" style="background-image: url(\'./img/home/hero-1920.webp\')"></div>  Dream  Think  Care  We dream of seeing  healthy pets in'),
 Document(metadata={'Header 1': '#TITLE#'}, page_content='of seeing  healthy pets in happy  families  Think of our four-pawed friends!  We are improving the pet  industry around the world  We care for the  environment and support  social initiatives  More about Kormotech  More about Kormotech'),
 Document(metadata={'Header 2': 'What is Kormotech?'}, page_content='What is Kormotech?  Global cat and dog food manufacturer  Family business 