In [1]:
import os
import chromadb
import openai
import tiktoken
from chromadb.utils import embedding_functions

from chunking_evaluation.utils import openai_token_count
from chunking_evaluation.chunking import ClusterSemanticChunker, LLMSemanticChunker, FixedTokenChunker
from chunking_evaluation.chunking import RecursiveTokenChunker, KamradtModifiedChunker

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import TokenTextSplitter

In [2]:
main_path = "/home/aswath/Projects/deep_learning/backup_brain/test_2/"
input_path = main_path + "input/outsiders.txt"

with open(input_path, 'r') as file:
    document = file.read()

In [3]:
print(document[:1000])

The OUTSIDERS



Introduction

An Intelligent Iconoclasm

It is impossible to produce superior performance unless you do something different.

—John Templeton


The New Yorker’s Atul Gawande uses the term positive deviant to describe unusually effective performers in the field of medicine. To Gawande, it is natural that we should study these outliers in order to learn from them and improve performance.1

Surprisingly, in business the best are not studied as closely as in other fields like medicine, the law, politics, or sports. After studying Henry Singleton, I began, with the help of a talented group of Harvard MBA students, to look for other cases where one company handily beat both its peers and Jack Welch (in terms of relative market performance). It turned out, as Warren Buffett’s quote in the preface suggests, that these companies (and CEOs) were rare as hen’s teeth. After extensive searching in databases at Harvard Business School’s Baker Library, we came across only seven other

### General Helper Functions

In [4]:
def analyze_chunks(chunks, use_tokens=False):
    # Print the chunks of interest
    print("\nNumber of Chunks:", len(chunks))
    print("\n", "="*50, "10th Chunk", "="*50,"\n", chunks[9])
    print("\n", "="*50, "11st Chunk", "="*50,"\n", chunks[10])
    
    chunk1, chunk2 = chunks[9], chunks[10]
    
    if use_tokens:
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens1 = encoding.encode(chunk1)
        tokens2 = encoding.encode(chunk2)
        
        # Find overlapping tokens
        for i in range(len(tokens1), 0, -1):
            if tokens1[-i:] == tokens2[:i]:
                overlap = encoding.decode(tokens1[-i:])
                print("\n", "="*50, f"\nOverlapping text ({i} tokens):", overlap)
                return
        print("\nNo token overlap found")
    else:
        # Find overlapping characters
        for i in range(min(len(chunk1), len(chunk2)), 0, -1):
            if chunk1[-i:] == chunk2[:i]:
                print("\n", "="*50, f"\nOverlapping text ({i} chars):", chunk1[-i:])
                return
        print("\nNo character overlap found")

### Simple Split

In [5]:
def chunk_text(document, chunk_size, overlap):
    chunks = []
    stride = chunk_size - overlap
    current_idx = 0
    
    while current_idx < len(document):
        # Take chunk_size characters starting from current_idx
        chunk = document[current_idx:current_idx + chunk_size]
        if not chunk:  # Break if we're out of text
            break
        chunks.append(chunk)
        current_idx += stride  # Move forward by stride
    
    return chunks

In [6]:
simp_chunks = chunk_text(document, chunk_size=400, overlap=100)
analyze_chunks(simp_chunks)


Number of Chunks: 1059

 . Only two had MBAs. As a group, they did not attract or seek the spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

As a group, they shared old-fashioned, premodern values including frugality, humility, independence, and an unusual combination of conservatism and boldness. They typically worked out of

  independence, and an unusual combination of conservatism and boldness. They typically worked out of bare-bones offices (of which they were inordinately proud), generally eschewed perks such as corporate planes, avoided the spotlight wherever possible, and rarely communicated with Wall Street or the business press. They also actively avoided bankers and other advisers, preferring their own counsel

Overlapping text (100 chars):  independence, and an unusual combination of conservatism and boldness. They typically worked out of


### Token Split

In [7]:
def count_tokens(text, model="cl100k_base"):
    encoder = tiktoken.get_encoding(model)
    return print(f"Number of tokens: {len(encoder.encode(text))}")

In [8]:
encoder = tiktoken.get_encoding("cl100k_base")

text = "humpty dumpty sat on the floor"
tokens = encoder.encode(text)

print("Tokens:", tokens)

for i in range(len(tokens)):
    print(f"Token {i+1}:", encoder.decode([tokens[i]]))

print("Full Decoding: ", encoder.decode(tokens))

Tokens: [28400, 1625, 63811, 1625, 7731, 389, 279, 6558]
Token 1: hum
Token 2: pty
Token 3:  dum
Token 4: pty
Token 5:  sat
Token 6:  on
Token 7:  the
Token 8:  floor
Full Decoding:  humpty dumpty sat on the floor


In [9]:
fixed_token_chunker = FixedTokenChunker(chunk_size=68, chunk_overlap=8, encoding_name="cl100k_base")
token_chunks = fixed_token_chunker.split_text(document)
analyze_chunks(token_chunks, use_tokens=True)


Number of Chunks: 1048

  spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

As a group, they shared old-fashioned, premodern values including frugality, humility, independence, and an unusual combination of conservatism and boldness. They typically worked out of bare-bones offices (of

  out of bare-bones offices (of which they were inordinately proud), generally eschewed perks such as corporate planes, avoided the spotlight wherever possible, and rarely communicated with Wall Street or the business press. They also actively avoided bankers and other advisers, preferring their own counsel and that of a select group around them. Ben Franklin would

Overlapping text (8 tokens):  out of bare-bones offices (of


In [10]:
count_tokens(token_chunks[0])

Number of tokens: 68


### Recursive Character Split

In [11]:
recursive_character_chunker = RecursiveTokenChunker(chunk_size=400, chunk_overlap=100, length_function=len, separators=["\n\n", "\n", ".", "?", "!", " ", ""])
rec_ch_chunks = recursive_character_chunker.split_text(document)
analyze_chunks(rec_ch_chunks, use_tokens=False)


Number of Chunks: 1253

 The residents of Singletonville, however, represent a refreshing rejoinder to this stereotype. All were first-time CEOs, most with very little prior management experience. Not one came to the job from a high-profile position, and all but one were new to their industries and companies. Only two had MBAs. As a group, they did not attract or seek the spotlight

 . Only two had MBAs. As a group, they did not attract or seek the spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

Overlapping text (75 chars): . Only two had MBAs. As a group, they did not attract or seek the spotlight


In [12]:
len(rec_ch_chunks[71])

149

### Recursive Token Split

In [85]:
recursive_token_chunker = RecursiveTokenChunker(chunk_size=120, chunk_overlap=12, length_function=openai_token_count, separators=["\n\n", "\n", ".", "?", "!", " ", ""])
rec_tk_chunks = recursive_token_chunker.split_text(document)
analyze_chunks(rec_tk_chunks, use_tokens=True)


Number of Chunks: 785

 The word iconoclast is derived from Greek and means “smasher of icons.” The word has evolved to have the more general meaning of someone who is determinedly different, proudly eccentric. The original iconoclasts came from outside the societies (and temples) where icons resided; they were challengers of societal norms and conventions, and they were much feared in ancient Greece

 . The CEOs profiled in this book were not nearly so fearsome, but they did share interesting similarities with their ancient forbears: they were also outsiders, disdaining long-accepted conventional approaches (like paying dividends or avoiding share repurchases) and relishing their unorthodoxy.

No token overlap found


### Semantic Chunker

#### Lang Chain Semantic Chunker

In [17]:
openai.api_key = 'dummy_val'

In [18]:
embedding_function = embedding_functions.OpenAIEmbeddingFunction(api_key=openai.api_key, model_name="text-embedding-3-large")

In [19]:
lc_semantic_chunker = SemanticChunker(OpenAIEmbeddings(api_key=openai.api_key))
lc_semantic_chunks = lc_semantic_chunker.create_documents([document])

In [20]:
print("# of Chunks:", len(lc_semantic_chunks), "\n")
print(lc_semantic_chunks[9].page_content)
print("\n\n", "="*50, "\n\n")
print(lc_semantic_chunks[10].page_content)
print("\n\n", "="*50, "\n\n")

count_tokens(lc_semantic_chunks[9].page_content)
count_tokens(lc_semantic_chunks[10].page_content)

# of Chunks: 111 

. take a bite of something else.”3 What’s interesting, however, is that his peers at other media companies didn’t follow this path. Rather, they tended, like CBS, to follow fashion and diversify into unrelated businesses, build large corporate staffs, and overpay for marquee media properties. Capital Cities under Murphy was an extremely successful example of what we would now call a roll-up. In a typical roll-up, a company acquires a series of businesses, attempts to improve operations, and then keeps acquiring, benefiting over time from scale advantages and best management practices. This concept came into vogue in the mid- to late 1990s and flamed out in the early 2000s as many of the leading companies collapsed under the burden of too much debt. These companies typically failed because they acquired too rapidly and underestimated the difficulty and importance of integrating acquisitions and improving operations. Murphy’s approach to the roll-up was different. He m

In [21]:
len(lc_semantic_chunks[9].page_content)

19159

#### Greg Kamradt Semantic Chunker

In [104]:
kamradt_chunker = KamradtModifiedChunker(avg_chunk_size=85, min_chunk_size=8, embedding_function=embedding_function)
modified_kamradt_chunks = kamradt_chunker.split_text(document)

In [105]:
analyze_chunks(modified_kamradt_chunks, use_tokens=True)
print("\n\n", "="*50, "\n\n")
count_tokens(modified_kamradt_chunks[9])
count_tokens(modified_kamradt_chunks[10])


Number of Chunks: 756

 The residents of Singletonville, however, represent a refreshing rejoinder to this stereotype . All were first-time CEOs, most

 with very little prior management experience . Not one came to the job from a high-profile position, and all but one were new to their industries and companies . Only two had MBAs . As a group, they did not attract or seek the spotlight . Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados . As a group, they shared old-fashioned, premodern values including

No token overlap found




Number of tokens: 24
Number of tokens: 87


#### Cluster Semantic Chunker

In [128]:
cluster_chunker = ClusterSemanticChunker(embedding_function=embedding_function, max_chunk_size=210, length_function=openai_token_count)
cluster_chunker_chunks = cluster_chunker.split_text(document)

In [129]:
analyze_chunks(cluster_chunker_chunks, use_tokens=True)


Number of Chunks: 750

 Like Singleton, these CEOs consistently made very different decisions than their peers did. They were not, however, blindly contrarian . Theirs was an intelligent iconoclasm informed by careful analysis and often expressed in unusual financial metrics that were distinctly different from industry or Wall Street conventions.

 In this way, their iconoclasm was similar to Billy Beane’s as described by Michael Lewis in Moneyball .2 Beane, the general manager of the perennially cash-strapped Oakland A’s baseball team, used statistical analysis to gain an edge over his better-heeled competitors . His approach centered on new metrics—on-base and slugging percentages—that correlated more highly with team winning percentage than the traditional statistical troika of home runs, batting average, and runs batted in. Beane’s analytical insights influenced every aspect of how he ran the A’s—from drafting and trading strategies to whether or not to steal bases or use sacrific

In [132]:
length = 0
for i in cluster_chunker_chunks:
    length += len(i)
print(length/len(cluster_chunker_chunks))

422.776


### Testing Chunks

In [30]:
llm = ChatOpenAI(temperature=0.0, model="gpt-4o", api_key= openai.api_key)

simp_chunks, token_chunks, rec_ch_chunks, rec_tk_chunks, lc_semantic_chunks, modified_kamradt_chunks, cluster_chunker_chunks, llm_chunker_chunks

In [31]:
def add_chunks(texts, collection):
    add_count = 0
    for text in texts:
        collection.add(documents=[text], ids=f"chunk_{add_count}")
        add_count += 1

def chroma_retrieval(query, collection, num_results=15):
    results = collection.query(query_texts=[query], n_results=num_results)
    return results

def chroma_rag(query, collection):
    retrieved_docs = chroma_retrieval(query, collection)["documents"][0]
    response = rag_chain.invoke({"retrieved_docs": retrieved_docs, "query": query})
    return retrieved_docs, response

In [32]:
client_path = main_path + "/notebook/chromadb"
chroma_client = chromadb.PersistentClient(path=client_path)

In [33]:
rag_prompt_template = """
Generate a response that responds to the user's question, summarizing all information in the input data tables, and incorporating any relevant general knowledge.

Do not include information where the supporting evidence for it is not provided.

Context: {retrieved_docs}

User Question: {query}

"""

#### Simple Chunking

In [34]:
simple_collection = chroma_client.get_or_create_collection(name="simple_collection")
# add_chunks(simp_chunks, simple_collection)

In [35]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [36]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", simple_collection)

In [37]:
print(response)

The input data highlights the characteristics and strategies of successful CEOs, often referred to as "outsider CEOs," who have demonstrated effective leadership through unconventional approaches. These CEOs are not necessarily the most charismatic or technically skilled, but they excel in resource allocation and long-term value creation. Here are some key reasons why these CEOs are considered among the best:

1. **Investor Mindset**: These CEOs think more like investors than traditional managers. They focus on optimizing long-term value per share rather than just organizational growth. This perspective allows them to make decisions that prioritize shareholder value.

2. **Resource Allocation**: They emphasize careful deployment of both financial and human resources. This includes leading the allocation process themselves rather than delegating it, ensuring that every investment is scrutinized for its potential return.

3. **Focus on Cash Flow**: Unlike many public company CEOs who foc

#### Token Chunking

In [38]:
token_collection = chroma_client.get_or_create_collection(name="token_collection")
# add_chunks(token_chunks, token_collection)

In [39]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [40]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", token_collection)

In [41]:
print(response)

The context provided discusses the characteristics and philosophies of effective CEOs, particularly those who are considered "outsider CEOs." These CEOs are noted for their unconventional approaches and focus on optimizing long-term value per share rather than pursuing organizational growth for its own sake. Here are some key reasons why these CEOs are considered effective:

1. **Capital Allocation Focus**: These CEOs prioritize understanding and optimizing capital allocation. They carefully deploy company resources to create shareholder value, always considering the return on investment and proceeding only with projects that offer attractive returns based on conservative assumptions.

2. **Investor Mindset**: Unlike traditional managers, these CEOs think more like investors. They have confidence in their analytical skills and are prepared to act boldly when they identify discrepancies between value and price. For example, they might buy back stock when it is undervalued.

3. **Long-te

#### Recursive Character Chunking

In [43]:
rec_ch_collection = chroma_client.get_or_create_collection(name="rec_ch_collection")
add_chunks(rec_ch_chunks, rec_ch_collection)

In [None]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [44]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", rec_ch_collection)

In [45]:
print(response)

The book discusses a group of CEOs who are considered exceptional due to their unconventional management philosophies and strategies. These CEOs are characterized by their focus on optimizing long-term value per share rather than pursuing organizational growth for its own sake. They think more like investors than traditional managers, emphasizing careful resource allocation and capital deployment.

Key reasons these CEOs are considered among the best include:

1. **Investor Mindset**: They approach their roles with an investor's perspective, focusing on discrepancies between value and price and acting boldly when opportunities arise.

2. **Long-term Value Focus**: Their primary goal is to maximize long-term value per share, which contrasts with the common business urge to prioritize growth and short-term earnings.

3. **Resource Allocation**: They excel in allocating both financial and human resources, often employing flat organizational structures and lean corporate staffs to enhance 

#### Recursive Token Chunking

In [91]:
rec_token_collection = chroma_client.get_or_create_collection(name="rec_token_collection")
add_chunks(rec_tk_chunks, rec_token_collection)

In [92]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [93]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", rec_token_collection)

In [94]:
print(response)

The book discussed in the context does not single out specific CEOs as the "best" but rather highlights a group of CEOs who shared a distinctive and effective managerial philosophy. These CEOs are characterized by their focus on capital allocation and long-term value creation rather than short-term growth or charismatic leadership. Here are the key reasons behind their effectiveness:

1. **Investor Mindset**: These CEOs thought more like investors than traditional managers. They were analytical and willing to act boldly when they identified discrepancies between value and price. This approach allowed them to make strategic decisions, such as buying back stock when it was undervalued or using it to acquire other companies when it was overvalued.

2. **Capital Allocation**: A central theme was their disciplined approach to capital allocation. They consistently directed resources toward high-return projects and avoided value-destroying investments. This focus on efficient capital use was 

#### LC Semantic Chunker

In [59]:
lc_sem_chunks = [doc.page_content for doc in lc_semantic_chunks]

In [60]:
lc_sem_collection = chroma_client.get_or_create_collection(name="lc_sem_collection")
add_chunks(lc_sem_chunks, lc_sem_collection)

In [61]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [65]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", lc_sem_collection)

In [66]:
print(response)

The best CEOs, as highlighted in the context provided, are those who have demonstrated exceptional performance by thinking differently and focusing on long-term value creation rather than short-term gains. These CEOs, often referred to as "outsiders," include figures like Warren Buffett, Henry Singleton, and John Malone. Here are the key reasons behind their success:

1. **Capital Allocation Mastery**: These CEOs excelled in capital allocation, a critical skill for creating shareholder value. They focused on deploying resources where they would generate the highest returns, often thinking more like investors than traditional managers. For instance, Warren Buffett's approach at Berkshire Hathaway involved investing in high-return businesses and avoiding low-return ones, which significantly boosted the company's long-term performance.

2. **Contrarian Thinking**: The outsider CEOs often went against conventional wisdom. During periods of economic uncertainty, such as the 1974–1982 econom

#### Modified Kamradt Chunks

In [109]:
mod_kamradt_collections = chroma_client.get_or_create_collection(name="mod_kamradt_collections")
add_chunks(modified_kamradt_chunks, mod_kamradt_collections)

In [110]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [111]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", mod_kamradt_collections)

In [112]:
print(response)

The input data highlights a group of CEOs known for their unconventional and highly effective management styles, often referred to as "outsider CEOs." These leaders are characterized by their focus on capital allocation, decentralization, and long-term value creation, rather than short-term profit maximization or charismatic leadership.

1. **Capital Allocation**: These CEOs prioritize careful deployment of resources to maximize shareholder value. They focus on both the numerator (total company value) and the denominator (shares outstanding) by making strategic investments and share repurchases. This approach is exemplified by ExxonMobil's aggressive stock buybacks, which were made as investments rather than to manipulate stock prices.

2. **Decentralization**: A hallmark of their management style is running highly decentralized organizations. This involves granting significant autonomy to local managers, as seen in companies like Capital Cities, where decisions are made at the local l

#### Cluster Chunker

In [133]:
cluster_chunks_collection = chroma_client.get_or_create_collection(name="cluster_chunks_collections")
add_chunks(cluster_chunker_chunks, cluster_chunks_collection)

In [134]:
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [135]:
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", mod_kamradt_collections)

In [136]:
print(response)

The best CEOs, as highlighted in the context, are those who adopt an unconventional and highly effective approach to management and capital allocation. These CEOs, often referred to as "outsider CEOs," share several key characteristics that set them apart:

1. **Radical Rationality and Long-term Perspective**: These CEOs think like long-term investors rather than high-paid employees. They focus on capital allocation and resource deployment to create shareholder value, always considering the return on investment before proceeding with projects.

2. **Decentralization and Autonomy**: They run highly decentralized organizations, granting significant autonomy to local managers. This approach acknowledges that real value is often created at the local level, not at headquarters. Capital Cities is a prime example, where decentralization was a core philosophy.

3. **Focus on Cash Flow and Shareholder Value**: Instead of chasing quarterly earnings, these CEOs prioritize cash flow and shareholde