In [150]:
import os
import chromadb
import openai
import tiktoken
from chromadb.utils import embedding_functions

from chunking_evaluation.utils import openai_token_count
from chunking_evaluation.chunking import ClusterSemanticChunker, LLMSemanticChunker, FixedTokenChunker
from chunking_evaluation.chunking import RecursiveTokenChunker, KamradtModifiedChunker

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import TokenTextSplitter

In [36]:
# NOTE(review): absolute, machine-specific project root; adjust before running
# this notebook on another machine.
main_path = "/home/aswath/Projects/deep_learning/backup_brain/test_2/"
input_path = main_path + "input/outsiders.txt"

# The book text contains non-ASCII punctuation (curly quotes, em dashes), so
# decode explicitly as UTF-8 instead of relying on the platform default.
with open(input_path, 'r', encoding="utf-8") as file:
    document = file.read()

In [37]:
# Preview the first 1000 characters of the loaded text.
print(document[:1000])

The OUTSIDERS



Introduction

An Intelligent Iconoclasm

It is impossible to produce superior performance unless you do something different.

—John Templeton


The New Yorker’s Atul Gawande uses the term positive deviant to describe unusually effective performers in the field of medicine. To Gawande, it is natural that we should study these outliers in order to learn from them and improve performance.1

Surprisingly, in business the best are not studied as closely as in other fields like medicine, the law, politics, or sports. After studying Henry Singleton, I began, with the help of a talented group of Harvard MBA students, to look for other cases where one company handily beat both its peers and Jack Welch (in terms of relative market performance). It turned out, as Warren Buffett’s quote in the preface suggests, that these companies (and CEOs) were rare as hen’s teeth. After extensive searching in databases at Harvard Business School’s Baker Library, we came across only seven other

### General Helper Functions

In [54]:
def analyze_chunks(chunks, use_tokens=False):
    """Print the 10th and 11th chunks and the text on which they overlap.

    The overlap is the longest suffix of the 10th chunk that is also a prefix
    of the 11th chunk.

    Args:
        chunks: Sequence of text chunks (strings).
        use_tokens: When True, both chunks are encoded with tiktoken's
            ``cl100k_base`` encoding and the overlap is searched over token
            ids; otherwise it is searched over characters.

    Returns:
        None. Everything is reported through ``print``.
    """
    print("\nNumber of Chunks:", len(chunks))

    # The comparison is hard-wired to indices 9 and 10; report short inputs
    # instead of failing with IndexError.
    if len(chunks) < 11:
        print("\nNeed at least 11 chunks to compare the 10th and 11th chunks")
        return

    chunk1, chunk2 = chunks[9], chunks[10]
    print("\n", "="*50, "10th Chunk", "="*50, "\n", chunk1)
    print("\n", "="*50, "11th Chunk", "="*50, "\n", chunk2)

    if use_tokens:
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens1 = encoding.encode(chunk1)
        tokens2 = encoding.encode(chunk2)

        # An overlap can never be longer than the shorter token sequence;
        # try the longest candidate first so the first hit is the maximum.
        for i in range(min(len(tokens1), len(tokens2)), 0, -1):
            if tokens1[-i:] == tokens2[:i]:
                overlap = encoding.decode(tokens1[-i:])
                print("\n", "="*50, f"\nOverlapping text ({i} tokens):", overlap)
                return
        print("\nNo token overlap found")
    else:
        # Same longest-first search, on characters.
        for i in range(min(len(chunk1), len(chunk2)), 0, -1):
            if chunk1[-i:] == chunk2[:i]:
                print("\n", "="*50, f"\nOverlapping text ({i} chars):", chunk1[-i:])
                return
        print("\nNo character overlap found")

### Simple Split

In [39]:
def chunk_text(document, chunk_size, overlap):
    """Split ``document`` into fixed-size character chunks with overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        document: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (empty when ``document`` is empty).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A stride of zero would never advance and a negative one would walk
    # backwards, so reject these up front.
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    stride = chunk_size - overlap
    return [
        document[start:start + chunk_size]
        for start in range(0, len(document), stride)
    ]

In [57]:
# Fixed-size character split: 400-character chunks, 100 characters of overlap.
simp_chunks = chunk_text(document, chunk_size=400, overlap=100)
# Show chunks 10 and 11 and their character-level overlap.
analyze_chunks(simp_chunks)


Number of Chunks: 1059

 . Only two had MBAs. As a group, they did not attract or seek the spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

As a group, they shared old-fashioned, premodern values including frugality, humility, independence, and an unusual combination of conservatism and boldness. They typically worked out of

  independence, and an unusual combination of conservatism and boldness. They typically worked out of bare-bones offices (of which they were inordinately proud), generally eschewed perks such as corporate planes, avoided the spotlight wherever possible, and rarely communicated with Wall Street or the business press. They also actively avoided bankers and other advisers, preferring their own counsel

Overlapping text (100 chars):  independence, and an unusual combination of conservatism and boldness. They typically worked out of


### Token Split

In [58]:
def count_tokens(text, model="cl100k_base"):
    """Print how many tokens ``text`` encodes to.

    Args:
        text: The string to tokenize.
        model: Name handed to ``tiktoken.get_encoding`` (an encoding name
            such as the default ``"cl100k_base"``).

    Returns:
        None. The count is only printed, not returned.
    """
    token_ids = tiktoken.get_encoding(model).encode(text)
    print(f"Number of tokens: {len(token_ids)}")
    return None

In [59]:
# Small demonstration of how cl100k_base breaks a sentence into tokens.
encoder = tiktoken.get_encoding("cl100k_base")

text = "humpty dumpty sat on the floor"
tokens = encoder.encode(text)

print("Tokens:", tokens)

# Decode every id on its own to show the text fragment it stands for.
for position, token_id in enumerate(tokens, start=1):
    print(f"Token {position}:", encoder.decode([token_id]))

# Decoding the full id list should reproduce the input sentence.
print("Full Decoding: ", encoder.decode(tokens))

Tokens: [28400, 1625, 63811, 1625, 7731, 389, 279, 6558]
Token 1: hum
Token 2: pty
Token 3:  dum
Token 4: pty
Token 5:  sat
Token 6:  on
Token 7:  the
Token 8:  floor
Full Decoding:  humpty dumpty sat on the floor


In [192]:
# Fixed-size token split: chunk_size=68 and chunk_overlap=8 with the
# cl100k_base encoding.
fixed_token_chunker = FixedTokenChunker(chunk_size=68, chunk_overlap=8, encoding_name="cl100k_base")
token_chunks = fixed_token_chunker.split_text(document)
# Show chunks 10 and 11 and their token-level overlap.
analyze_chunks(token_chunks, use_tokens=True)


Number of Chunks: 1048

  spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

As a group, they shared old-fashioned, premodern values including frugality, humility, independence, and an unusual combination of conservatism and boldness. They typically worked out of bare-bones offices (of

  out of bare-bones offices (of which they were inordinately proud), generally eschewed perks such as corporate planes, avoided the spotlight wherever possible, and rarely communicated with Wall Street or the business press. They also actively avoided bankers and other advisers, preferring their own counsel and that of a select group around them. Ben Franklin would

Overlapping text (8 tokens):  out of bare-bones offices (of


In [193]:
# Print the token count of the first fixed-token chunk.
count_tokens(token_chunks[0])

Number of tokens: 68


### Recursive Character Split

In [62]:
# Recursive split over the listed separators. length_function=len, so
# chunk_size / chunk_overlap are presumably measured in characters here
# (TODO confirm against RecursiveTokenChunker).
recursive_character_chunker = RecursiveTokenChunker(chunk_size=400, chunk_overlap=100, length_function=len, separators=["\n\n", "\n", ".", "?", "!", " ", ""])
rec_ch_chunks = recursive_character_chunker.split_text(document)
# Show chunks 10 and 11 and their character-level overlap.
analyze_chunks(rec_ch_chunks, use_tokens=False)


Number of Chunks: 1253

 The residents of Singletonville, however, represent a refreshing rejoinder to this stereotype. All were first-time CEOs, most with very little prior management experience. Not one came to the job from a high-profile position, and all but one were new to their industries and companies. Only two had MBAs. As a group, they did not attract or seek the spotlight

 . Only two had MBAs. As a group, they did not attract or seek the spotlight. Rather, they labored in relative obscurity and were generally appreciated by only a handful of sophisticated investors and aficionados.

Overlapping text (75 chars): . Only two had MBAs. As a group, they did not attract or seek the spotlight


In [63]:
# Character length of one sample chunk (index 71).
len(rec_ch_chunks[71])

149

### Recursive Token Split

In [64]:
# Same recursive splitter and separators, but sized with openai_token_count,
# so chunk_size / chunk_overlap are presumably measured in tokens
# (TODO confirm against RecursiveTokenChunker).
recursive_token_chunker = RecursiveTokenChunker(chunk_size=400, chunk_overlap=100, length_function=openai_token_count, separators=["\n\n", "\n", ".", "?", "!", " ", ""])
rec_tk_chunks = recursive_token_chunker.split_text(document)
# Show chunks 10 and 11 and their token-level overlap.
analyze_chunks(rec_tk_chunks, use_tokens=True)


Number of Chunks: 205

 In all cases, this led the outsider CEOs to focus on cash flow and to forgo the blind pursuit of the Wall Street holy grail of reported earnings. Most public company CEOs focus on maximizing quarterly reported net income, which is understandable since that is Wall Street’s preferred metric. Net income, however, is a bit of a blunt instrument and can be significantly distorted by differences in debt levels, taxes, capital expenditures, and past acquisition history.

As a result, the outsiders (who often had complicated balance sheets, active acquisition programs, and high debt levels) believed the key to long-term value creation was to optimize free cash flow, and this emphasis on cash informed all aspects of how they ran their companies—from the way they paid for acquisitions and managed their balance sheets to their accounting policies and compensation systems.

This single-minded cash focus was the foundation of their iconoclasm, and it invariably led to a la

### Semantic Chunker

#### LangChain Semantic Chunker

In [87]:
# Read the key from the environment so a real credential never has to be
# written into the notebook; 'dummy_val' remains the placeholder fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'dummy_val')

In [84]:
# Chroma embedding function configured for the text-embedding-3-large model;
# passed to the Kamradt and cluster chunkers further down.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(api_key=openai.api_key, model_name="text-embedding-3-large")

In [86]:
# LangChain SemanticChunker driven by OpenAIEmbeddings. create_documents
# returns objects whose text is read through .page_content in later cells.
lc_semantic_chunker = SemanticChunker(OpenAIEmbeddings(api_key=openai.api_key))
lc_semantic_chunks = lc_semantic_chunker.create_documents([document])

In [90]:
# Print the chunk count, then the text of chunks 10 and 11, each followed by
# a separator line.
print("# of Chunks:", len(lc_semantic_chunks), "\n")
print(lc_semantic_chunks[9].page_content)
print("\n\n", "="*50, "\n\n")
print(lc_semantic_chunks[10].page_content)
print("\n\n", "="*50, "\n\n")

# Print the token count of both chunks.
count_tokens(lc_semantic_chunks[9].page_content)
count_tokens(lc_semantic_chunks[10].page_content)

# of Chunks: 111 

. take a bite of something else.”3 What’s interesting, however, is that his peers at other media companies didn’t follow this path. Rather, they tended, like CBS, to follow fashion and diversify into unrelated businesses, build large corporate staffs, and overpay for marquee media properties. Capital Cities under Murphy was an extremely successful example of what we would now call a roll-up. In a typical roll-up, a company acquires a series of businesses, attempts to improve operations, and then keeps acquiring, benefiting over time from scale advantages and best management practices. This concept came into vogue in the mid- to late 1990s and flamed out in the early 2000s as many of the leading companies collapsed under the burden of too much debt. These companies typically failed because they acquired too rapidly and underestimated the difficulty and importance of integrating acquisitions and improving operations. Murphy’s approach to the roll-up was different. He m

In [94]:
# Character length of the 10th semantic chunk.
len(lc_semantic_chunks[9].page_content)

19159

#### Greg Kamradt Semantic Chunker

In [98]:
# Kamradt-style semantic chunker using the OpenAI embedding function, with
# avg_chunk_size=400 and min_chunk_size=50.
kamradt_chunker = KamradtModifiedChunker(avg_chunk_size=400, min_chunk_size=50, embedding_function=embedding_function)
modified_kamradt_chunks = kamradt_chunker.split_text(document)

In [102]:
# Show chunks 10 and 11 with their token-level overlap, then print the token
# count of each.
analyze_chunks(modified_kamradt_chunks, use_tokens=True)
print("\n\n", "="*50, "\n\n")
count_tokens(modified_kamradt_chunks[9])
count_tokens(modified_kamradt_chunks[10])


Number of Chunks: 158


 . Although they arrived at their management philosophies independently, what’s striking is how remarkably similar the ingredients were across this group of executives despite widely varying industries and circumstances. Each ran a highly decentralized organization; made at least one very large acquisition; developed unusual, cash flow–based metrics; and bought back a significant amount of stock. None paid meaningful dividends or provided Wall Street guidance . All received the same combination of derision, wonder, and skepticism from their peers and the business press. All also enjoyed eye-popping, credulity-straining performance over very long tenures (twenty-plus years on average). The business world has traditionally divided itself into two basic camps: those who run companies and those who invest in them . The lessons of these iconoclastic CEOs suggest a new, more nuanced conception of the chief executive’s job, with less emphasis placed on charismatic lea

#### Cluster Semantic Chunker

In [103]:
# Cluster-based semantic chunker using the OpenAI embedding function;
# max_chunk_size=400 as measured by openai_token_count.
cluster_chunker = ClusterSemanticChunker(embedding_function=embedding_function, max_chunk_size=400, length_function=openai_token_count)
cluster_chunker_chunks = cluster_chunker.split_text(document)

In [104]:
# Show chunks 10 and 11 and their token-level overlap.
analyze_chunks(cluster_chunker_chunks, use_tokens=True)


Number of Chunks: 481

 .


No token overlap found


In [109]:
# Print every cluster chunk, each followed by a dashed divider.
divider = "\n\n" + "-"*100 + "\n\n"
for chunk in cluster_chunker_chunks:
    print(chunk)
    print(divider)

The OUTSIDERS



Introduction

An Intelligent Iconoclasm

It is impossible to produce superior performance unless you do something different.

—John Templeton The New Yorker’s Atul Gawande uses the term positive deviant to describe unusually effective performers in the field of medicine. To Gawande, it is natural that we should study these outliers in order to learn from them and improve performance.1 Surprisingly, in business the best are not studied as closely as in other fields like medicine, the law, politics, or sports . After studying Henry Singleton, I began, with the help of a talented group of Harvard MBA students, to look for other cases where one company handily beat both its peers and Jack Welch (in terms of relative market performance) . It turned out, as Warren Buffett’s quote in the preface suggests, that these companies (and CEOs) were rare as hen’s teeth . After extensive searching in databases at Harvard Business School’s Baker Library, we came across only seven other

#### LLM Chunker

In [110]:
# LLM-driven chunker configured for OpenAI's gpt-4o.
llm_chunker = LLMSemanticChunker(organisation="openai", model_name="gpt-4o", api_key=openai.api_key)
llm_chunker_chunks = llm_chunker.split_text(document)

Processing chunks: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2026/2026 [03:51<00:00,  8.76it/s]


In [111]:
# Show chunks 10 and 11 and their token-level overlap.
analyze_chunks(llm_chunker_chunks, use_tokens=True)


Number of Chunks: 278

 A Distant Mirror: 1974–1982 In assessing the current relevance of these outsider CEOs, it’s worth looking at how each navigated the post–World War II period that looks most like today’s extended economic malaise: the brutal 1974–1982 period. That period featured a toxic combination of an external oil shock, disastrous fiscal and monetary policy, and the worst domestic political scandal in the nation’s history . This cocktail of negative news produced an eight-year period that saw crippling inflation, two deep recessions (and bear markets), 18 percent interest rates, a threefold increase in oil prices, and the first resignation of a sitting US president in over one hundred years . In the middle of this dark period, in August 1979, BusinessWeek famously ran a cover story titled “Are Equities Dead?” The times, like now, were so uncertain and scary that most managers sat on their hands, but for all the outsider CEOs it was among the most active periods of their car

### Testing Chunks

In [125]:
# Chat model used for answer generation in the RAG chain (temperature 0.0).
llm = ChatOpenAI(temperature=0.0, model="gpt-4o", api_key= openai.api_key)

# Chunk sets available for the retrieval tests. Display only their sizes:
# a bare tuple of the lists would render every chunk as cell output.
{
    "simp_chunks": len(simp_chunks),
    "token_chunks": len(token_chunks),
    "rec_ch_chunks": len(rec_ch_chunks),
    "rec_tk_chunks": len(rec_tk_chunks),
    "lc_semantic_chunks": len(lc_semantic_chunks),
    "modified_kamradt_chunks": len(modified_kamradt_chunks),
    "cluster_chunker_chunks": len(cluster_chunker_chunks),
    "llm_chunker_chunks": len(llm_chunker_chunks),
}

In [139]:
def add_chunks(texts, collection):
    """Add every text to ``collection`` as its own document.

    Each text is sent in a separate ``collection.add`` call with the id
    ``chunk_<n>``, where ``n`` is the zero-based position in ``texts``.

    Args:
        texts: Iterable of chunk texts.
        collection: Object exposing ``add(documents=..., ids=...)``.
    """
    for position, text in enumerate(texts):
        collection.add(documents=[text], ids=f"chunk_{position}")

def chroma_retrieval(query, collection, num_results=15):
    """Query ``collection`` with a single query text.

    Args:
        query: The query string.
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        num_results: Number of results to request (default 15).

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    return collection.query(query_texts=[query], n_results=num_results)

def chroma_rag(query, collection):
    """Retrieve chunks for ``query`` and generate an answer from them.

    Relies on the module-level ``rag_chain``, which must be defined before
    this function is called.

    Args:
        query: The user question.
        collection: Collection passed through to ``chroma_retrieval``.

    Returns:
        A ``(retrieved_docs, response)`` tuple: the first entry of the
        retrieval result's ``"documents"`` list and the output of
        ``rag_chain.invoke``.
    """
    hits = chroma_retrieval(query, collection)
    retrieved_docs = hits["documents"][0]
    payload = {"retrieved_docs": retrieved_docs, "query": query}
    return retrieved_docs, rag_chain.invoke(payload)

In [140]:
# main_path already ends with "/", so plain concatenation with "/notebook/..."
# produced a doubled separator; os.path.join builds the same directory path
# without it.
client_path = os.path.join(main_path, "notebook", "chromadb")
# On-disk Chroma client rooted at client_path.
chroma_client = chromadb.PersistentClient(path=client_path)

In [197]:
# Prompt for answer generation. {retrieved_docs} and {query} are the two
# placeholders filled from the dict that chroma_rag passes to rag_chain.invoke.
rag_prompt_template = """
Generate a response that responds to the user's question, summarizing all information in the input data tables, and incorporating any relevant general knowledge.

Do not include information where the supporting evidence for it is not provided.

Context: {retrieved_docs}

User Question: {query}

"""

#### Simple Chunking

In [None]:
simple_collection = chroma_client.get_or_create_collection(name="simple_collection")
# The client is persistent, so the collection may already hold the chunks
# from an earlier run. Populate it only when it is empty: a fresh store gets
# indexed, and a re-run does not add the same ids again.
if simple_collection.count() == 0:
    add_chunks(simp_chunks, simple_collection)

In [None]:
# Build the RAG pipeline: prompt template -> chat model -> output parser.
# rag_chain is the module-level name that chroma_rag invokes.
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [144]:
# Retrieve from the simple-chunk collection and generate an answer; docs holds
# the retrieved chunks, response the generated text.
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", simple_collection)

In [145]:
# Show the answer generated from the simple-chunk collection.
print(response)

The input data highlights a group of CEOs who are considered exceptional due to their unconventional approaches to leadership and resource allocation. These CEOs are characterized by their focus on long-term value creation rather than short-term organizational growth. They think more like investors than traditional managers, emphasizing careful deployment of both financial and human resources.

Key reasons these CEOs stand out include:

1. **Investor Mindset**: They prioritize optimizing long-term value per share over immediate growth, which aligns more with the perspective of a long-term investor or owner rather than a high-paid employee.

2. **Resource Allocation**: They lead the resource allocation process themselves, rather than delegating it to finance or business development teams. This hands-on approach ensures that decisions are made with a clear understanding of the company's strategic goals.

3. **Focus on Cash Flow**: Unlike many public company CEOs who focus on maximizing q

#### Token Chunking

In [198]:
token_collection = chroma_client.get_or_create_collection(name="token_collection")
# The client is persistent, so a re-run would send every chunk to the store
# again under the same ids. Populate the collection only when it is empty.
# NOTE(review): if token_chunks is regenerated with different parameters, the
# stored collection must be deleted first to pick up the new chunks.
if token_collection.count() == 0:
    add_chunks(token_chunks, token_collection)

In [199]:
# Build the RAG pipeline: prompt template -> chat model -> output parser.
# NOTE(review): identical to the definition in the Simple Chunking section;
# redundant if that cell has already run.
rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)
rag_chain = rag_prompt | llm | StrOutputParser()

In [200]:
# Same question as in the Simple Chunking section, answered from the
# fixed-token collection; docs and response are overwritten.
docs, response = chroma_rag("Give me the best CEO's and elaborate reasons behind it ?", token_collection)

In [201]:
# Show the answer generated from the fixed-token collection.
print(response)

The context provided discusses the characteristics and philosophies of effective CEOs, particularly those who are considered "outsider CEOs." These CEOs are noted for their unconventional approaches and focus on optimizing long-term value per share rather than pursuing organizational growth for its own sake. Here are some key reasons why these CEOs are considered effective:

1. **Capital Allocation Focus**: These CEOs prioritize understanding and optimizing capital allocation. They carefully deploy company resources to create shareholder value, always considering the return on investment and proceeding only with projects that offer attractive returns based on conservative assumptions.

2. **Investor Mindset**: Unlike traditional managers, these CEOs think more like investors. They have confidence in their analytical skills and are prepared to act boldly when they identify discrepancies between value and price. For example, they might buy back stock when it is undervalued.

3. **Long-te