# Enhanced code QA
This notebook expands a bit on the standard langchain code comprehension example

In [1]:
import os
import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma 

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')


## Clone github repository locally 

In [2]:
repo_url='https://github.com/twitter/the-algorithm'

In [3]:
from git import Repo

# Cloning the base repository

# if repo already exists, assign Repo(path), if not, run Repo.clone_from
# this conditional is here simply for convenience in case the notebook is run more than once
 
if os.path.exists(f'./repos/{repo_url.split("/")[-1]}'):
    repo = Repo(f'./repos/{repo_url.split("/")[-1]}')
else:
    repo = Repo.clone_from(
        repo_url, to_path=f'./repos/{repo_url.split("/")[-1]}'
    )
repo_path_local = f'./repos/{repo_url.split("/")[-1]}/'



## Loading the raw repository into documents

In [4]:
# allow user to specify the branch to use
# default to main if not specified

repo_branch = 'main'

In [5]:
from langchain.document_loaders import GitLoader

git_loader = GitLoader(repo_path=repo_path_local, branch=repo_branch)

In [6]:
code_documents = git_loader.load()

In [7]:
file_types = [doc.metadata['file_type'] for doc in code_documents]
file_types.sort()

# print the number of occurences of each element in the list
from collections import Counter

type_counts = Counter(file_types)
print(type_counts)



Counter({'.scala': 4142, '.java': 1043, '': 716, '.py': 180, '.bazel': 175, '.thrift': 114, '.proto': 90, '.md': 79, '.cpp': 51, '.h': 41, '.rs': 30, '.workflow': 25, '.sql': 23, '.d6w': 21, '.xml': 19, '.yml': 16, '.sh': 14, '.aurora': 13, '.ini': 8, '.rst': 8, '.toml': 4, '.cfg': 3, '.txt': 2, '.graphql': 1, '.json': 1})


In [8]:
# Create a dict with the number of characters per file type, as an easy proxy for token count

characters_by_file_type = {}
for doc in code_documents:
    file_type = doc.metadata['file_type']
    page_content_length = len(doc.page_content)

    if file_type in characters_by_file_type:
        characters_by_file_type[file_type] += page_content_length
    else:
        characters_by_file_type[file_type] = page_content_length

print(characters_by_file_type)


{'': 1126760, '.md': 157756, '.sh': 9162, '.bazel': 300422, '.ini': 1229, '.cfg': 221, '.py': 931783, '.toml': 3058, '.rs': 273037, '.rst': 61398, '.graphql': 294, '.yml': 104014, '.h': 86679, '.workflow': 27040, '.aurora': 64963, '.proto': 342255, '.xml': 96342, '.scala': 14857532, '.cpp': 366062, '.txt': 3092, '.sql': 49744, '.thrift': 563818, '.java': 5225205, '.json': 1352, '.d6w': 26701}


In [9]:
# sort dictionary by value
characters_by_file_type = dict(sorted(characters_by_file_type.items(), key=lambda item: item[1], reverse=True))
print(characters_by_file_type)

{'.scala': 14857532, '.java': 5225205, '': 1126760, '.py': 931783, '.thrift': 563818, '.cpp': 366062, '.proto': 342255, '.bazel': 300422, '.rs': 273037, '.md': 157756, '.yml': 104014, '.xml': 96342, '.h': 86679, '.aurora': 64963, '.rst': 61398, '.sql': 49744, '.workflow': 27040, '.d6w': 26701, '.sh': 9162, '.txt': 3092, '.toml': 3058, '.json': 1352, '.ini': 1229, '.graphql': 294, '.cfg': 221}


## This dictionary can be formatted and output to user in the UI.
While a standard filter can be applied that ignores certain types of files by default, giving the option for the user to filter out files to ignore after being presented with this information can drastically decrease the size of the vector database and potentially also lead to better output from the LLM.

## Select which file extensions to ignore 
Can be easily put into CLI or GUI, but done manually for the purposes of this notebook.

In [10]:
# doc_ignore_list = ['.snap','.spec','.lock']
# doc_ignore_types = set(doc_ignore_list)

We are not ignoring any files in this case, mainly so we can benchmark against the standard retrievalQA notebook

In [11]:
# relevant_code = [documents for documents in code_documents if documents.metadata['file_type'] not in doc_ignore_types]
relevant_code = code_documents

In [12]:
content = []
for doc in relevant_code:
    content.append(len(doc.page_content))    
print(sorted(content))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 18, 30, 34, 34, 34, 48, 53, 59, 69, 69, 69, 70, 70, 70, 70, 70, 71, 74, 76, 78, 79, 81, 84, 84, 85, 85, 85, 85, 85, 88, 88, 88, 89, 89, 89, 89, 89, 92, 93, 94, 94, 98, 98, 99, 99, 100, 101, 101, 103, 104, 105, 106, 107, 108, 108, 110, 110, 111, 114, 116, 118, 118, 119, 120, 120, 120, 121, 121, 121, 122, 123, 124, 126, 126, 126, 127, 127, 127, 127, 127, 128, 129, 131, 136, 137, 137, 137, 138, 140, 141, 142, 143, 143, 143, 143, 144, 144, 145, 145, 146, 148, 148, 148, 150, 150, 151, 152, 152, 153, 153, 155, 156, 158, 158, 159, 159, 159, 160, 161, 161, 162, 163, 163, 164, 164, 164, 164, 164, 166, 168, 169, 171, 171, 172, 173, 173, 173, 175, 175, 175, 176, 177, 179, 179, 180, 180, 180, 181, 181, 181, 181, 183, 183, 184, 184, 184, 185, 185, 185, 185, 185, 186, 187, 187, 187, 188, 188, 189, 189, 190, 190, 190, 191, 191, 191, 193, 194, 194, 194, 194, 195, 195, 195, 196, 197, 197, 198, 199, 200, 202, 203, 204, 205, 206, 206, 207, 207, 207,

## As we can see, some of the documents are very long to be included in an LLM prompt
The content will have to be compressed or split before it ever interacts with a language model.

In [13]:
# We split the code using a default code splitter.
# In this case, it is likely to work well enough for the main filetypes in the codebase. 
# Otherwise, separators for other langauges can easily be defined here. 

from langchain.text_splitter import RecursiveCharacterTextSplitter

scala_splitter= RecursiveCharacterTextSplitter(chunk_size=800)
java_splitter= RecursiveCharacterTextSplitter(chunk_size=800)
python_splitter= RecursiveCharacterTextSplitter(chunk_size=800)
markdown_splitter= RecursiveCharacterTextSplitter(chunk_size=800)
generic_splitter= RecursiveCharacterTextSplitter(chunk_size=1000)

scala_splitter.from_language('scala')
java_splitter.from_language('java')
python_splitter.from_language('python')
markdown_splitter.from_language('markdown')


<langchain.text_splitter.RecursiveCharacterTextSplitter at 0x125f25e10>

## Data preparation
Any usecase for langchain in coding applications will involve a degree of pre-processing that has to be more involved than simply splitting chunks of natural language into paragraphs. Embeddings will not work as well and retrievers should ideally be "smarter" than the default vectorstore retrievers that are used if we are to get high-quality, abstract answers to sometimes unclear relationships between files from a codebase.

In this example, we use different splitters for some of the different languages present in the codebase.
For reference, a splitter for a language not yet supported by langchain could look something like this:

In [14]:
# solidity_splitter= RecursiveCharacterTextSplitter(
#     separators=[
#             "\nContract ",
#             "\nInterface ",
#             "\nLibrary ",
#             "\n\tfunction ",
#             "\n\tstruct ",
#             "\n\tevent ",
#             "\n\n",
#             "\n",
#             " ",
#             "",
#             ]
#             ,chunk_size=800)

## Splitting every language manually
This separate>split>merge process can also be built into a function that looks at the character count per file extension as shown above, but we will do it manually in this example 

In [15]:
scala_code = [doc for doc in relevant_code if doc.metadata['file_type'] == '.scala']
java_code = [doc for doc in relevant_code if doc.metadata['file_type'] == '.java']
python_code = [doc for doc in relevant_code if doc.metadata['file_type'] == '.py']
markdown_code = [doc for doc in relevant_code if doc.metadata['file_type'] == '.md']
other_code = [doc for doc in relevant_code if doc.metadata['file_type'] not in ['.scala', '.java', '.py', '.md']]

In [16]:
scala_code_split = scala_splitter.split_documents(scala_code)
java_code_split = java_splitter.split_documents(java_code)
python_code_split = python_splitter.split_documents(python_code)
markdown_code_split = markdown_splitter.split_documents(markdown_code)
other_code_split = generic_splitter.split_documents(other_code)

In [17]:
relevant_code_split = [*scala_code_split, *java_code_split, *python_code_split, *markdown_code_split, *other_code_split]


In [18]:
lengths = []
for i in range(len(relevant_code_split)):
    if type(relevant_code_split[i]) != list:
        lengths.append(len(relevant_code_split[i].page_content))
    elif type(relevant_code_split[i]) == list:
        print(len(relevant_code_split[i]))

In [19]:
# Just making sure our splitters actually worked

content = [len(d.page_content) for d in relevant_code_split if len(d.page_content) > 999]
print(sorted(content))

[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]


## Retriever Choice
Each vectorstore db will behave a bit differently. 
Langchain offers us great flexibility in retriever choice.
Now, we'll define a self_querying retriever which will be able to use the original query/question written by the user(or other LLM further up the chain) to get better results. The main advantage here is taking advantage of the document's metadata in some instances.  

In [20]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

chroma_code = Chroma.from_documents(relevant_code_split, embedding=embeddings,persist_directory='./chroma_code/split')

The process above may take a few minutes.

But once we do it once, we can load the persisted database from disk, and use it as normal.

In [26]:
# embeddings = OpenAIEmbeddings()
# persist_directory = "./chroma_code/split/index"
# chroma_code = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

## Self-querying retriever example

In [21]:
from langchain.callbacks import get_openai_callback
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI

gpt4 = ChatOpenAI(model_name="gpt-4",temperature=0) # Waitlist only
gpt3 = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0)
davinci = OpenAI(model_name="text-davinci-003",temperature=0)
davinci002 = OpenAI(model_name="text-davinci-002",temperature=0)
curie = OpenAI(model_name="text-curie-001",temperature=0)


In [30]:
base_retriever = chroma_code.as_retriever()
base_retriever.search_kwargs['distance_metric'] = 'cos'
base_retriever.search_kwargs['fetch_k'] = 10 
base_retriever.search_kwargs['maximal_marginal_relevance'] = True
base_retriever.search_kwargs['k'] = 4 

In [37]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info=[
    AttributeInfo(
        name="source",
        description="Exact path to code in codebase, in the format 'folder/folder/file_name.file_type'", 
        type="string", 
    ),
    AttributeInfo(
        name="file_path",
        description="Exact path to code in codebase, in the format 'folder/folder/file_name.file_type'", 
        type="string", 
    ),
    AttributeInfo(
        name="file_name",
        description="The name of the file, with extension" , 
        type="string", 
    ),
    AttributeInfo(
        name="file_type",
        description="The file extension, in the format .file_type",
        type="string"
    )
]
document_content_description = "source code in different languages from a code repository"

smart_retriever = SelfQueryRetriever.from_llm(gpt4, chroma_code, document_content_description, metadata_field_info,enable_limit=True,verbose=True)

In [38]:
smart_query = smart_retriever.get_relevant_documents("What does favCountParams do?")

query='favCountParams' filter=None limit=None


In [39]:
[print(doc.metadata['file_name']) for doc in smart_query]

CrMixerParamConfig.scala
InNetworkTweetProduction.scala
RecapProduction.scala
LinearScoringParams.java


[None, None, None, None]

In [41]:
smart_query = smart_retriever.get_relevant_documents("Explain CrMixerParamConfig.scala")

query='Explain CrMixerParamConfig.scala' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='file_name', value='CrMixerParamConfig.scala') limit=None


In [42]:
[print(doc.metadata['file_name']) for doc in smart_query]

CrMixerParamConfig.scala
CrMixerParamConfig.scala
CrMixerParamConfig.scala
CrMixerParamConfig.scala


[None, None, None, None]

As we can see, we can employ the smart retriever to filter documents by metadata from natural language. Useful when asking about a specific file, especially when k>(# of files under that tag)

In [43]:
direct_query = chroma_code.as_retriever().get_relevant_documents("Explain CrMixerParamConfig.scala")

In [44]:
[print(doc.metadata['file_name']) for doc in direct_query]

CrMixerParamConfig.scala
EarlybirdFrsBasedCandidateGenerationParams.scala
CrMixerParamConfig.scala
ScoredTweetsParam.scala


[None, None, None, None]

When the smart retriever is used alongside MMR and other filtering tools, it has a good likelihood of returning the relevant file and relevant imports. Not deterministic though.

In [45]:
from langchain.chains import FlareChain
import langchain
langchain.verbose = True

code_flare = FlareChain.from_llm(
    davinci, 
    retriever=smart_retriever,
    max_generation_len=164,
    min_prob=.36,
    max_iter=3,
    start_with_retrieval=True,
)

In [46]:
code_flare.run('Explain CrMixerParamConfig.scala?')



[1m> Entering new FlareChain chain...[0m
[36;1m[1;3mCurrent Response: [0m
Prompt after formatting:
[32;1m[1;3mRespond to the user message using any relevant context. If context is provided, you should ground your answer in that context. Once you're done responding return FINISHED.

>>> CONTEXT: 
>>> USER INPUT: Explain CrMixerParamConfig.scala?
>>> RESPONSE: [0m


[1m> Entering new QuestionGeneratorChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context, ask a question to which the answer is the given term/entity/phrase:

>>> USER INPUT: Explain CrMixerParamConfig.scala?
>>> EXISTING PARTIAL RESPONSE:  
CrMixerParamConfig.scala is a Scala file that contains configuration parameters for the CrMixer, a tool used to mix and match different types of data. It allows users to define the parameters for the data they want to mix, such as the number of samples, the size of the data set, and the type of data. FINISHED

Th

' CrMixerParamConfig.scala is a Scala file that contains parameters for the CrMixer system. It includes parameters for topics, candidate generation, user ad graphs, user tweet graphs, user video graphs, tweet shares, TwHIN, real graph Oon, good tweet clicks, good profile clicks, Uteg tweet global, video tweet filter, video view tweets, and unified USS signal. '

' CrMixerParamConfig.scala is a Scala file that contains parameters for the CrMixer system. It includes parameters for topics, candidate generation, user ad graphs, user tweet graphs, user video graphs, tweet shares, TwHIN, real graph Oon, good tweet clicks, good profile clicks, Uteg tweet global, video tweet filter, video view tweets, and unified USS signal. 

Flare + smart_retriever shines when asked about specific files. The reason is, the original query should return enough(or all) of the relevant docs by a given name, and the subsequent questions generated by the LLM usually do a good job of fetching the remaining context to a file.

None of this is deterministic, but good retriever context is essential to getting good output from the LLMs

## Now let's compare this setup with retrievalQA + base retriever

## Flare + Smart retriever (davinci)

In [47]:
with get_openai_callback() as cb:
    flare_answer = code_flare.run('What does favCountParams do?')
    print(cb)



[1m> Entering new FlareChain chain...[0m
[36;1m[1;3mCurrent Response: [0m
Prompt after formatting:
[32;1m[1;3mRespond to the user message using any relevant context. If context is provided, you should ground your answer in that context. Once you're done responding return FINISHED.

>>> CONTEXT: 
>>> USER INPUT: What does favCountParams do?
>>> RESPONSE: [0m


[1m> Entering new QuestionGeneratorChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context, ask a question to which the answer is the given term/entity/phrase:

>>> USER INPUT: What does favCountParams do?
>>> EXISTING PARTIAL RESPONSE:  
favCountParams is a parameter used to count the number of favorites a post has. It is used to track the popularity of a post. FINISHED

The question to which the answer is the term/entity/phrase " count the number" is:[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context,

In [48]:
print(flare_answer)

 favCountParams is used to set the minimum value for the favorite count of a tweet. If the favorite count of a tweet is below this minimum value, the tweet will be skipped. 


favCountParams is used to set the minimum value for the favorite count of a tweet. If the favorite count of a tweet is below this minimum value, the tweet will be skipped. 

### Let's improve things a bit:

## RetrievalQA + base retriever (gpt-3.5-turbo)

In [61]:
from langchain.chains import RetrievalQA

code_qa = RetrievalQA.from_chain_type(llm=gpt3,chain_type='stuff', retriever=base_retriever)

In [62]:
code_qa.run('What does favCountParams do?')



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;

val doubleParams: Seq[MaxCountMultiplierParam.type] = Seq(
    MaxCountMultiplierParam
  )

  val booleanDeciderParams: Seq[EnableContentFeaturesHydrationParam.type] = Seq(
    EnableContentFeaturesHydrationParam
  )

  val booleanFeatureSwitchParams: Seq[FSParam[Boolean]] = Seq(
    EnableExcludeSourceTweetIdsQueryParam,
    EnableTokensInContentFeaturesHydration

"I'm sorry, I cannot answer your question as there is no mention of `favCountParams` in the given context."

I'm sorry, I cannot answer your question as there is no mention of `favCountParams` in the given context.

We need our retriever to return more documents for retrievalQA to have more context in this example, which was not an issue with Flare because it implicitly fetched more documents by asking new questions

In [70]:
base_retriever = chroma_code.as_retriever()
base_retriever.search_kwargs['distance_metric'] = 'cos'
base_retriever.search_kwargs['fetch_k'] = 30 
base_retriever.search_kwargs['maximal_marginal_relevance'] = True
base_retriever.search_kwargs['k'] = 10 

In [76]:
code_qa = RetrievalQA.from_chain_type(llm=gpt3,chain_type='stuff', retriever=base_retriever)

In [77]:
with get_openai_callback() as cb:
    qa_answer = code_qa.run('What does favCountParams do?')
    print(cb)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;

val doubleParams: Seq[MaxCountMultiplierParam.type] = Seq(
    MaxCountMultiplierParam
  )

  val booleanDeciderParams: Seq[EnableContentFeaturesHydrationParam.type] = Seq(
    EnableContentFeaturesHydrationParam
  )

  val booleanFeatureSwitchParams: Seq[FSParam[Boolean]] = Seq(
    EnableExcludeSourceTweetIdsQueryParam,
    EnableTokensInContentFeaturesHydration

In [78]:
print(qa_answer)

favCountParams is a ThriftLinearFeatureRankingParams that is used to determine the weight of the favorite count feature in a ranking algorithm. It is one of several parameters that can be used to adjust the weights of different features in the algorithm.


favCountParams is a ThriftLinearFeatureRankingParams that is used to determine the weight of the favorite count feature in a ranking algorithm. It is one of several parameters that can be used to adjust the weights of different features in the algorithm.

By giving the model more documents to work with, it arrived at a much better answer for cheaper. Let's tweak things a bit more:

## Flare + Smart retriever (davinci)

In [93]:
code_flare = FlareChain.from_llm(
    davinci, 
    retriever=smart_retriever,
    max_generation_len=164,
    min_prob=.36,
    max_iter=10,
    start_with_retrieval=True,
)

In [94]:
with get_openai_callback() as cb:
    flare_answer = code_flare.run('What does favCountParams do?')
    print(cb)



[1m> Entering new FlareChain chain...[0m
[36;1m[1;3mCurrent Response: [0m
Prompt after formatting:
[32;1m[1;3mRespond to the user message using any relevant context. If context is provided, you should ground your answer in that context. Once you're done responding return FINISHED.

>>> CONTEXT: 
>>> USER INPUT: What does favCountParams do?
>>> RESPONSE: [0m


[1m> Entering new QuestionGeneratorChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context, ask a question to which the answer is the given term/entity/phrase:

>>> USER INPUT: What does favCountParams do?
>>> EXISTING PARTIAL RESPONSE:  
favCountParams is a parameter used to count the number of favorites a post has. It is used to track the popularity of a post. FINISHED

The question to which the answer is the term/entity/phrase " count the number" is:[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context,

In [96]:
print(flare_answer)

 favCountParams is a parameter used to determine the minimum number of favorites a tweet must have in order to be eligible for social proof. If the tweet has fewer than the minimum number of favorites, it will be skipped. 


favCountParams is a parameter used to determine the minimum number of favorites a tweet must have in order to be eligible for social proof. If the tweet has fewer than the minimum number of favorites, it will be skipped.

As we can see, a lot of the answer generated by the LLM comes down to the context that is passed to it. Therefore, working on retrieving sufficient documents is fundamental to getting the correct answer.

## Flare + base retriever (davinci)

In [97]:
code_flare = FlareChain.from_llm(
    davinci, 
    retriever=base_retriever,
    max_generation_len=164,
    min_prob=.36,
    max_iter=5,
    start_with_retrieval=True,
)

In [98]:
with get_openai_callback() as cb:
    flare_answer = code_flare.run('What does favCountParams do?')
    print(cb)



[1m> Entering new FlareChain chain...[0m
[36;1m[1;3mCurrent Response: [0m
Prompt after formatting:
[32;1m[1;3mRespond to the user message using any relevant context. If context is provided, you should ground your answer in that context. Once you're done responding return FINISHED.

>>> CONTEXT: 
>>> USER INPUT: What does favCountParams do?
>>> RESPONSE: [0m


[1m> Entering new QuestionGeneratorChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context, ask a question to which the answer is the given term/entity/phrase:

>>> USER INPUT: What does favCountParams do?
>>> EXISTING PARTIAL RESPONSE:  
favCountParams is a parameter used to count the number of favorites a post has. It is used to track the popularity of a post. FINISHED

The question to which the answer is the term/entity/phrase " count the number" is:[0m
Prompt after formatting:
[32;1m[1;3mGiven a user input and an existing partial response as context,

In [99]:
print(flare_answer)

 favCountParams is a ThriftLinearFeatureRankingParams that sets the weight for the favCount feature. This feature is used to score tweets based on the number of favorites they have. 


favCountParams is a ThriftLinearFeatureRankingParams that sets the weight for the favCount feature. This feature is used to score tweets based on the number of favorites they have. 

### Most expensive way:

We will give 25 documents to gpt4, one by one, and ask it to refine its answer.

In [100]:
base_retriever = chroma_code.as_retriever()
base_retriever.search_kwargs['distance_metric'] = 'cos'
base_retriever.search_kwargs['fetch_k'] = 50 
base_retriever.search_kwargs['maximal_marginal_relevance'] = True
base_retriever.search_kwargs['k'] = 25 

In [101]:
code_qa = RetrievalQA.from_chain_type(llm=gpt4,chain_type='refine', retriever=base_retriever)

## Avoid running this one, as it may cost close to $0.50


In [102]:
with get_openai_callback() as cb:
    qa_answer = code_qa.run('What does favCountParams do?')
    print(cb)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Context information is below. 
---------------------
videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
---------------------
Given the context information and not prior knowledge, answer any questions
Human: What does favCountParams do?[0m

[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: What does favCountParams do?
AI: There is no information about "favCountParams" in the given context. It might be related to a favorite count parameter, but without more context or information, it's impossible to provide a clear explana

Tokens Used: 12088
	Prompt Tokens: 8546
	Completion Tokens: 3542
Successful Requests: 25
Total Cost (USD): $0.46889999999999993

In [103]:
print(qa_answer)

Based on the additional context provided, `favCountParams` is not explicitly mentioned in the code snippet. However, we can infer that the code is related to counting the number of favorites for each tweet.

The code first creates a table `user_tweet_fav_pairs` that groups the favorite and unfavorite events by user and tweet, and then selects the most recent event for each pair. It also counts the number of events for each pair.

Next, it creates another table `tweet_raw_favs_table` that filters the events to only include those with less than 3 events and where the most recent event is a favorite (favOrUnfav = 1).

Finally, it counts the number of distinct users who have favorited each tweet and groups the results by tweetId.

Although `favCountParams` is not explicitly mentioned, it could be related to the parameters used in the query to filter, count, and group the favorite events. It might be used to influence the processing of these events or the way they are counted and stored.


Based on the additional context provided, `favCountParams` is not explicitly mentioned in the code snippet. However, we can infer that the code is related to counting the number of favorites for each tweet.

The code first creates a table `user_tweet_fav_pairs` that groups the favorite and unfavorite events by user and tweet, and then selects the most recent event for each pair. It also counts the number of events for each pair.

Next, it creates another table `tweet_raw_favs_table` that filters the events to only include those with less than 3 events and where the most recent event is a favorite (favOrUnfav = 1).

Finally, it counts the number of distinct users who have favorited each tweet and groups the results by tweetId.

Although `favCountParams` is not explicitly mentioned, it could be related to the parameters used in the query to filter, count, and group the favorite events. It might be used to influence the processing of these events or the way they are counted and stored.

Giving excess context to gpt-4 was both expensive and did not generate results as impressive as one would expect

## Working on the retriever 

### We can play around with many ways of retrieving the docs

1. Self-querying (smart) retriever: injects into query metadata what it sees as a relevant search term.
2. L2: use euclidean similarity for getting more exact "mentions" of the term in the codebase.
3. Cosine_similarity: generally recommended for dealing with this type of document. 

In [104]:
returned_docs = chroma_code.similarity_search('what does favCountParams do?',k=50,search_kwargs={'distance_metric':'L2','fetch_k':50})

In [105]:
Counter([(doc.metadata['file_name']) for doc in returned_docs])

Counter({'LinearScoringParams.java': 5,
         'UtegLikedByTweetsParams.scala': 5,
         'PushFeatureSwitchParams.scala': 5,
         'CrMixerParamConfig.scala': 4,
         'TweetCountsHydrator.scala': 3,
         'tweet_fav_count.sql': 2,
         'InNetworkTweetProduction.scala': 2,
         'RecapProduction.scala': 2,
         'InNetworkTweetParams.scala': 2,
         'RecapParams.scala': 2,
         'TweetCountsCacheUpdatingStore.scala': 2,
         'RecentTweetFavoritesParams.scala': 1,
         'UserUserFavGraph.scala': 1,
         'FeatureBasedScoringFunction.java': 1,
         'ranking.thrift': 1,
         'HomeGlobalParams.scala': 1,
         'TopicSocialProofHandler.scala': 1,
         'ContentRecommenderFlowFSConfig.scala': 1,
         'LinearScoringFunction.java': 1,
         'TweetBasedUserVideoGraphParams.scala': 1,
         'UtegLikedByTweetsProduction.scala': 1,
         'InteractionGraphAggDirectInteractionsUtil.scala': 1,
         'EarlybirdResponseUtil.scala': 

In [106]:
euclid_base_retriever = chroma_code.as_retriever(search_type='similarity',search_kwargs={'fetch_k':100,'k':50,'distance_metric':'L2','maximal_marginal_relevance':True})
# base_retriever = chroma_code.as_retriever(search_type='mmr',search_kwargs={'fetch_k':50,'k':50})
# base_retriever = chroma_code.as_retriever(search_type='mmr',search_kwargs={'fetch_k':50,'k':50})

In [107]:
returned_docs = euclid_base_retriever.get_relevant_documents('what does favCountParams do?')
# returned_docs = euclid_base_retriever.get_relevant_documents('LinearScoringParams?')

In [109]:
Counter([(doc.metadata['file_name']) for doc in returned_docs])

Counter({'LinearScoringParams.java': 5,
         'UtegLikedByTweetsParams.scala': 5,
         'PushFeatureSwitchParams.scala': 5,
         'CrMixerParamConfig.scala': 4,
         'TweetCountsHydrator.scala': 3,
         'tweet_fav_count.sql': 2,
         'InNetworkTweetProduction.scala': 2,
         'RecapProduction.scala': 2,
         'InNetworkTweetParams.scala': 2,
         'RecapParams.scala': 2,
         'TweetCountsCacheUpdatingStore.scala': 2,
         'RecentTweetFavoritesParams.scala': 1,
         'UserUserFavGraph.scala': 1,
         'FeatureBasedScoringFunction.java': 1,
         'ranking.thrift': 1,
         'HomeGlobalParams.scala': 1,
         'TopicSocialProofHandler.scala': 1,
         'ContentRecommenderFlowFSConfig.scala': 1,
         'LinearScoringFunction.java': 1,
         'TweetBasedUserVideoGraphParams.scala': 1,
         'UtegLikedByTweetsProduction.scala': 1,
         'InteractionGraphAggDirectInteractionsUtil.scala': 1,
         'EarlybirdResponseUtil.scala': 

In [110]:
# Helper function for printing docs

def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

### At this stage, we can play along with using a pipeline compressor to filter in multiple steps

In [111]:
base_retriever = chroma_code.as_retriever()
base_retriever.search_kwargs['distance_metric'] = 'L2'
base_retriever.search_kwargs['fetch_k'] = 50 
base_retriever.search_kwargs['maximal_marginal_relevance'] = True
base_retriever.search_kwargs['k'] = 25 

In [112]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.document_compressors import EmbeddingsFilter

compressor = LLMChainFilter.from_llm(davinci002)
# compressor = LLMChainFilter.from_llm(gpt3)
# compressor = LLMChainExtractor.from_llm(gpt4)
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)

from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter

relevant_filter = EmbeddingsFilter(embeddings=OpenAIEmbeddings(), similarity_threshold=0.72,k=20)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=euclid_base_retriever)


In [113]:
returned_docs = compression_retriever.get_relevant_documents('what does favCountParams do?')

In [114]:
pretty_print_docs(returned_docs)

Document 1:

videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
----------------------------------------------------------------------------------------------------
Document 2:

case _: FavsKey => StatusCounts(favoriteCount = Some(count))
            case _: QuotesKey => StatusCounts(quoteCount = Some(count))
            case _: BookmarksKey => StatusCounts(bookmarkCount = Some(count))
          }
        )
----------------------------------------------------------------------------------------------------
Document 3:

/**
   * Minimum number of favorited-by users required for recommended tweets.
   */
  object MinNumFavoritedByUserIdsParam extends Param(1)

  /**
   * Includes one or multiple random tweets in the response.
   */
  object IncludeRandomTweetParam
      exte

In [115]:
Counter([(doc.metadata['file_name']) for doc in returned_docs])

Counter({'LinearScoringParams.java': 3,
         'UtegLikedByTweetsParams.scala': 2,
         'tweet_fav_count.sql': 2,
         'CrMixerParamConfig.scala': 2,
         'PushFeatureSwitchParams.scala': 2,
         'TweetCountsHydrator.scala': 1,
         'RecentTweetFavoritesParams.scala': 1,
         'UserUserFavGraph.scala': 1,
         'FeatureBasedScoringFunction.java': 1,
         'InNetworkTweetProduction.scala': 1,
         'ranking.thrift': 1,
         'HomeGlobalParams.scala': 1,
         'TopicSocialProofHandler.scala': 1,
         'ContentRecommenderFlowFSConfig.scala': 1})

In [120]:
from langchain.chains import RetrievalQA 
# from langchain.chains import RetrievalQA
code_qa = RetrievalQA.from_chain_type(llm=gpt4,retriever=compression_retriever,chain_type='stuff')

In [121]:
code_qa.run('what does favCountParams do?')



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;

case _: FavsKey => StatusCounts(favoriteCount = Some(count))
            case _: QuotesKey => StatusCounts(quoteCount = Some(count))
            case _: BookmarksKey => StatusCounts(bookmarkCount = Some(count))
          }
        )

/**
   * Minimum number of favorited-by users required for recommended tweets.
   */
  object MinNumFavoritedByUserIdsParam extends 

'`favCountParams` is an optional parameter of type `ThriftLinearFeatureRankingParams`. It is used to configure the weight and other settings related to the favorite count feature in the ranking process. The favorite count represents the number of times a tweet has been favorited by users. By adjusting the `favCountParams`, the ranking algorithm can give more or less importance to the favorite count when determining the order of tweets or recommendations.'

`favCountParams` is an optional parameter of type `ThriftLinearFeatureRankingParams`. It is used to configure the weight and other settings related to the favorite count feature in the ranking process. The favorite count represents the number of times a tweet has been favorited by users. By adjusting the `favCountParams`, the ranking algorithm can give more or less importance to the favorite count when determining the order of tweets or recommendations.

In [122]:
code_qa.run('Explain CrMixerParamConfig.scala')



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
package com.twitter.cr_mixer.param

import com.twitter.timelines.configapi.CompositeConfig
import com.twitter.timelines.configapi.FSName
import com.twitter.timelines.configapi.Param

object CrMixerParamConfig {

package com.twitter.cr_mixer.param

TopicTweetParams.config,
      TweetBasedCandidateGenerationParams.config,
      TweetBasedUserAdGraphParams.config,
      TweetBasedUserTweetGraphParams.config,
      TweetBasedUserVideoGraphParams.config,
      TweetSharesParams.config,
      TweetBasedTwHINParams.config,
      RealGraphOonParams.config,
      GoodTweetClickParams.config,
      GoodProfileClickParams.config,
      U

"`CrMixerParamConfig.scala` is a Scala source file that defines the configuration parameters for the CrMixer (Content Recommendation Mixer) module in a Twitter application. The CrMixer module is responsible for generating and managing content recommendations for users based on various factors, such as user interests, tweet-based candidate generation, and other signals.\n\nThe file contains several objects and classes that define the configuration parameters for different aspects of the CrMixer module. Some of the key objects and parameters defined in this file include:\n\n1. `CrMixerParamConfig`: This object defines a composite configuration for the CrMixer module, which includes configurations for various sub-modules like TopicTweetParams, TweetBasedCandidateGenerationParams, and others.\n\n2. `CrMixerSource`, `FrsTweetSource`, `InNetworkSource`, and `UtegSource`: These objects define the configuration parameters for enabling or disabling different candidate pipelines for content reco

`CrMixerParamConfig.scala` is a Scala source file that defines the configuration parameters for the CrMixer (Content Recommendation Mixer) module in a Twitter application. The CrMixer module is responsible for generating and managing content recommendations for users based on various factors, such as user interests, tweet-based candidate generation, and other signals.\n\nThe file contains several objects and classes that define the configuration parameters for different aspects of the CrMixer module. Some of the key objects and parameters defined in this file include:\n\n1. `CrMixerParamConfig`: This object defines a composite configuration for the CrMixer module, which includes configurations for various sub-modules like TopicTweetParams, TweetBasedCandidateGenerationParams, and others.\n\n2. `CrMixerSource`, `FrsTweetSource`, `InNetworkSource`, and `UtegSource`: These objects define the configuration parameters for enabling or disabling different candidate pipelines for content recommendations.\n\n3. `QualityFactor`: This object contains the `MaxTweetsToScoreParam` parameter, which defines the maximum number of tweets to score for quality factors.\n\n4. `NumberOfMaxCrMixerCandidatesParam`: This parameter defines the maximum number of CrMixer candidates to send.\n\n5. `MinDurationSincePushParam`: This parameter defines the minimum duration between two MR (MapReduce) pushes.\n\n6. `CrMixerParamConfigModule`: This object provides a Guice module that binds the CrMixerParamConfig configuration to the application's dependency injection framework.\n\nOverall, `CrMixerParamConfig.scala` is responsible for defining and managing the configuration parameters for the CrMixer module, which plays a crucial role in generating content recommendations for Twitter users.

In [124]:
returned_docs = compression_retriever.get_relevant_documents('Explain CrMixerParamConfig.scala')

In [125]:
Counter([(doc.metadata['file_name']) for doc in returned_docs])

Counter({'CrMixerParamConfig.scala': 2,
         'ScoredTweetsParam.scala': 2,
         'EarlybirdFrsBasedCandidateGenerationParams.scala': 1,
         'BlenderParams.scala': 1,
         'ProducerBasedCandidateGenerationParams.scala': 1,
         'HomeMixerInjectionNames.scala': 1,
         'PushFeatureSwitchParams.scala': 1,
         'TopicTweetParams.scala': 1,
         'RepeatedProfileVisitsParams.scala': 1,
         'AdsCandidateSourcesRouter.scala': 1,
         'CrMixerServer.scala': 1,
         'MixerPipelineBuilder.scala': 1,
         'MixerPipelineBuilderFactory.scala': 1,
         'MixerPipelineConfig.scala': 1,
         'MixerPipelineResult.scala': 1,
         'GlobalParams.scala': 1,
         'CrMixerParamConfigModule.scala': 1,
         'FrsParams.scala': 1})

That answer was generated with only two documents of context for the actual file

In [None]:

metadata_field_info=[
    AttributeInfo(
        name="source",
        description="Exact path to code in codebase, in the format 'folder/folder/file_name.file_type'", 
        type="string", 
    ),
    AttributeInfo(
        name="file_path",
        description="Exact path to code in codebase, in the format 'folder/folder/file_name.file_type'", 
        type="string", 
    ),
    AttributeInfo(
        name="file_name",
        description="The name of the file, in the format file_name.file_type", 
        type="string", 
    ),
    AttributeInfo(
        name="file_type",
        description="The file extension, in the format .file_type",
        type="string"
    )
]
document_content_description = "source code in different languages from a code repository"

smart_retriever = SelfQueryRetriever.from_llm(gpt4, chroma_code, document_content_description, metadata_field_info,enable_limit=True,verbose=True)
# smart_retriever.search_kwargs['limit'] = 100

One big thing to note here is that one can pass the filters in natural language too

In [53]:
returned_docs = smart_retriever.get_relevant_documents('Explain CrMixerParamConfig.scala, retrieval limit=2')

query='Explain CrMixerParamConfig.scala' filter=None limit=2


In [54]:
Counter([(doc.metadata['file_name']) for doc in returned_docs])

Counter({'CrMixerParamConfig.scala': 1,
         'EarlybirdFrsBasedCandidateGenerationParams.scala': 1})

In subsequent notebooks, we will expand on this. 

### A few ideas to keep in mind
* How do we expand the metadata field to allow the LLM tied to the self-querying retriever to fetch the "correct" files most of the time, instead of relying only on cosine similarity?
* Can we use manual parsing or a "dry LLM summarizer run" through the whole repo to enrich the metadata in a way that can be used by other llms/agents down the chain?
* Can we store the repository structure in a JSON format and have a JSON agent iterate through it for certain questions that can be intuited from file/folder names?
* Once we have summarizer tools, repo navigation tools, QA tools, can we have an agent generate meaningful output most of the time?
* Can we use ASTs or tree-sitter-like tools to selectively enrich the LLM's prompt (or metaprompt function?)
* Can we add memory to the agent instance so that as it answers questions, it gains a better understanding of the repository?
