In [1]:
%pip install -q langchain
%pip install -q openai


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain import OpenAI, LLMChain, PromptTemplate
import numpy as np
import openai
from os import listdir
from os.path import isfile

EMBEDDING_MODEL = "text-embedding-ada-002"


Get embedding for all file in the given folder

In [12]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    return result["data"][0]["embedding"]

def compute_doc_embeddings(folder):
    """
    Create an embedding for each file in given folder using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_embedding(r['context']) for idx, r in enumerate(folder)
    }

Calculate the similarity between the question and document embeddings

In [13]:
def vector_similarity(x , y):
    """
    Returns the similarity between two vectors.
    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query, contexts ):
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

Add the most relevant document to the promt

In [18]:
max_given_infor_len = 2500
def get_relevant_document(question: str, context_embeddings: dict, folder:dict):
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = ""
    chosen_filenames = []

     
    for _, section_index in most_relevant_document_sections:
        chosen_filenames.append(folder[section_index]['name'])
        if len(chosen_sections.split(' ')) + len(folder[section_index]['context'].split(' ')) < max_given_infor_len:
            chosen_sections += folder[section_index]['context']
        else:
            max_additional_len = max_given_infor_len - len(chosen_sections.split(' '))
            chosen_sections += ' '.join(folder[section_index]['context'].split(' ')[:max_additional_len])
            break
            
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_filenames)} file")
    for filename in chosen_filenames:
        print(filename)
    # print("\n".join(chosen_sections_indexes))
    
    
    return chosen_sections

Main

In [15]:
max_len = 3500
def get_all_files(path):
    files = []
    dirs = [path]
    for dir in dirs:
        # print(dirs)
        for f in listdir(dir):
            if '.git' in f:
                continue
            if isfile(dir +'/'+f):
                files.append(dir +'/'+f)
            else:
                dirs.append(dir +'/'+f)
    return files

def read_folder(path):
    files = get_all_files(path)
    context_folder = []
    for file in files:
        if '.pyc' in file:
            continue
        if ('.py' or '.txt' or '.sh') in file:
            with open(file, 'r') as f:
                c = f.read()
                c = c.replace('\n', ' ')
                id = 0
                while id < len(c):
                    context = {}
                    context['name'] = file
                    if id+max_len < len(c):
                        context['context'] = c[id:id+max_len]
                        id += max_len
                    else:
                        context['context'] = c[id:]
                        id=len(c)
                    context_folder.append(context)
    return context_folder

In [16]:
template = """This assistant is capable of answering questions located in a designated folder. 
By analyzing the context of the question, it can identify the most relevant information and provide an accurate answer.
Context: {document}
Question: {question_input}
Answer:"""

prompt = PromptTemplate(
    input_variables=["document", "question_input"], 
    template=template
)


chatgpt_chain = LLMChain(
    llm=OpenAI(temperature=0), 
    prompt=prompt, 
    verbose=True
)
context_folder = read_folder('./Docify-Lab-AIC/')
# for r in context_folder:
#     print(r)
context_embeddings=compute_doc_embeddings(context_folder)


In [19]:
question = "what are the necessary library for this repository"
relevant_document = get_relevant_document(question=question,context_embeddings=context_embeddings, folder=context_folder)
output = chatgpt_chain.predict(document=relevant_document, question_input=question)
print(output)

Selected 3 file
./Docify-Lab-AIC//training/generation_encoder_decoder.py
./Docify-Lab-AIC//streamlit_app.py
./Docify-Lab-AIC//training/generation.py


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThis assistant is capable of answering questions located in a designated folder. 
By analyzing the context of the question, it can identify the most relevant information and provide an accurate answer.
Question: what are the necessary library for this repository
Answer:[0m

[1m> Finished chain.[0m
 The necessary libraries for this repository are json, logging, math, os, sys, time, random, dataclasses, enum, itertools, pathlib, typing, datasets, jax, jax.numpy, transformers, argparse, and prompt.


In [20]:
question = "How can we use expander in streamlit"
relevant_document = get_relevant_document(question=question,context_embeddings=context_embeddings, folder=context_folder)
output = chatgpt_chain.predict(document=relevant_document, question_input=question)
print(output)

Selected 3 file
./Docify-Lab-AIC//streamlit_app.py
./Docify-Lab-AIC//streamlit_app.py
./Docify-Lab-AIC//streamlit_app.py


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThis assistant is capable of answering questions located in a designated folder. 
By analyzing the context of the question, it can identify the most relevant information and provide an accurate answer.
Question: How can we use expander in streamlit
Answer:[0m

[1m> Finished chain.[0m
 Streamlit's expander widget allows you to expand and collapse sections of your app. This is useful for hiding long sections of code or text that may not be relevant to the user at the moment. To use the expander widget, you can use the st.expander() function. This function takes two arguments: the title of the expander and a boolean value indicating whether the expander should be expanded or collapsed. You can also pass in an optional third argument to specify the initial state of the expander.


In [21]:
question = "What does this repository do?"
relevant_document = get_relevant_document(question=question,context_embeddings=context_embeddings, folder=context_folder)
output = chatgpt_chain.predict(document=relevant_document, question_input=question)
print(output)

Selected 3 file
./Docify-Lab-AIC//training/run_clm_flax.py
./Docify-Lab-AIC//training/run_encoder_decoder_flax.py
./Docify-Lab-AIC//utils/example.py


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThis assistant is capable of answering questions located in a designated folder. 
By analyzing the context of the question, it can identify the most relevant information and provide an accurate answer.
Question: What does this repository do?
Answer:[0m

[1m> Finished chain.[0m
 This repository is designed to assist developers with many different coding tasks. It leverages the latest AI technology to help developers save time and effort, allowing them to focus on creating the best possible code.


In [23]:
question = "give me the summary sentence for each file in the folder to know what it does "
relevant_document = get_relevant_document(question=question,context_embeddings=context_embeddings, folder=context_folder)
output = chatgpt_chain.predict(document=relevant_document, question_input=question)
print(output)

Selected 4 file
./Docify-Lab-AIC//training/run_encoder_decoder_flax.py
./Docify-Lab-AIC//streamlit_app.py
./Docify-Lab-AIC//main.py
./Docify-Lab-AIC//utils/example.py


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThis assistant is capable of answering questions located in a designated folder. 
By analyzing the context of the question, it can identify the most relevant information and provide an accurate answer.
Context: #!/usr/bin/env python # coding=utf-8 # Copyright 2021 The HuggingFace Team All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # #     http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 