## Setting up credentials

In [None]:
import os
from getpass import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Credentials are read from the environment so that no secret is stored in the
# notebook. A value that is already exported is left untouched; a missing one
# is requested interactively with getpass, which does not echo the input.
for _secret_name in ('OPENAI_API_KEY', 'ACTIVELOOP_TOKEN'):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass('{}: '.format(_secret_name))

# Embedding client used both when building the dataset and when querying it.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

# Indexing the repository

## Cloning the repo

In [None]:
# Clone the repository to index. With no target argument git clones into
# ./vanda-admin, which is the directory the loading cell walks.
# NOTE(review): the git@github.com: URL form goes over SSH, so this presumably
# requires an SSH key with access to the repository — confirm.
!git clone git@github.com:reservamos/vanda-admin.git

## Load repo files to memory

In [10]:
import os
from langchain.document_loaders import TextLoader

# Walk the cloned repository and load every file that can be read as UTF-8
# text, splitting each one into documents as it is loaded.
root_dir = './vanda-admin'
docs = []
skipped = []  # (path, exception) for every file that could not be loaded
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        file_path = os.path.join(dirpath, file)
        try:
            loader = TextLoader(file_path, encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e:
            # Loading stays best-effort (e.g. files that are not valid UTF-8),
            # but failures are recorded instead of vanishing silently so they
            # can be inspected through `skipped`.
            skipped.append((file_path, e))

print("Loaded {} documents".format(len(docs)))
if skipped:
    print("Skipped {} files that could not be loaded".format(len(skipped)))

Loaded 980 documents


## Chunk the files

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings: requested chunk size and overlap between chunks.
# The splitter can still emit chunks larger than CHUNK_SIZE (see the warnings
# printed below this cell).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every loaded document into chunks.
texts = text_splitter.split_documents(docs)


Created a chunk of size 1593, which is longer than the specified 1000
Created a chunk of size 1419, which is longer than the specified 1000
Created a chunk of size 1024, which is longer than the specified 1000
Created a chunk of size 1306, which is longer than the specified 1000
Created a chunk of size 1591, which is longer than the specified 1000
Created a chunk of size 1098, which is longer than the specified 1000
Created a chunk of size 2802, which is longer than the specified 1000
Created a chunk of size 1653, which is longer than the specified 1000
Created a chunk of size 1649, which is longer than the specified 1000
Created a chunk of size 2085, which is longer than the specified 1000
Created a chunk of size 1669, which is longer than the specified 1000
Created a chunk of size 2968, which is longer than the specified 1000
Created a chunk of size 2473, which is longer than the specified 1000
Created a chunk of size 2417, which is longer than the specified 1000
Created a chunk of s

In [12]:
# Deep Lake location of the dataset that will hold the chunks.
deeplake_path = 'hub://brayan244/code-dataset'

# Build the vector store from the chunks using the configured embeddings.
db = DeepLake.from_documents(
    texts,
    embeddings,
    dataset_path=deeplake_path,
)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/brayan244/code-dataset


 

hub://brayan244/code-dataset loaded successfully.


Evaluating ingest: 100%|██████████| 3/3 [00:43<00:00
-

Dataset(path='hub://brayan244/code-dataset', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape       dtype  compression
  -------   -------    -------     -------  ------- 
 embedding  generic  (2112, 1536)  float32   None   
    ids      text     (2112, 1)      str     None   
 metadata    json     (2112, 1)      str     None   
   text      text     (2112, 1)      str     None   


 

In [None]:
## Load the vector store

In [13]:
# Re-open the existing dataset in read-only mode for querying.
deeplake_path = 'hub://brayan244/code-dataset'
db = DeepLake(
    dataset_path=deeplake_path,
    embedding_function=embeddings,
    read_only=True,
)

-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/brayan244/code-dataset



\

hub://brayan244/code-dataset loaded successfully.



Deep Lake Dataset in hub://brayan244/code-dataset already exists, loading from the storage


Dataset(path='hub://brayan244/code-dataset', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape       dtype  compression
  -------   -------    -------     -------  ------- 
 embedding  generic  (2112, 1536)  float32   None   
    ids      text     (2112, 1)      str     None   
 metadata    json     (2112, 1)      str     None   
   text      text     (2112, 1)      str     None   


In [14]:
# Expose the vector store as a retriever and set all of its search options in
# a single update.
# NOTE(review): fetch_k / k presumably mean "candidates fetched" vs. "results
# returned" — confirm against the Deep Lake retriever documentation.
retriever = db.as_retriever()
retriever.search_kwargs.update({
    'distance_metric': 'cos',
    'fetch_k': 100,
    'maximal_marginal_relevance': True,
    'k': 20,
})

In [25]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Chat model plus a conversational retrieval chain built on the retriever.
model = ChatOpenAI(model='gpt-3.5-turbo')
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

questions = ["give me all the database fields for a payment"]
chat_history = []

# Ask each question in order; every (question, answer) pair is appended to the
# history that is passed along with the next question.
for question in questions:
    payload = {"question": question, "chat_history": chat_history}
    result = qa(payload)
    answer = result['answer']
    chat_history.append((question, answer))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {answer} \n")

-> **Question**: give me all the database fields for a payment 

**Answer**: Here are all the database fields for a payment:

From the `AddPayments` migration:
- `id: integer` - the ID of the payment (primary key)
- `purchase_id: integer` - the ID of the purchase associated with the payment
- `state: string` - the state of the payment
- `credit_card_token: string` - the token of the credit card used for the payment
- `amount: decimal` - the amount of the payment
- `complete_payment: boolean` - whether the payment has been completed or not
- `created_at: datetime` - the timestamp of when the payment was created
- `updated_at: datetime` - the timestamp of when the payment was last updated

From the `AddPayPalFieldsToPayment` migration:
- `paypal_email: string` - the email associated with the PayPal account used for the payment
- `paypal_verified: boolean` - whether the PayPal account used for the payment is verified or not
- `paypal_payer_id: string` - the ID of the PayPal account used f