In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader, ReadTheDocsLoader, UnstructuredMarkdownLoader
import os


### Load Markdown Files

1- From Docs-components :

In [3]:
# Directory holding the docs-components Markdown files
directory = "../Markdown_content/docs-components/"

# Load documents from that directory through the Markdown loader, with a progress bar
loader = DirectoryLoader(
    directory,
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True,
)
docs_components = loader.load()

len(docs_components)


 99%|█████████▉| 5429/5484 [00:55<00:00, 98.42it/s] 


5429

2- From Docs-core :

We can load all the files without a for loop!

In [3]:
# Directory holding the docs-core Markdown files
directory = "../github-content/Markdown_content/docs-core"

# Same loader setup as for docs-components: Markdown loader with a progress bar
loader = DirectoryLoader(
    directory,
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True,
)
docs_core = loader.load()

len(docs_core)

 99%|█████████▉| 5895/5925 [00:40<00:00, 145.91it/s]


5895

3- From docs_data_catalog

In [4]:
# Load the docs-data-catalog Markdown files through the Markdown loader
loader = DirectoryLoader(
    "../Markdown_content/docs-data-catalog",
    loader_cls=UnstructuredMarkdownLoader,
    show_progress=True,
)
docs_catalog = loader.load()
len(docs_catalog)

100%|█████████▉| 404/406 [00:02<00:00, 172.76it/s]


404

Combine all three documents

In [121]:
# Single list containing the documents of all three sources, in loading order
documents = [*docs_components, *docs_core, *docs_catalog]
len(documents)

11751

--------------------------------

### Text Splitter 

In [26]:
import tiktoken

# Encoding used for all token counting in this notebook
tokenizer = tiktoken.get_encoding('cl100k_base')


def titktoken_len(text):
    """Return how many tokens `tokenizer` produces when encoding `text`.

    The function name is kept as-is because other cells call it by this name.
    """
    return len(tokenizer.encode(text))

In [115]:
import numpy as np

# Token count of every loaded document, held as a NumPy array for the percentile calls
tokens_counts = np.array([titktoken_len(doc.page_content) for doc in documents])

# Build the report line by line: min, average, selected percentiles, max
summary_lines = [
    f"Min : {min(tokens_counts)}",
    f"Avg : {int(sum(tokens_counts) / len(tokens_counts))}",
]
summary_lines.extend(
    f"{pct}% :{np.percentile(tokens_counts, pct)}" for pct in (50, 75, 90, 95, 99)
)
summary_lines.append(f"Max : {max(tokens_counts)}")

print("\n".join(summary_lines))

Min : 5
Avg : 374
50% :214.0
75% :407.0
90% :840.0
95% :1308.0
99% :2339.5
Max : 16513


In [127]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Implementation of splitting text that looks at characters: it recursively
# tries to split by different characters to find one that works.
# Lengths are measured with titktoken_len, so chunk_size and chunk_overlap
# are expressed in tokens here.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50,
    length_function=titktoken_len,
)
documents_splitted = text_splitter.split_documents(documents)

len(documents_splitted)

15273

In [126]:
from langchain.text_splitter import TokenTextSplitter

# Implementation of splitting text that looks at tokens.
# Note: this rebinds `text_splitter`; the result goes to a separate variable
# (`documents_splitted2`).
text_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=600,
    chunk_overlap=50,
)
documents_splitted2 = text_splitter.split_documents(documents)

len(documents_splitted2)

15296

### Create a Vectorstore

Embed the documents and store them in a Chroma vector store.

In [134]:
# Embed the split documents and store them in a Chroma vector store.
# Supplying a persist_directory will store the embeddings on disk.
persist_directory = 'vectordb'

# Read the key up front: a missing OPENAI_API_KEY raises KeyError here,
# before any embedding call is made.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# OpenAI embedding model. The key is passed explicitly (it was previously read
# but never used), mirroring how the ChatOpenAI cell passes its key.
embedding = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

vectordb = Chroma.from_documents(
    documents=documents_splitted,
    embedding=embedding,
    persist_directory=persist_directory,
)

Persist the db to disk

In [135]:
# Persist the db to disk
vectordb.persist()
# Drop the in-memory reference; the store is re-opened from disk in the following cell.
vectordb = None

Now we can load the persisted database from disk, and use it as normal.

In [12]:
# Re-open the store persisted under 'vectordb', with the same embedding model name
persist_directory = 'vectordb'
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

vectordb

<langchain.vectorstores.chroma.Chroma at 0x158ac1ff0a0>

In [10]:
import pandas as pd

# Read the two parquet files found in the persisted store directory into DataFrames
chroma_embeddings, chroma_collections = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './vectordb/chroma-embeddings.parquet',
        './vectordb/chroma-collections.parquet',
    )
)

In [13]:
# (rows, columns) of the embeddings table read from the parquet file
chroma_embeddings.shape

(15273, 6)

In [14]:
# Preview the first rows of the embeddings table
chroma_embeddings.head()

Unnamed: 0,collection_uuid,uuid,embedding,document,id,metadata
0,a01a65d1-9561-4727-9bab-7e061a56b453,2e4243a6-96b3-4dd1-bb10-87b673cbc05b,"[-0.01480712855571336, 0.011938492184529149, -...",Creating a marketing plan {#creating-a-marketi...,8783888a-0524-11ee-b6e0-f21f2e9d130d,"{""source"": ""..\\github-content\\docs_component..."
1,a01a65d1-9561-4727-9bab-7e061a56b453,112b0b56-ec69-4f00-8430-02c845b29b12,"[-0.013559202306646156, 0.009591309101212146, ...",Creating a product {#creating-a-product .task}...,8783888b-0524-11ee-8808-f21f2e9d130d,"{""source"": ""..\\github-content\\docs_component..."
2,a01a65d1-9561-4727-9bab-7e061a56b453,e7fc43ae-5bfd-420f-8194-612b84023557,"[-0.0016467134787746098, -0.000263917843337432...",Creating a support plan {#creating-a-support-p...,8783888c-0524-11ee-9cbb-f21f2e9d130d,"{""source"": ""..\\github-content\\docs_component..."
3,a01a65d1-9561-4727-9bab-7e061a56b453,c389b9f9-29e4-4f05-9e4e-ea65a0f7b2b6,"[-0.008271590933255035, -0.005999722609935641,...",Creating a vendor {#creating-a-vendor .task}\n...,8783888d-0524-11ee-b3cc-f21f2e9d130d,"{""source"": ""..\\github-content\\docs_component..."
4,a01a65d1-9561-4727-9bab-7e061a56b453,79d6465a-2038-471c-8da4-ed75a63229a0,"[-0.015604484891350892, 0.016436031156722604, ...",Editing a marketing plan {#editing-a-marketing...,8783888e-0524-11ee-b655-f21f2e9d130d,"{""source"": ""..\\github-content\\docs_component..."


In [16]:
# Distinct collection ids present in the embeddings table
chroma_embeddings["collection_uuid"].unique()

array(['a01a65d1-9561-4727-9bab-7e061a56b453'], dtype=object)

### Create the Chroma Retriever

In [33]:
# No search_type is passed, so the default applies (by default search_type = "similarity").
# search_kwargs carries k=5 — presumably the number of documents returned per query; confirm in the LangChain docs.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [20]:
# Retrieve the documents the retriever considers relevant to a sample question
retriever.get_relevant_documents("where can I see the execution status of my tasks in Talend Cloud? ")

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


[Document(page_content='Displaying real time statistics during remote execution {#displaying-real-time-statistics-during-remote-execution_t .task}\n\nFrom Talend Administration Center, you can monitor in real time the execution status and performance of your processes. This allows you to identify any bottleneck during the data processing and gives you a real-time visibility on the progress of your Jobs.\n\nTo display the Real time statistics window during remote execution, do the following:\n\nIn the Menu tree view, click Job Conductor to display the list of scheduled tasks for deploying and executing Jobs on remote servers.\n\nEnsure the task you want to execute and display the real time statistics related has the Statistics option enabled.\nFor more information about the activation of the statistics from Talend Administration Center, see How to activate Real Time Statistics.\n\nSelect it in the list of tasks and check its Status.\nIt can be Ready to deploy or Ready to run.\n\nDependi

In [21]:
# Same retrieval call with a second sample question
retriever.get_relevant_documents("how to filter out rows with invalid emails in Talend Studio?")

[Document(page_content='Removing non-matching values {#removing-non-matching-values_t .task}\n\nThe email pattern used on the email column showed that some records do not respect the standard email format. You can generate a ready-to-use Job to recuperate the non-matching rows from the column.\n\nIn the Profiling perspective, click the Analysis Results tab at the bottom of the editor.\n\nIn the Pattern Matching results of the email column, right-click the chart bar or the numerical results and select Generate Job.\nThe Integration perspective opens showing the generated Job.\n\nThis Job uses the Extract Transform Load process to write in two separate output files the valid/invalid email rows that match/do not match the pattern.\n\nSave the Job and press F6 to execute it.\n\nThe valid and invalid rows of the email column are written in the defined output files.\n\nYou can replace the output files with different Talend components and recuperate the valid/invalid email rows and write them

In [24]:
# Maximal Marginal Relevance Retrieval
#
# Maximal Marginal Relevance (MMR) aims to select the most relevant results
# for a given query while maximizing the diversity of the selected results.
retriever_mmr = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5},
)
retriever_mmr.get_relevant_documents("where can I see the execution status of my tasks in Talend Cloud? ")

[Document(page_content='Displaying real time statistics during remote execution {#displaying-real-time-statistics-during-remote-execution_t .task}\n\nFrom Talend Administration Center, you can monitor in real time the execution status and performance of your processes. This allows you to identify any bottleneck during the data processing and gives you a real-time visibility on the progress of your Jobs.\n\nTo display the Real time statistics window during remote execution, do the following:\n\nIn the Menu tree view, click Job Conductor to display the list of scheduled tasks for deploying and executing Jobs on remote servers.\n\nEnsure the task you want to execute and display the real time statistics related has the Statistics option enabled.\nFor more information about the activation of the statistics from Talend Administration Center, see How to activate Real Time Statistics.\n\nSelect it in the list of tasks and check its Status.\nIt can be Ready to deploy or Ready to run.\n\nDependi

### RetrievalQA Chain

In [28]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model used by the QA chains; the API key is read from the environment
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

1- Example using a retriever with similarity as a search_type

In [34]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=False)

In [35]:
# Run the QA chain on a sample question; the chain's return value is the cell output
query = "where can I see the execution status of my tasks in Talend Cloud?"
qa_chain(query)

{'query': 'where can I see the execution status of my tasks in Talend Cloud?',
 'result': 'You can see the execution status of your tasks in Talend Cloud on the Job Conductor page. From there, you can monitor the execution status and performance of your processes in real time, identify any bottlenecks during data processing, and get real-time visibility on the progress of your Jobs. If you want to see the execution history of a specific task, you can select the task in the task list and click on the "Execution History" icon in the Actions column.'}

In [36]:
# Run the QA chain on a second sample question
query = "how to filter out rows with invalid emails in Talend Studio?"
qa_chain(query)

{'query': 'how to filter out rows with invalid emails in Talend Studio?',
 'result': 'You can use the Profiling perspective in Talend Studio to filter out rows with invalid emails. Here are the steps:\n\n1. Open the analysis results for the email column in the Profiling perspective.\n\n2. Identify the rows with invalid email addresses either by looking at the pattern matching results or by using the quality bar.\n\n3. Right-click on the chart bar or the numerical results for the invalid email addresses and select "Generate Job".\n\n4. Talend Studio will generate a job that uses the Extract Transform Load process to write the valid/invalid email rows to separate output files.\n\n5. Save the job and execute it by pressing F6.\n\n6. The valid and invalid rows of the email column will be written to the defined output files.\n\nYou can then replace the output files with different Talend components and retrieve the valid/invalid email rows and write them to databases, for example.'}

2- Example using a retriever with Maximal Marginal Relevance (MMR) as the search_type

In [29]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  chain_type="stuff", 
                                  retriever=retriever_mmr, 
                                  return_source_documents=False)

In [30]:
# Run the current `qa_chain` on the first sample question
query = "where can I see the execution status of my tasks in Talend Cloud?"
qa_chain(query)

{'query': 'where can I see the execution status of my tasks in Talend Cloud?',
 'result': 'You can monitor the execution status and performance of your processes in real-time from Talend Administration Center in Talend Cloud. To access this feature, you can click on "Job Conductor" in the Menu tree view to display the list of scheduled tasks for deploying and executing Jobs on remote servers. Once you select the task you want to execute and display the real-time statistics related to, its status will be displayed, and you can click on the appropriate button (Deploy or Run) depending on its status. Once the Job begins to run, the Real-time statistics window will pop up where you can see statistical information in real-time as the checkpoints Job runs.'}

In [31]:
# Run the current `qa_chain` on the second sample question
query = "how to filter out rows with invalid emails in Talend Studio?"
qa_chain(query)

{'query': 'how to filter out rows with invalid emails in Talend Studio?',
 'result': 'You can generate a Job to filter out rows with invalid email addresses in Talend Studio. Here are the steps:\n\n1. In the Profiling perspective, click the Analysis Results tab at the bottom of the editor.\n\n2. In the Pattern Matching results of the email column, right-click the chart bar or the numerical results and select Generate Job. The Integration perspective opens showing the generated Job.\n\n3. This Job uses the Extract Transform Load process to write in two separate output files the valid/invalid email rows that match/do not match the pattern.\n\n4. Save the Job and press F6 to execute it.\n\n5. The valid and invalid rows of the email column are written in the defined output files.\n\nYou can replace the output files with different Talend components and recuperate the valid/invalid email rows and write them in databases for example.'}