In [1]:
import os 
import sys
from dotenv import load_dotenv, find_dotenv

from langchain.document_transformers import LongContextReorder

# Chain building blocks
from langchain.chains import StuffDocumentsChain, LLMChain

# Prompt template class used to build the prompt
from langchain.prompts import PromptTemplate

In [2]:
# Imports 
# Env var
import os 
import sys
from dotenv import load_dotenv, find_dotenv

In [3]:
# Environment setup: add the directory two levels up to the import path,
# then load the variables of the .env file located by find_dotenv().
sys.path.append("../..")
load_dotenv(dotenv_path=find_dotenv())

True

In [4]:
# Connection settings for the Confluence instance queried by this notebook.
DATA_PATH = './data/confluence/'

# NOTE: despite its name, this constant holds the base URL of the wiki; it is
# the value passed as `url=` when the ConfluenceLoader is created.
CONFLUENCE_SPACE_NAME = 'https://florianbastin.atlassian.net/wiki'

# API token, read from the environment so it never appears in the notebook. See
# https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/
CONFLUENCE_API_KEY = os.environ['CONFLUENCE_PRIVATE_API_KEY']

# Hint: space_key and page_id can both be found in the URL of a page in
# Confluence - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
CONFLUENCE_SPACE_KEY = os.environ['CONFLUENCE_SPACE_KEY']

# Account name sent together with the API token. It can be overridden through
# the CONFLUENCE_USERNAME environment variable so other users do not have to
# edit the notebook; the previous hard-coded value remains the fallback.
CONFLUENCE_USERNAME = os.environ.get('CONFLUENCE_USERNAME', 'bastinflorian1@gmail.com')

In [5]:
from langchain.document_loaders import ConfluenceLoader

# Loader for the Confluence site, authenticated with username + API token.
confluence_auth = dict(
    url=CONFLUENCE_SPACE_NAME,
    username=CONFLUENCE_USERNAME,
    api_key=CONFLUENCE_API_KEY,
)
loader = ConfluenceLoader(**confluence_auth)

In [6]:
# Fetch the pages of the configured space as LangChain documents.
# `limit` and `max_pages` bound how much is fetched; see the ConfluenceLoader
# documentation for their exact semantics.
load_options = {
    "space_key": CONFLUENCE_SPACE_KEY,
    "limit": 10,
    # "include_attachments": True,  # uncomment to include png, jpeg, ..
    "max_pages": 50,
}
docs = loader.load(**load_options)

In [7]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Stage 1 - split each page on its markdown headers.
# Each tuple pairs a header marker with the metadata key that will hold the
# text of that header on the resulting sections.
headers_to_split_on = [
    ("#", "Titre"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Carry the page-level metadata over to every section. With `a | b` the right
# operand wins, so on a key collision the page metadata overrides the header
# metadata.
md_docs = []
for doc in docs:
    md_doc = markdown_splitter.split_text(doc.page_content)
    for section in md_doc:
        section.metadata = section.metadata | doc.metadata
    md_docs.extend(md_doc)

# Stage 2 - cut the header sections into chunks of at most 500 characters
# with a 20-character overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    # The lookbehind is written as a raw string: "\." in a normal literal is an
    # invalid escape sequence (a warning today, an error in a future Python).
    # NOTE(review): LangChain releases that expose `is_separator_regex` treat
    # separators literally unless that flag is set - confirm against the
    # installed version whether this pattern is actually applied as a regex.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
)

splitted_docs = splitter.split_documents(md_docs)

In [8]:
# Folder handed to Chroma as its persistence directory.
persist_directory = "./db/chroma"

In [10]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding model (library defaults) that is given to the vector store.
embeddings = OpenAIEmbeddings()

In [11]:
from langchain.vectorstores import Chroma

# Vector store backed by the on-disk folder `persist_directory`; the
# embedding model is supplied as the store's embedding function.
db = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

In [12]:
# Sanity check: display how many entries the underlying collection holds.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper, so
# this line may break when the library is upgraded.
db._collection.count()

108

In [13]:
# Retriever on top of the vector store: similarity search with a score
# threshold of 0.3 and k=5. (Tip: db.get() shows the raw stored entries.)
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.3},
)

In [14]:
# Question-answering prompt with two placeholders, {context} and {question}.
# The wording was corrected ("these text extracts", "answer the following
# question") so the instruction sent to the model is grammatical.
template = """Given these text extracts:
    -----
    {context}
    -----
    Please answer the following question:
    Question: {question}
    Answer: 
    """

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [15]:
from langchain.llms import OpenAI

# Language model used by the QA chain, created with the library defaults.
llm = OpenAI()

In [22]:
from langchain.chains import RetrievalQA

question = "Comment faire une photo de profil Octo ? "

# Custom prompt options. They are currently NOT handed to the chain: the
# corresponding argument is disabled in the call below.
# NOTE(review): the {context}/{question} prompt presumably targets the "stuff"
# chain type - confirm before re-enabling it together with "refine".
chain_type_kwargs = {"prompt": prompt}

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="refine",  # other built-in choices: "stuff", "map_reduce", "map_rerank"
    # chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True,
)

# Run the chain; the result dict is stored in `answer` for the next cell.
query = {"query": question}
answer = qa(query)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [23]:
# Show only the generated text; the source documents requested from the
# chain are not printed here.
result_text = answer["result"]
print(result_text)



Pour faire une photo de profil Octo, vous pouvez demander à un photographe professionnel de vous prendre en photo avec votre meilleur profil “I am a STAR”, ou utiliser un trépied ou une surface plate pour prendre une photo de vous-même. Ensuite, cadrez le buste entier et enregistrez la photo au format poly.png. Vous pouvez ensuite télécharger la photo sur le dossier de partage « OCTO'S PICS » sur la page Facebook [OctoTechnology](http://www.facebook.com/OctoTechnology). N'oubliez pas de retoucher la photo avant de la télécharger et de vous assurer que votre photo est liée à votre profil d'utilisateur sur tous les outils et services OCTO, tels que Confluence, Octopod, PeopleDoc, AskBob, Gmail + Google Agenda + Mailing list, et au catalogue de formation sur le site web LevelUp, afin de faciliter vot
