### Initialisation
###### This section constructs our Python environment for running Contract AI
###### This code must be run before any of the following sections can be used

In [1]:
# Install handler libraries
# pip is run through the kernel's own interpreter so that packages are installed into
# the environment this notebook executes in, and its exit status is checked so that a
# failed install reaches the except branch instead of printing the success message.
# NOTE(review): azure-storage-blob is imported later in this notebook but is not
# installed by this cell - confirm it is provided by the environment.
import subprocess
import sys

handlerLibraries = [
    "tiktoken",
    "pypdf",
    "alive-progress",
    "langchain-openai==0.0.6",
    "openai==1.0.0",
    "langchain==0.1.7",
    "pydantic==1.10.0",
    "azure-search-documents==11.4.0",
    "typing-extensions==4.2.0"
    ]

def run_pip(*arguments):
    """Run pip with the given arguments, echo its output and return its exit status."""
    completed = subprocess.run(
        [sys.executable, "-m", "pip", *arguments],
        capture_output = True,
        text = True
        )
    print(completed.stdout, end = "")
    print(completed.stderr, end = "")
    return completed.returncode

try:
    # Purging the cache is best-effort, so its exit status is deliberately ignored
    run_pip("cache", "purge")
    # Packages are installed one at a time, in the same order as before
    for library in handlerLibraries:
        if run_pip("install", library) != 0:
            raise RuntimeError(f"pip install {library} failed")
    print("\nSuccessfully installed handler libraries")
except Exception as errorMessage:
    print("\nError installing handler libraries\n")
    print(errorMessage)

In [74]:
# Import handler libraries
try:
    import os
    import io
    import csv
    import json
    import numpy
    import tempfile
    import requests
    import openai
    import tiktoken
    import hashlib
    from alive_progress import alive_bar
    from datetime import datetime
    from azure.core.credentials import AzureKeyCredential
    from azure.storage.blob import BlobServiceClient
    from azure.search.documents import SearchClient
    from langchain_openai import AzureOpenAI
    from langchain_openai import AzureChatOpenAI
    from langchain_openai import AzureOpenAIEmbeddings
    from langchain_community.vectorstores.azuresearch import AzureSearch
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.chains.summarize import load_summarize_chain
    from langchain.chains import ConversationalRetrievalChain
    from langchain.schema.document import Document
    from langchain.prompts import PromptTemplate
    from langchain.chains.llm import LLMChain
    from langchain.chains import RetrievalQA
    from azure.search.documents.indexes.models import (
        SearchFieldDataType,
        SearchableField,
        SearchField,
        SimpleField
        )
    from langchain.document_loaders import (
        DirectoryLoader,
        TextLoader,
        JSONLoader,
        CSVLoader,
        PyPDFLoader,
        Docx2txtLoader
        )
    print("Successfully imported handler libraries")
except Exception as errorMessage:
    print("Error importing handler libraries\n")
    print(errorMessage)

In [75]:
# Set our reference variables
# Secrets are read from environment variables so that they are never stored in the
# notebook. A missing variable leaves the value as None and is reported below.
# NOTE(review): the keys that were previously written in this cell should be treated
# as exposed and rotated.
openaiType = "azure"
openaiKey = os.environ.get("CONTRACT_AI_OPENAI_KEY")
openaiBase = "https://contract-ai-oai.openai.azure.com/"
openaiVersion = "2023-12-01-preview"
openaiEmbeddingDeployment = "contract-ai-oai-embedding"
openaiEmbeddingModel = "text-embedding-ada-002"
openaiGptDeployment = "contract-ai-oai-gpt"
openaiGptModel = "gpt-4"

searchVersion = "2023-11-01"
searchName = "contract-ai-srch"
searchEndpoint = "https://contract-ai-srch.search.windows.net"
searchIndex = "contract-ai-srch-index"
searchCredential = os.environ.get("CONTRACT_AI_SEARCH_KEY")

adlsConnectionString = os.environ.get("CONTRACT_AI_ADLS_CONNECTION_STRING")
adlsContainerInput = "landing-zone"
adlsFolderFilesInput = "files/test-nuh-small/"
adlsFolderQuestionsInput = "questions/test-nuh/"
adlsContainerOutput = "processing-zone"
adlsFolderFilesOutput = "files/test-nuh/"
adlsFolderQuestionsOutput = "answers/test-nuh/answers.csv"

# Report missing secrets here, where the cause is obvious, rather than as an
# authentication failure in a later cell
missingSecrets = [
    name
    for name in ("CONTRACT_AI_OPENAI_KEY", "CONTRACT_AI_SEARCH_KEY", "CONTRACT_AI_ADLS_CONNECTION_STRING")
    if not os.environ.get(name)
    ]
if missingSecrets:
    print("Missing environment variables: " + ", ".join(missingSecrets))

# Glob pattern for each supported file type, paired with the loader class that reads it
fileTypeMapping = dict([
    ("*.txt", TextLoader),
    ("*.json", JSONLoader),
    ("*.csv", CSVLoader),
    ("*.pdf", PyPDFLoader),
    ("*.docx", Docx2txtLoader)
    ])

# Prompt used to answer a question from retrieved context.
# It exposes two placeholders: {question} and {context}.
questionPrompt = PromptTemplate.from_template(template = """
    You are a professional assistant who will be reviewing contractual documents.
    Your response to the given question should be factual and as accurate as possible, solely based on the provided context.
    Your response should state the answer as concisely as possible and then concisely provide any surrounding context of where the answer can be found in the document.
    You may use synonyms of the words in the given question to help search for an answer.
    If you cannot answer the given question with certainty then please state that fact and end your response.
    The question you are being presented with is between the following set of exclamation points: !{question}!
    The context which you may use to respond to this question can be found after this colon:
    {context}
    """)

In [76]:
# Set our OpenAI variables
# NOTE(review): every client created below is also given these values explicitly -
# confirm whether the module-level settings are still required.
try:
    openai.api_type = openaiType
    openai.api_key = openaiKey
    openai.api_base = openaiBase
    openai.api_version = openaiVersion
    print("OpenAI environment variables initialised\n")
except Exception as errorMessage:
    print("Error intialising OpenAI environment variables\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our embeddings model
# The client is built from the Azure OpenAI endpoint, key and embedding deployment
# set in the reference variables
try:
    embeddingsClient = AzureOpenAIEmbeddings(
        azure_endpoint = openaiBase,
        deployment = openaiEmbeddingDeployment,
        model = openaiEmbeddingModel,
        openai_api_key = openaiKey,
        openai_api_type = openaiType,
        openai_api_version = openaiVersion
        )
    print("Embeddings client initialised\n")
except Exception as errorMessage:
    print("Error intialising embeddings client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our search model
# This client talks to the search index directly, using the search admin credential
try:
    searchClient = SearchClient(
        endpoint = searchEndpoint,
        index_name = searchIndex,
        credential = AzureKeyCredential(searchCredential)
        )
    print("Search client initialised\n")
except Exception as errorMessage:
    print("Error intialising search client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our vector model
# The vector store wraps the same search index and embeds queries with the embeddings client
try:
    vectorClient = AzureSearch(
        azure_search_endpoint = searchEndpoint,
        azure_search_key = searchCredential,
        index_name = searchIndex,
        embedding_function = embeddingsClient.embed_query
        )
    print("Vector client initialised\n")
except Exception as errorMessage:
    print("Error intialising vector client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our llm chat model
# The chat model is addressed through the GPT deployment set in the reference variables
try:
    llmClient = AzureChatOpenAI(
        azure_endpoint = openaiBase,
        deployment_name = openaiGptDeployment,
        openai_api_key = openaiKey,
        openai_api_type = openaiType,
        openai_api_version = openaiVersion
        )
    print("LLM client initialised\n")
except Exception as errorMessage:
    print("Error intialising LLM client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our retrieval chain model
# questionPrompt has {question} and {context} placeholders, which is what the answering
# step fills in, so it is supplied through combine_docs_chain_kwargs. The question
# condensing step is formatted with the chat history instead and keeps its default prompt.
try:
    qaClient = ConversationalRetrievalChain.from_llm(
        llm = llmClient,
        retriever = vectorClient.as_retriever(),
        combine_docs_chain_kwargs = {"prompt": questionPrompt},
        return_source_documents = True,
        verbose = False
        )
    print("QA client initialised\n")
except Exception as errorMessage:
    print("Error intialising QA client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

# Create our text splitter
# Chunks are at most 800 characters long and overlap their neighbour by 200
try:
    splitterSettings = {"chunk_size": 800, "chunk_overlap": 200}
    textSplitter = RecursiveCharacterTextSplitter(**splitterSettings)
    print("Text splitter initialised")
except Exception as errorMessage:
    print("Error intialising text splitter\n")
    print(errorMessage)

In [77]:
# Test our connection to our model
# A single short query is embedded; a non-empty result counts as a working connection
try:
    embeddingsTest = embeddingsClient.embed_query("Ciao")
except Exception as errorMessage:
    print("Error testing connection\n")
    print(errorMessage)
else:
    if embeddingsTest:
        print("Successfully tested connection")

### Index Rebuild
###### This section deletes and rebuilds the vector database index
###### It only needs running if documents need deleting and can otherwise be skipped

In [91]:
# Delete our existing index
try:
    apiUrl = f"https://{searchName}.search.windows.net/indexes/{searchIndex}?api-version={searchVersion}"
    apiHeaders = {
        "Content-Type": "application/json",
        "api-key": searchCredential
        }
    # A timeout stops the cell hanging indefinitely if the service does not respond
    apiResponse = requests.delete(apiUrl, headers = apiHeaders, timeout = 60)
    if apiResponse.status_code == 204:
        print("Successfully deleted index")
    elif apiResponse.status_code == 404:
        # The index is already absent, which is the state this cell is trying to reach
        print("Index does not exist, nothing to delete")
    else:
        print("Error deleting index")
        print(f"Status code: {apiResponse.status_code}")
        # The body is printed as text because an error response is not guaranteed to be JSON
        print(f"Response: {apiResponse.text}")
except Exception as errorMessage:
    print("Error deleting index\n")
    print(errorMessage)

In [79]:
# Rebuild our new index
try:
    apiUrl = f"https://{searchName}.search.windows.net/indexes/{searchIndex}?api-version={searchVersion}"
    apiHeaders = {
        "Content-Type": "application/json",
        "api-key": searchCredential
        }
    # The schema holds the chunk text, its 1536-dimension vector and the chunk metadata
    # fields written by the Vectorisation section
    indexSchema = {
        "name": searchIndex,
        "fields": [
            {"name": "id", "type": "Edm.String", "key": True, "searchable": True, "filterable": True, "sortable": True, "facetable": True},
            {"name": "content", "type": "Edm.String", "searchable": True, "filterable": True, "sortable": True, "facetable": True, "analyzer": "standard.lucene"},
            {"name": "content_vector", "type": "Collection(Edm.Single)", "searchable": True, "dimensions": 1536, "vectorSearchProfile": "vector-profile"},
            {"name": "chunk_id", "type": "Edm.Int64", "searchable": False, "filterable": True, "sortable": True, "facetable": True},
            {"name": "chunk_source", "type": "Edm.String", "searchable": True, "filterable": True, "sortable": True, "facetable": True, "analyzer": "standard.lucene"},
            {"name": "chunk_type", "type": "Edm.String", "searchable": True, "filterable": True, "sortable": True, "facetable": True, "analyzer": "standard.lucene"},
            {"name": "chunk_datetime", "type": "Edm.DateTimeOffset", "searchable": False, "filterable": True, "sortable": True, "facetable": True}
            ],
        "similarity": {"@odata.type": "#Microsoft.Azure.Search.BM25Similarity", "k1": None, "b": None},
        "vectorSearch": {
            "algorithms": [{"name": "vector-config", "kind": "hnsw", "hnswParameters": {"metric": "cosine", "m": 4, "efConstruction": 400, "efSearch": 500}}],
            "profiles": [{"name": "vector-profile", "algorithm": "vector-config"}]
            }
        }
    # A timeout stops the cell hanging indefinitely if the service does not respond
    apiResponse = requests.put(apiUrl, headers = apiHeaders, json = indexSchema, timeout = 60)
    # 201 is returned when the index is created. A PUT against an index that already
    # exists updates it and answers with 200 or 204, which is also a success.
    if apiResponse.status_code in (200, 201, 204):
        print("Successfully rebuilt index")
    else:
        print("Error rebuilding index")
        print(f"Status code: {apiResponse.status_code}")
        # The body is printed as text because an error response is not guaranteed to be JSON
        print(f"Response: {apiResponse.text}")
except Exception as errorMessage:
    print("Error rebuilding index\n")
    print(errorMessage)

In [80]:
# Recreate our vector model
# The vector store is rebuilt so that it points at the index created above
try:
    vectorClient = AzureSearch(
        azure_search_endpoint = searchEndpoint,
        azure_search_key = searchCredential,
        index_name = searchIndex,
        embedding_function = embeddingsClient.embed_query
        )
    print("Vector client re-initialised\n")
except Exception as errorMessage:
    print("Error re-intialising vector client\n")
    # An exception cannot be concatenated with a string, so it is formatted instead
    print(f"{errorMessage}\n")

### Vectorisation
###### This section translates input documents into vectors
###### It only needs running to update the vector database and can otherwise be skipped

In [81]:
# Create our ADLS connection
# containerClient is the handle onto the input container that load_files reads from
try:
    blobClient = BlobServiceClient.from_connection_string(adlsConnectionString)
    containerClient = blobClient.get_container_client(container = adlsContainerInput)
    print("Successfully connected to blob storage")
except Exception as errorMessage:
    print("Error connecting to blob storage\n")
    print(errorMessage)

# Define our file loading function
def load_files(folderName):
    """Download every blob found under a folder of the input container.

    Args:
        folderName: Blob name prefix used to list the container.

    Returns:
        A list of [file name, io.BytesIO holding the blob content] pairs. The file
        name is the part of the blob name after its last "/".
    """
    blobData = []
    for blob in containerClient.list_blobs(name_starts_with = folderName):
        fileName = blob.name.split("/")[-1]
        # A blob name ending in "/" has no file name part and could not be written to disk
        if not fileName:
            continue
        fileClient = containerClient.get_blob_client(blob.name)
        blobData.append([fileName, io.BytesIO(fileClient.download_blob().readall())])
    return blobData

# Load our blob data to temp
try:
    blobData = load_files(adlsFolderFilesInput)
    tempPdfs = []
    # The directory object is kept in a variable because the directory is removed once
    # that object is cleaned up
    tempDir = tempfile.TemporaryDirectory()
    for filename, byte_content in blobData:
        filePath = os.path.join(tempDir.name, filename)
        with open(filePath, "wb") as file:
            file.write(byte_content.getbuffer())
        tempPdfs.append(filePath)
    if len(tempPdfs) > 0:
        print("\nSuccessfully loaded blobs")
    else:
        # Say so explicitly, otherwise an empty folder looks the same as a cell that never ran
        print(f"\nNo blobs found under {adlsFolderFilesInput}")
except Exception as errorMessage:
    print("\nError loading blobs\n")
    print(errorMessage)

In [86]:
# Define our hashing function
def hash_text(text: str):
    """Return the SHA-256 hex digest of the UTF-8 encoding of the given text."""
    encodedText = text.encode("utf-8")
    return hashlib.sha256(encodedText).hexdigest()

# Vectorise our blobs
# Every file type is loaded, chunked and embedded in turn. A failing stage skips the
# rest of that file type, so documents or chunks left over from the previous file type
# are never vectorised a second time.
vectorStore = []
for pattern, loader in fileTypeMapping.items():
    fileType = pattern[2:]
    loadDir = DirectoryLoader(tempDir.name, glob = pattern, loader_cls = loader)
    try:
        document = loadDir.load_and_split()
        documentList = {sourceDoc.metadata["source"].split("/")[-1] for sourceDoc in document}
    except Exception as errorMessage:
        print(f"Error loading {fileType} files\n")
        print(errorMessage)
        continue
    if not document:
        # No files of this type were found
        continue
    print(f"Successfully loaded {fileType} files:")
    for item in documentList:
        print(f" - {item}")
    try:
        chunks = textSplitter.split_documents(document)
    except Exception as errorMessage:
        print(f"\nError chunking {fileType} files\n")
        print(errorMessage)
        continue
    if len(chunks) > 0:
        print(f"\nSuccessfully chunked {fileType} files")
    try:
        for i, chunk in enumerate(chunks):
            chunkFilePath = chunk.metadata["source"]
            chunkSource = chunkFilePath.split("/")[-1]
            vectorStore.append({
                # The file name is part of the key so that identical text found in two
                # files is stored once per file rather than one overwriting the other.
                # NOTE(review): entries indexed earlier were keyed on content alone -
                # rebuild the index before re-uploading to avoid duplicates.
                "id": hash_text(f"{chunkSource}:{chunk.page_content}"),
                "content": chunk.page_content,
                "content_vector": embeddingsClient.embed_query(chunk.page_content),
                "chunk_id": i,
                "chunk_source": chunkSource,
                "chunk_type": chunkFilePath.split(".")[-1],
                "chunk_datetime": datetime.utcnow()
                })
        print(f"\nSuccessfully vectorised {fileType} files")
    except Exception as errorMessage:
        print(f"\nError vectorising {fileType} files\n")
        print(errorMessage)
if len(vectorStore) > 0:
    print("\nSuccessfully chunked and vectorised all files")

In [83]:
# Upload our vector store list to cognitive services
# Documents are sent in batches rather than one request each, and the result returned
# for every document is checked so that a rejected document is reported.
# NOTE(review): 100 per batch is assumed to stay within the service's request size limit - confirm.
uploadBatchSize = 100
try:
    failedKeys = []
    for batchStart in range(0, len(vectorStore), uploadBatchSize):
        vectorBatch = vectorStore[batchStart:batchStart + uploadBatchSize]
        results = searchClient.upload_documents(documents = vectorBatch)
        failedKeys.extend(result.key for result in results if not result.succeeded)
    if not vectorStore:
        print("No vector documents to index")
    elif failedKeys:
        print("Error indexing vector documents\n")
        print(f"{len(failedKeys)} documents were rejected, first key: {failedKeys[0]}")
    else:
        print("Successfully indexed vector documents")
except Exception as errorMessage:
    print("Error indexing vector documents\n")
    print(errorMessage)

### Preset Question Chain
###### This section queries our provided questions against the documents in the vector database
###### Input questions should be provided in the blob storage and output answers can be found alongside them

In [87]:
# Create our ADLS connection
# containerClient is the handle onto the input container that load_questions reads from
try:
    blobClient = BlobServiceClient.from_connection_string(adlsConnectionString)
    containerClient = blobClient.get_container_client(container = adlsContainerInput)
    print("Successfully connected to blob storage")
except Exception as errorMessage:
    print("Error connecting to blob storage\n")
    print(errorMessage)

# Define our file loading function
def load_questions(folderName):
    """Download every question file found under a folder of the input container.

    Args:
        folderName: Blob name prefix used to list the container.

    Returns:
        A list of [file name, io.BytesIO holding the blob content] pairs. The file
        name is the part of the blob name after its last "/".
    """
    questionsInput = []
    for blob in containerClient.list_blobs(name_starts_with = folderName):
        fileName = blob.name.split("/")[-1]
        # A blob name ending in "/" has no file name part and could not be written to disk
        if not fileName:
            continue
        fileClient = containerClient.get_blob_client(blob.name)
        questionsInput.append([fileName, io.BytesIO(fileClient.download_blob().readall())])
    return questionsInput

# Load our questions
# questionsList is defined up front so that later cells see an empty list, rather than
# an undefined name, if loading fails part-way
questionsList = []
try:
    questionsInput = load_questions(adlsFolderQuestionsInput)
    tempQuestionsList = []
    tempQuestionsDir = tempfile.TemporaryDirectory()
    for filename, byte_content in questionsInput:
        filePath = os.path.join(tempQuestionsDir.name, filename)
        with open(filePath, "wb") as file:
            file.write(byte_content.getbuffer())
        tempQuestionsList.append(filePath)
    if not tempQuestionsList:
        raise FileNotFoundError(f"No question files found under {adlsFolderQuestionsInput}")
    # Only the first question file is read, with one question per line. Blank lines are
    # skipped so that they do not become empty questions and empty column headers.
    with open(tempQuestionsList[0], "r") as file:
        for line in file:
            question = line.strip()
            if question:
                questionsList.append(question)
    print("\nSuccessfully loaded questions")
except Exception as errorMessage:
    print("\nError loading questions\n")
    print(errorMessage)

# Load our file list
try:
    try:
        sourcePaths = list(tempPdfs)
    except NameError:
        # tempPdfs only exists when the Vectorisation section has been run in this
        # session, so fall back to listing the input folder directly
        sourcePaths = [blob.name for blob in containerClient.list_blobs(name_starts_with = adlsFolderFilesInput)]
    filesList = []
    for sourcePath in sourcePaths:
        fileName = sourcePath.split("/")[-1]
        # Names ending in "/" carry no file name
        if fileName:
            filesList.append(fileName)
    print("\nSuccessfully loaded file list")
except Exception as errorMessage:
    print("\nError loading file list\n")
    print(errorMessage)

In [88]:
# Define our question response function
def question_response_chain(question, fileName):
    """Answer one question using only the indexed chunks of one file.

    Args:
        question: The question text passed to the chain as its query.
        fileName: The chunk_source value that retrieval is restricted to.

    Returns:
        A tuple of (answer text, source documents returned by the chain).
    """
    # The vector store reads its restriction from "filters" as an OData expression.
    # A single quote inside the file name is escaped by doubling it.
    escapedFileName = str(fileName).replace("'", "''")
    qaChain = RetrievalQA.from_chain_type(
        llmClient,
        retriever = vectorClient.as_retriever(
            search_kwargs = {
                "k": 10,
                "filters": f"chunk_source eq '{escapedFileName}'"
            }
        ),
        return_source_documents = True,
        chain_type_kwargs = {"prompt": questionPrompt}
    )
    botMessage = qaChain({"query": question})
    answer = botMessage["result"]
    textSource = botMessage["source_documents"]
    return answer, textSource

In [89]:
# Run question response chain
# One row is built per file: the file name followed by the answer to every question.
# A file whose questions raise is reported and contributes no row.
data = []
print("Querying files:")
for item in filesList:
    print(f" - {item}")
for file in filesList:
    try:
        print(f"\nQuerying file: {file}")
        fileAnswers = []
        with alive_bar(len(questionsList)) as bar:
            for question in questionsList:
                answer, textSource = question_response_chain(question, file)
                fileAnswers.append(answer)
                bar()
        data.append([file] + fileAnswers)
    except Exception as errorMessage:
        print(f"\nError querying file: {file}\n")
        print(errorMessage)
if data:
    print("\nSuccessfully queried all files")

In [90]:
# Create our ADLS connection
# blobClient addresses the single output blob that the answers are written to
try:
    blobServiceClient = BlobServiceClient.from_connection_string(adlsConnectionString)
    blobClient = blobServiceClient.get_blob_client(
        container = adlsContainerOutput,
        blob = adlsFolderQuestionsOutput
        )
    print("Successfully connected to blob storage")
except Exception as errorMessage:
    print("Error connecting to blob storage\n")
    print(errorMessage)

# Output results into blob storage
# The CSV has a header row of "File" followed by the questions, then one row per file
try:
    headers = ["File"] + questionsList
    # The CSV is built in memory so that no temporary file is left behind on disk.
    # newline = "" leaves the writer's own line endings untouched.
    resultsBuffer = io.StringIO(newline = "")
    writer = csv.writer(resultsBuffer)
    writer.writerow(headers)
    writer.writerows(data)
    blobClient.upload_blob(resultsBuffer.getvalue().encode("utf-8"), overwrite = True)
    print("\nSuccessfully saved results")
except Exception as errorMessage:
    print("\nError saving results\n")
    print(errorMessage)

### Conversational Chain
###### This section can be used to demonstrate the question/answer capability of Contract AI
###### To make a query, edit line 3 with your question and then press the play button to the left of the cell

In [43]:
# Query our model for response
# The chain is called with the question and an empty chat history
chatHistory = []
query = "What is the average lengths of the contracts?"
try:
    qaInputs = dict(question = query, chat_history = chatHistory)
    result = qaClient(qaInputs)
    answerText = result["answer"]
    print(f"Question: {query}")
    print(f"Answer: {answerText}")
except Exception as errorMessage:
    print("Error querying\n")
    print(errorMessage)

In [44]:
# Alternate querying method
# This searches the vector store directly and prints the text of the first hit
query = "What is the average lengths of the contracts?"
try:
    result = vectorClient.hybrid_search(query = query, k = 3)
    print(f"Question: {query}")
    if result:
        print(f"Answer: {result[0].page_content}")
    else:
        # An empty result would otherwise surface as "list index out of range"
        print("Answer: no matching content found")
except Exception as errorMessage:
    print("Error querying\n")
    print(errorMessage)

### Keyword Search Chain
###### This section contains experimental upgrades to Contract AI functionality and is not yet implemented

In [45]:
# Define our keyword search function
def keyword_search(keyword):
    """Search the index for a keyword and group the matching chunks by source.

    Args:
        keyword: Text placed inside the search query.

    Returns:
        A tuple of three items:
        - the set of chunk sources that matched,
        - a list with one {source: [content, score]} dict per matching chunk,
        - a list with one Document per source, whose page_content is that source's
          chunk contents joined with spaces and whose metadata holds the source.
    """
    searchQuery = f"Content: '{keyword}'"
    # The search is run once and kept in memory, so the progress bar total does not
    # need a second request
    results = list(searchClient.search(searchQuery))
    chunkSources = set()
    chunkContentScores = []
    chunkSummariesProcessing = {}
    with alive_bar(len(results)) as bar:
        for result in results:
            chunkSource = result["chunk_source"]
            chunkContent = result["content"]
            chunkScore = result["@search.score"]
            chunkSources.add(chunkSource)
            chunkContentScores.append({chunkSource: [chunkContent, chunkScore]})
            chunkSummariesProcessing.setdefault(chunkSource, []).append(chunkContent)
            bar()
    chunkSummaries = [
        Document(page_content = " ".join(chunkTexts), metadata = {"source": chunkSource})
        for chunkSource, chunkTexts in chunkSummariesProcessing.items()
    ]
    return chunkSources, chunkContentScores, chunkSummaries

In [46]:
# Provide our chunk summarisation prompt
# The template exposes a single placeholder, {chunkText}, for the text to summarise
summarisationPromptTemplate = PromptTemplate.from_template(template = """
    You are an assistant and your task is to provide a complete and comprehensive summary.
    The summary has to include all details and it has to be very clear.
    The summary should be easy to understand for someone who has not read the original document.
    To provide a better understanding of the topic, please pay close attention to numerical data such as dates, numbers, and statistics.
    If the context is short, or if the information provided is duplicative, then please be concise with your response.
    Use no more than five sentences in your response, using as few as possible based on your confidence in the response provided.
    As a good assistant, memorise all these instructions and provide the summary of the following text after the colon:
    {chunkText}
""")

# Define our summarisation client function
def summarisation_client(promptTemplate, llmClient, chunkText):
    """Run the given text through an LLM chain built from the prompt and return its "text" output."""
    return LLMChain(llm = llmClient, prompt = promptTemplate)(chunkText)["text"]

# Define our search result summarisation function
def keyword_search_summarisation(chunkSummaries):
    """Summarise the combined text of each source returned by a keyword search.

    Args:
        chunkSummaries: Documents exposing page_content and metadata["source"].

    Returns:
        A dict mapping each source to its summary text. When a source occurs more
        than once, its summaries are joined with a newline.
    """
    results = {}
    with alive_bar(len(chunkSummaries)) as bar:
        for chunk in chunkSummaries:
            chunkSource = chunk.metadata["source"]
            summary = summarisation_client(summarisationPromptTemplate, llmClient, chunk.page_content)
            if chunkSource in results:
                # Summaries are strings, so a repeated source is extended by concatenation
                results[chunkSource] = f"{results[chunkSource]}\n{summary}"
            else:
                results[chunkSource] = summary
            bar()
    return results

In [47]:
# Define our search score filtering function
def keyword_search_filter(chunkContentScores, narrowRangeThreshold = 0.5, largeChunkThreshold = 100, topPercentile = 10):
    """Keep only the stronger search scores of each source.

    Args:
        chunkContentScores: List of {source: [content, score]} dicts, one per chunk.
        narrowRangeThreshold: A source whose scores span no more than this keeps all of them.
        largeChunkThreshold: A source with more scores than this is cut at a percentile.
        topPercentile: Size of the top percentile kept for such a source.

    Returns:
        A dict mapping each source to its kept scores, highest first. Sources that
        fall under neither rule above keep the scores above their mean.
    """
    chunkContentScoresSorted = sorted(chunkContentScores, key = lambda entry: list(entry.values())[0][1], reverse = True)
    # Scores are grouped per source in one pass over the sorted chunks, instead of
    # re-scanning every chunk for every source
    chunkSourceScores = {list(item.keys())[0]: [] for item in chunkContentScoresSorted}
    for item in chunkContentScoresSorted:
        for source, chunkDetail in item.items():
            if source in chunkSourceScores:
                chunkSourceScores[source].append(chunkDetail[1])
    chunkSourceScoresFiltered = {}
    with alive_bar(len(chunkSourceScores)) as bar:
        for source, scores in chunkSourceScores.items():
            scoreMean = numpy.mean(scores)
            scoreMedian = numpy.median(scores)
            scoreRange = max(scores) - min(scores)
            print(f"{source}  -  Mean = {scoreMean}, Median = {scoreMedian}, Range = {scoreRange}")
            if scoreRange <= narrowRangeThreshold:
                # The scores are too close together to separate, so all are kept
                chunkSourceScoresFiltered[source] = list(scores)
            else:
                if len(scores) > largeChunkThreshold:
                    scoreThreshold = numpy.percentile(scores, 100 - topPercentile)
                else:
                    scoreThreshold = scoreMean
                chunkSourceScoresFiltered[source] = [score for score in scores if score > scoreThreshold]
            bar()
    return chunkSourceScoresFiltered

In [48]:
# Test the output of a keyword search
# The search output is unpacked into its three parts before the scores are filtered
keyword = "What is the average lengths of the contracts?"
try:
    keywordSearchOutput = keyword_search(keyword)
    chunkSources, chunkContentScores, chunkSummaries = keywordSearchOutput
    chunkSourceScoresFiltered = keyword_search_filter(chunkContentScores)
except Exception as errorMessage:
    print(errorMessage)

In [50]:
# Test the output of the keyword search summarisation
# The summaries are printed only when summarisation finished without an error
try:
    result = keyword_search_summarisation(chunkSummaries)
except Exception as errorMessage:
    print(errorMessage)
else:
    print(result)