In [87]:
## Import libraries
import os
import json
import pandas as pd
import traceback
from typing import Union
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from dotenv import load_dotenv

In [88]:
## Load the .env file into the process environment, then read the settings used below.
## Each value is None when its variable is not set.
load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX_NAME")

### Step 1: Read Source Data Files (data cleaning, splitting into chunks, and storing in the Vector Database follow in Steps 2-4)


Data Source 1: Interface Functional Specification Document (PDF)

Data Source 2: Interface Data Flow (PDF)

Data Source 3: Production Support Issues & Resolutions (CSV)

Data Source 4: Interface Mapping Sheet (XLSX)

Data Source 5: Interface Architecture, Failure Modes & Error Handling Mechanism (PPTX)

In [89]:
## Folder path for the data source files.
## Set the SOURCE_FILES_PATH environment variable (e.g. in .env) to run the notebook on
## another machine; the original local folder is kept as the fallback.
SOURCE_FILES_PATH = os.getenv("SOURCE_FILES_PATH") or "C:\\Users\\ASHOKKUMAR KALIAPPAN\\Documents\\Ashok\\MSc_DataAnalytics\\Final_Project\\Doc\\"
## File names are appended with plain string concatenation below, so the folder
## must end with a path separator.
if not SOURCE_FILES_PATH.endswith(("\\", "/")):
    SOURCE_FILES_PATH += os.sep

In [90]:
## Function for PDF extraction
def read_data_from_pdf(FILE_PATH,EXTRACT_IMAGE_INPUT):
    """Load a PDF with PyPDFLoader and return the result of its ``load()`` call.

    FILE_PATH: path of the PDF file to read.
    EXTRACT_IMAGE_INPUT: forwarded to PyPDFLoader as ``extract_images``.
    """
    return PyPDFLoader(FILE_PATH, extract_images=EXTRACT_IMAGE_INPUT).load()

In [91]:
## Read Data Source 1: Interface Functional Specification Document (PDF), image extraction off
extracted_data_fsd = read_data_from_pdf(
    FILE_PATH=SOURCE_FILES_PATH + "Engineering_Datahub_Interface_FSD.pdf",
    EXTRACT_IMAGE_INPUT=False,
)

In [92]:
## Read Data Source 2: Interface Data Flow (PDF), image extraction on
extracted_data_interfaceflow = read_data_from_pdf(
    FILE_PATH=SOURCE_FILES_PATH + "Engineering_Datahub_Interface_ProcessFlow.pdf",
    EXTRACT_IMAGE_INPUT=True,
)
## Debug aid (disabled): show the text extracted from the first page
## print(extracted_data_interfaceflow[0].page_content)

In [None]:
## Read Data Source 3: Production Support Issues & Resolutions (CSV)


### Step 2: Data Cleansing

In [57]:
def clean_pdf():
    """Placeholder for the PDF text-cleaning step; currently does nothing and returns None.

    TODO: lemmatization, stop-word removal and special-character stripping (regex).
    """
    return None

### Step 3: Split Data into Chunks

In [94]:
## Function to split document into chunks
def chunk_data(docs,CHUNK_SIZE,CHUNK_OVERLAP):
    """Split ``docs`` with a RecursiveCharacterTextSplitter and return the chunks.

    docs: documents to split (passed to ``split_documents``).
    CHUNK_SIZE: forwarded to the splitter as ``chunk_size``.
    CHUNK_OVERLAP: forwarded to the splitter as ``chunk_overlap``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return splitter.split_documents(docs)

In [95]:
## Chunk the FSD pages with chunk_size=500 and chunk_overlap=50.
## NOTE(review): only extracted_data_fsd is chunked here; extracted_data_interfaceflow
## is not used in the visible cells - confirm whether it should be indexed too.
documents = chunk_data(docs=extracted_data_fsd, CHUNK_SIZE=500, CHUNK_OVERLAP=50)
len(documents)

107

### Step 4: Embeddings: Convert Chunks to Vectors and Store in Vector DB

In [96]:
## OpenAI embedding client; the repr below shows the default model 'text-embedding-ada-002'
embeddings = OpenAIEmbeddings(
    api_key=OPENAI_KEY,
)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000020090138550>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000020090C02940>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [97]:
## Get the vector size / dimension of the embedding by embedding a sample sentence.
## Note: despite its name, vector_size holds the embedding vector itself; its length is the dimension.
vector_size = embeddings.embed_query(text="This is a test sentence.")

len(vector_size)

1536

In [98]:
## Pinecone VectorDB initiation; the index must already exist with dimension = 1536.
## The API key and index name come from the environment (.env) - never hardcode
## credentials in a notebook, since they leak through version control and sharing.
pc = Pinecone(api_key=PINECONE_API_KEY)
## Fall back to the original index name when PINECONE_INDEX_NAME is not set.
index = pc.Index(PINECONE_INDEX or "supportassist")

In [99]:
## Embed the chunks and store them in the Pinecone index named in the environment
## (falls back to "supportassist" when PINECONE_INDEX_NAME is not set).
## NOTE(review): re-running this cell presumably inserts the same chunks again - confirm before re-running.
pinecone_index = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=PINECONE_INDEX or "supportassist",
)

### Step 5: RAG Implementation - Using an OpenAI model, take the user's query, retrieve similar vectors from the Vector DB, and pass the query plus the retrieved context to the model to generate the response

In [100]:
# Prompt template: a system instruction followed by the user's question.
# NOTE(review): this template is not passed to the chain in the visible cells.
prompt = ChatPromptTemplate.from_messages(
    messages=[
        ("system", "You are a helpful support assistant. Please respond to the IT application support executive queries"),
        ("user", "Question:{question}"),
    ]
)

In [101]:
## Retrieve the k most similar chunks from the Pinecone VectorDB
## (presumably cosine similarity - the metric is set on the index; confirm)
def retrieve_query(query,k=5):
    """Return the ``k`` results of ``pinecone_index.similarity_search`` for ``query``."""
    return pinecone_index.similarity_search(query, k=k)

In [102]:
## OpenAI chat model (gpt-3.5-turbo) and a "stuff" question-answering chain
llm = ChatOpenAI(
    openai_api_key=OPENAI_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0.5,
)
chain = load_qa_chain(llm=llm, chain_type="stuff")
## TODO: alternative retrieval-chain wiring, not enabled yet:
#question_answer_chain = create_stuff_documents_chain(llm, prompt)
#rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [105]:
## Search answers for the user query
def retrieve_answers(query):
    """Answer ``query`` from the documents retrieved for it.

    Fetches matching documents with ``retrieve_query`` and passes them, together
    with the question, to the QA ``chain``.

    Returns the chain's ``output_text`` value (the generated answer).
    """
    doc_search = retrieve_query(query)
    # Chain.run is deprecated in LangChain; invoke returns a dict whose
    # "output_text" entry is the same answer that run() used to return.
    result = chain.invoke({"input_documents": doc_search, "question": query})
    return result["output_text"]

In [106]:
## Sample query 1: interface trigger
our_query = "What is Engineering Datahub Interface Trigger?"
answer = retrieve_answers(query=our_query)
print(answer)

The Engineering Datahub Interface Trigger is the Change Notice Audit Task Completion.


In [107]:
## Sample query 2: data elements extracted from a Change Notice
our_query = "What are the data elements extracted from Change Notice?"
answer = retrieve_answers(query=our_query)
print(answer)

The data elements extracted from Change Notice are:

1. Part
2. Assembly
3. Part / Assembly Drawing
4. Engineering Specification Document


In [108]:
## Sample query 3: interface pre-validations
our_query = "What are the pre-validations of Engineering Datahub Interface?"
answer = retrieve_answers(query=our_query)
print(answer)

The pre-validations of the Engineering Datahub Interface include CA Validations and Engineering Datahub interface pre-validations.


### Step 6: Chat UI enablement through Streamlit