In [1]:
# Import necessary modules
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, FewShotPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.utilities import SQLDatabase
from langchain_community.document_loaders import Docx2txtLoader, TextLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.chains import ConversationalRetrievalChain
from langchain.agents.agent_types import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader
from sqlalchemy import create_engine
import faiss
import numpy as np
import sqlite3
import os
import json
import re
import io

In [6]:
def initialize_models():
    """Create the Vertex AI language model and the embedding model.

    Returns:
        tuple: ``(llm, embeddings)`` where ``llm`` is a ``VertexAI`` instance
        configured for ``gemini-pro`` and ``embeddings`` is a
        ``VertexAIEmbeddings`` instance for ``textembedding-gecko@latest``.
    """
    # Generation settings for the text model, gathered in one place.
    generation_settings = {
        "model": "gemini-pro",
        "top_k": 5,
        "top_p": 0.9,
        "temperature": 0.7,
        "max_output_tokens": 2048,
    }
    text_model = VertexAI(**generation_settings)
    embedding_model = VertexAIEmbeddings(model_name="textembedding-gecko@latest")
    return text_model, embedding_model

def get_examples(file_name):
    """Load examples from a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized JSON content (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding to UTF-8 (the interchange encoding for JSON) instead of
    # relying on the platform's locale default, which differs between machines.
    with open(file_name, 'r', encoding='utf-8') as file:
        return json.load(file)

def format_docs(docs):
    """Render documents as one string, each followed by its source.

    Args:
        docs: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping that contains a ``'source'`` key.

    Returns:
        str: The rendered documents separated by blank lines; an empty
        string when ``docs`` is empty.
    """
    rendered = []
    for document in docs:
        origin = document.metadata['source']
        rendered.append(f"{document.page_content} (Source: {origin})")
    return "\n\n".join(rendered)

def contextualized_question(input):
    """Choose what should be fed to the retriever for the current turn.

    Args:
        input: Mapping with a ``"question"`` key and, optionally, a
            ``"chat_history"`` key. (The name shadows the builtin but is
            kept so existing callers are unaffected.)

    Returns:
        The module-level ``contextualize_q_chain`` when the chat history is
        non-empty, otherwise the raw question.
    """
    if input.get("chat_history"):
        # Fixed typo: this previously referenced `contextualize_q_chian`, which
        # raised NameError as soon as a non-empty chat history was supplied.
        # The name is resolved from module scope at call time; it is assigned
        # in a later cell from the result of define_prompts().
        return contextualize_q_chain
    return input['question']

def file_loader(folder_path):
    """Load every supported document found directly inside a folder.

    Supported extensions are ``.pdf``, ``.docx``/``.doc`` and ``.txt``,
    matched case-insensitively. Entries are visited in sorted name order so
    the resulting list is reproducible between runs (``os.listdir`` alone
    returns entries in arbitrary order). Other entries are ignored.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list: The concatenated results of each loader's ``load()`` call.
    """
    data = []
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        lowered = file_name.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(('.docx', '.doc')):
            # NOTE(review): legacy binary .doc files are routed to the same
            # loader as .docx - confirm that loader can actually parse them.
            loader = Docx2txtLoader(file_path)
        elif lowered.endswith('.txt'):
            loader = TextLoader(file_path)
        else:
            # Unsupported extension: skip the entry.
            continue
        data.extend(loader.load())
    return data

def split_documents(documents):
    """Break documents into overlapping chunks.

    Args:
        documents: Documents to pass to the splitter.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` with
        a chunk size of 2500 and an overlap of 350.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=2500,
        chunk_overlap=350,
    ).split_documents(documents)

# def connect_db(db_name):
#     # Connect to the SQLite database
#     conn = sqlite3.connect(db_name)
#     return conn.cursor()

# def fetch_data(cursor):
#     # Fetch all tables in the database
#     cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
#     tables = cursor.fetchall()

#     # Fetch the data from each table
#     data = {}
#     for table in tables:
#         table_name = table[0]
#         cursor.execute(f"SELECT * FROM {table_name}")
#         data[table_name] = cursor.fetchall()
    
#     return data

def create_vectorstore(splits, embeddings):
    """Build a FAISS vector store from document chunks.

    Args:
        splits: Document chunks to index.
        embeddings: Embedding model handed to FAISS as ``embedding``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(embedding=embeddings, documents=splits)

# def create_vectorstore(documents, embeddings):
#     # Convert each document into a vector
#     print([doc for doc in documents])
#     document_vectors = [embeddings.embed(doc) for doc in documents]
#     # Create a vector store
#     vs=FAISS.from_vectors(documents=document_vectors, embedding=embeddings)
#     return vs



def create_retriever(vectorstore):
    """Return the retriever view of a vector store.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        Whatever ``vectorstore.as_retriever()`` returns (called with no
        arguments, so the store's defaults apply).
    """
    return vectorstore.as_retriever()

def define_prompts():
    """Build the question-rewriting chain and the answering prompt.

    Reads the module-level ``llm``, so that global must be assigned before
    this function is called.

    Returns:
        tuple: ``(contextualize_q_chain, qa_prompt)`` - the chain
        ``prompt | llm | StrOutputParser()`` that rewrites a question using
        the chat history, and the chat prompt used for answering.
    """
    # Pieces shared by both prompts.
    history_slot = "chat_history"
    user_turn = ("human", "{question}")

    # NOTE(review): "refromulate" is a typo in the prompt text; it is kept
    # verbatim here so the text sent to the model is unchanged.
    rewrite_instructions = "Given a chat history and the latest user question which might reference content in the chat history, formulate a standalone question which can be understood without the chat history. Do Not answer the question, just refromulate it if needed and otherwise return it as is."
    rewrite_messages = [
        ("system", rewrite_instructions),
        MessagesPlaceholder(variable_name=history_slot),
        user_turn,
    ]
    rewrite_chain = ChatPromptTemplate.from_messages(rewrite_messages) | llm | StrOutputParser()

    answer_instructions = "You are an assistant for question answering tasks. Use only the following pieces of retrieved context to answer the question. If the question is not related to the context then don't answer, just say that you are not sure about that. If you don't know the answer, just say that you are not sure about that in 1 or 2 lines and strictly dont exceed more than that. Question: {question} Context: {context} Answer:"
    answer_messages = [
        ("system", answer_instructions),
        MessagesPlaceholder(variable_name=history_slot),
        user_turn,
    ]
    answer_prompt = ChatPromptTemplate.from_messages(answer_messages)

    return rewrite_chain, answer_prompt

def define_chain(contextualize_q_chain, qa_prompt, retriever):
    """Assemble the retrieval-augmented generation chain.

    Reads the module-level ``llm`` and ``format_docs``.

    Args:
        contextualize_q_chain: Chain used to turn the latest question into a
            standalone one when there is chat history.
        qa_prompt: Prompt that is fed the input plus the retrieved
            ``context``.
        retriever: Retriever placed after the question-routing step.

    Returns:
        The composed chain: ``context`` assignment, then ``qa_prompt``, then
        ``llm``.
    """
    def _question_for_retrieval(input):
        """Return the rewriting chain when there is chat history, else the raw question."""
        # Uses the chain passed to define_chain; previously that argument was
        # ignored and routing depended on a global helper reading a global name.
        if input.get("chat_history"):
            return contextualize_q_chain
        return input['question']

    # As in the original expression, a plain function is the left operand of
    # `|` and the retriever is the right one.
    context_pipeline = _question_for_retrieval | retriever | format_docs
    rag_chain = RunnablePassthrough.assign(context=context_pipeline) | qa_prompt | llm
    return rag_chain

In [None]:
# print(type(documents))
# Assuming `raw_documents` is your original list of documents
# documents = [Document(page_content=doc) for doc in raw_documents]
# print((f)[0])
# print([doc for doc in documents])

# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# tables = cursor.fetchall()
# print(tables)

In [None]:
# data = {}
# for table in tables:
#     print(table[0])
#     table_name = table[0]
#     cursor.execute(f"SELECT * FROM {table_name}")
#     data[table_name] = cursor.fetchall()
#     print(data[table_name])
#     break

In [7]:
# Load the source documents from a local folder.
# NOTE(review): absolute, machine-specific path - consider moving it to a config cell.
folder_path = "/home/jupyter/vector_store/sharepoint_docs/Tech2Go"
documents = file_loader(folder_path)
# Split the loaded documents into chunks (see split_documents)
splits = split_documents(documents)

# Initialize models. `llm` must exist as a global before define_prompts() and
# define_chain() are called below, because both read it from module scope.
llm, embeddings = initialize_models()

# (disabled experiment) Read documents from an existing Chroma SQLite file instead
# cursor = connect_db('/home/jupyter/vector_store/Vectordb/chroma.sqlite3')
# documents = fetch_data(cursor)
# f=[item for sublist in documents.values() for item in sublist]


# Create the FAISS vectorstore from the chunks (see create_vectorstore)
vectorstore = create_vectorstore(splits, embeddings)

# (disabled experiment) Alternative ways of building/loading the vectorstore
# vectorstore = Chroma.from_vectors(document_vectors)
# chroma_store_path= '/home/jupyter/vector_store/Vectordb/'
# vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings,persist_directory=chroma_store_path)
# vectorstore = create_vectorstore(f, embeddings)


# Create retriever
retriever = create_retriever(vectorstore)

# Define prompts; this also assigns the global `contextualize_q_chain`
# that contextualized_question() looks up when there is chat history.
contextualize_q_chain, qa_prompt = define_prompts()

# Define the RAG chain from the pieces built above
rag_chain = define_chain(contextualize_q_chain, qa_prompt, retriever)





In [8]:
# Invoke the chain with a sample question and an empty chat history
# (with no history, the raw question is what gets routed to the retriever).
rag_chain.invoke({"chat_history": [], "question": "list me the features of tech2go?"})

"## Tech2go Features:\n\n* **Main Menu:** Provides access to various tools like My Profile, My Schedule, Notes, Feedback, Notifications, App Links, TechCLAD, Vehicle Update, Help, About, and Bluetooth.\n* **My Profile:** Technician's user ID, name, platform, email, location, and the ability to update their profile.\n* **My Schedule:** Technician's schedule for work and PTO.\n* **Notes:** Technicians can add personal notes for reminders.\n* **Feedback:** Option for technicians to leave suggestions and appreciations.\n* **Notifications:** Daily Hot Reads and alert notifications.\n* **App Links:** Links to saved apps.\n* **TechCLAD:** Technician Continuous Learning and Development tool.\n* **Vehicle Update:** Vehicle information and ID.\n* **Help:** Where to obtain help using Teams or Zoom.\n* **About:** Provides the latest version of Tech2go.\n* **Bluetooth:** Information about paired wireless devices.\n* **Tech Training:** Training information, # of courses, training monthly, and Hot Re