In [2]:
import os
import mammoth
import numpy as np
# from docx import Document as DocxDocument
from langchain.docstore.document import Document
from langchain.docstore.base import Docstore
from openpyxl import load_workbook
# import PyPDF2
# import fitz
# import xlrd
import subprocess
from langchain.embeddings.vertexai import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import faiss
import pickle
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [3]:
def initialize_models():
    """Create the Vertex AI text-generation and embedding clients.

    Returns:
        tuple: ``(llm, embeddings)`` -- a ``VertexAI`` LLM configured for
        ``gemini-pro`` and a ``VertexAIEmbeddings`` model using
        ``textembedding-gecko@latest``.
    """
    # Sampling / length settings for the generation model, kept in one place.
    generation_settings = {
        "model": "gemini-pro",
        "top_k": 5,
        "top_p": 0.9,
        "temperature": 0.7,
        "max_output_tokens": 2048,
    }
    text_llm = VertexAI(**generation_settings)
    embedder = VertexAIEmbeddings(model_name="textembedding-gecko@latest")
    return text_llm, embedder

In [4]:
# Instantiate the LLM and the embedding model once; later cells read both as
# module-level globals (`llm` in define_prompts/define_chain, `embeddings` when
# embedding documents and building the FAISS store).
llm, embeddings = initialize_models()

  warn_deprecated(
  warn_deprecated(


In [8]:
def create_and_save_faiss_index(documents, embeddings_model, combined_path='all_embeddings_and_texts.pkl', batch_size=30):
    """Embed documents in batches and pickle the texts together with their vectors.

    Despite its name, this function does not build a FAISS index: it only
    collects each document's ``page_content`` and the corresponding embedding
    and writes both lists to ``combined_path``.

    Args:
        documents: Sliceable sequence of objects exposing ``page_content``.
        embeddings_model: Object with an ``embed_documents(texts)`` method that
            returns one embedding per input text.
        combined_path: Destination pickle file.
        batch_size: Number of documents sent to ``embed_documents`` per call.

    Returns:
        None. The pickle holds a dict with keys ``'all_embeddings'`` and
        ``'all_texts'``, both in document order.
    """
    collected_texts, collected_vectors = [], []

    for offset in range(0, len(documents), batch_size):
        chunk_texts = [item.page_content for item in documents[offset:offset + batch_size]]
        chunk_vectors = embeddings_model.embed_documents(chunk_texts)

        # Keep texts and vectors aligned by appending them in lockstep.
        collected_texts += chunk_texts
        collected_vectors += chunk_vectors

    payload = {'all_embeddings': collected_vectors, 'all_texts': collected_texts}
    with open(combined_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [5]:
# Load the previously prepared document objects from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles that come from a trusted source.
doc_path='documents_obj.pkl'
with open(doc_path,'rb') as f:
    documents=pickle.load(f)

In [6]:
# Sanity check: how many documents were loaded.
len(documents)

68662

In [9]:
# Embed every document (default batches of 30) and write texts + vectors to the
# default output file 'all_embeddings_and_texts.pkl'.
create_and_save_faiss_index(documents, embeddings)

In [5]:
# Reload the texts and embeddings written by create_and_save_faiss_index.
# NOTE(review): the execution counts restart at this cell, so this part was
# presumably run in a separate kernel session -- confirm before "Run All".
# NOTE(review): `all` shadows the Python builtin for the rest of the notebook,
# and `doc_path` is reused for a different file than in the earlier cell.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
doc_path='all_embeddings_and_texts.pkl'
with open(doc_path,'rb') as f:
    all=pickle.load(f)

In [6]:
# Embedding vectors, stored under the 'all_embeddings' key of the pickle.
emb=all['all_embeddings']

In [7]:
# Document texts, stored under the 'all_texts' key of the pickle.
txt=all['all_texts']

In [8]:
# Pair each text with its embedding; this list of (text, embedding) tuples is
# what gets passed to FAISS.from_embeddings in a later cell.
rey=list(zip(txt,emb))

In [9]:
# Sanity check: number of (text, embedding) pairs.
len(rey)

68662

In [10]:
# Build the FAISS vector store from the precomputed (text, embedding) pairs.
# The embeddings model is passed as well -- presumably so the store can embed
# queries at search time; confirm against the FAISS.from_embeddings docs.
faiss_index = FAISS.from_embeddings(rey,embeddings)

In [11]:
# Persist the vector store locally under the path 'sharepoint'.
faiss_index.save_local('sharepoint')

In [12]:
def create_retriever(vectorstore):
    """Return the default retriever of ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        Whatever ``vectorstore.as_retriever()`` returns.
    """
    store_retriever = vectorstore.as_retriever()
    # Progress marker printed once the retriever has been created.
    print('create_retriever')
    return store_retriever

def define_prompts():
    """Build the question-rewriting chain and the answer prompt.

    Relies on the module-level ``llm`` to run the rewriting step.

    Returns:
        tuple: ``(contextualize_q_chain, qa_prompt)``. The first item pipes the
        rewriting prompt through ``llm`` and ``StrOutputParser``; the second is
        the chat prompt used to answer from the retrieved context.
    """
    # System instruction: turn a history-dependent question into a standalone one.
    rewrite_instruction = (
        "Given a chat history and the latest user question which might reference "
        "content in the chat history, formulate a standalone question which can be "
        "understood without the chat history. Do Not answer the question, just "
        "refromulate it if needed and otherwise return it as is."
    )
    rewrite_prompt = ChatPromptTemplate.from_messages([
        ("system", rewrite_instruction),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ])
    print('define_prompts')
    rewrite_chain = rewrite_prompt | llm | StrOutputParser()

    # System instruction: answer strictly from the retrieved context.
    answer_instruction = (
        "You are an assistant for question answering tasks. "
        "Use only the following pieces of retrieved context to answer the question. "
        "If the question is not related to the context then don't answer, "
        "just say that you are not sure about that. "
        "If you don't know the answer, just say that you are not sure about that "
        "in 1 or 2 lines and strictly dont exceed more than that. "
        "Question: {question} Context: {context} Answer:"
    )
    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", answer_instruction),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ])
    return rewrite_chain, answer_prompt

def define_chain(contextualize_q_chain, qa_prompt, retriever):
    """Assemble the retrieval-augmented question-answering chain.

    The returned chain takes a dict with ``question`` and ``chat_history``,
    adds a ``context`` entry (documents retrieved for the question, rendered by
    ``format_docs``), then passes everything through ``qa_prompt`` and the
    module-level ``llm``.

    The question-routing step is a closure over the ``contextualize_q_chain``
    argument. Previously that argument was ignored and the chain depended on a
    module-level global of the same name, so passing a different chain had no
    effect and the function failed if the global did not exist.

    Args:
        contextualize_q_chain: Runnable that rewrites a follow-up question into
            a standalone one; used only when ``chat_history`` is non-empty.
        qa_prompt: Prompt that receives the input dict plus ``context``.
        retriever: Runnable that receives the (possibly rewritten) question.

    Returns:
        The composed runnable chain.
    """
    def _standalone_question(inputs):
        # With history, hand back the rewriting chain (LangChain runs a runnable
        # returned from a function step on the same inputs); without history the
        # question is already standalone and is used verbatim.
        if inputs.get("chat_history"):
            return contextualize_q_chain
        return inputs['question']

    rag_chain = (
        RunnablePassthrough.assign(context=_standalone_question | retriever | format_docs)
        | qa_prompt
        | llm
    )
    print('define_chain')
    return rag_chain

def contextualized_question(input):
    """Choose how the question for retrieval is obtained.

    Args:
        input: Mapping with a ``question`` entry and an optional
            ``chat_history`` entry. (The name shadows the builtin but is kept
            as the existing parameter name.)

    Returns:
        The module-level ``contextualize_q_chain`` when ``chat_history`` is
        present and non-empty; otherwise the raw ``question`` value.
    """
    has_history = bool(input.get("chat_history"))
    return contextualize_q_chain if has_history else input['question']
    
def format_docs(docs):
    """Render retrieved documents as one context string.

    Each document contributes its ``page_content``; when its ``metadata`` has a
    ``'source'`` key, `` (Source: <source>)`` is appended. Entries are
    separated by a blank line.

    Args:
        docs: Iterable of objects with ``page_content`` and ``metadata``.

    Returns:
        str: The joined text (empty string for no documents).
    """
    rendered = []
    for doc in docs:
        if 'source' in doc.metadata:
            rendered.append(f"{doc.page_content} (Source: {doc.metadata['source']})")
        else:
            rendered.append(f"{doc.page_content}")
    return "\n\n".join(rendered)

In [13]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, FewShotPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel

In [14]:
# Create retriever
# Wrap the FAISS vector store in a retriever (default as_retriever() settings).
retriever = create_retriever(faiss_index)

# Build the question-rewriting chain and the answer prompt (uses the global llm).
contextualize_q_chain, qa_prompt = define_prompts()

# Compose retrieval, prompt and LLM into the final RAG chain.
rag_chain = define_chain(contextualize_q_chain, qa_prompt, retriever)

create_retriever
define_prompts
define_chain


In [15]:
# Smoke test with an empty chat history: the question is used for retrieval
# as-is, without the rewriting step.
print(rag_chain.invoke({"chat_history":[],"question":"Tech2go is mobile application?"}))

 Assistant: Yes, Tech2go is a mobile application available for Android devices. It can be installed from the Play Store.


In [19]:
# Second smoke test, again with no chat history.
print(rag_chain.invoke({"chat_history":[],"question":"features of tech2go?"}))

 The provided text focuses on instructions and guidelines for using the Tech2go mobile application. It covers various aspects such as installation, settings, job management, and troubleshooting, but it doesn't specifically mention the features of Tech2go. Therefore, I cannot answer the question based on the given context.
