In [25]:
import os
import sys
from dotenv import load_dotenv
from flask import Flask, request, render_template, jsonify, session
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

import pickle
import shutil
import json
import psutil

import helper_func.helper_func as helper_func

# Load environment variables from a .env file
load_dotenv('.env')


True

In [26]:
documents = []
# Iterate through files in the "docs" directory for processing
for file in os.listdir("docs"):
    # Load PDF files
    if file.endswith(".pdf"):
        pdf_path = "./docs/" + file
        loader = PyPDFLoader(pdf_path)
        documents.extend(loader.load())
    # Load Word documents
    elif file.endswith('.docx') or file.endswith('.doc'):
        doc_path = "./docs/" + file
        loader = Docx2txtLoader(doc_path)
        documents.extend(loader.load())
    # Load Txt documents
    # elif file.endswith('.txt'):
    #     text_path = "./docs/" + file
    #     loader = TextLoader(text_path)
    #     documents.extend(loader.load())


In [27]:
# Split text into chunks for processing
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
documents = text_splitter.split_documents(documents)

# Create and persist a vector database for document retrieval
vectordb = Chroma.from_documents(documents, embedding=OpenAIEmbeddings(), persist_directory="./data")
vectordb.persist()

# Set up conversational retrieval with OpenAI model
pdf_qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo'),
    retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False
)


In [28]:
user_question = "Is this person qualified for senior machine learning engineer?"

# Generate a prompt based on the user question
query = helper_func.set_prompt_template(user_question)
# shot += 1
# session['shot'] = shot
# else:
#     query = user_question

chat_history = []
# Invoke the conversational retrieval process
result = pdf_qa.invoke(
    {"question": query, "chat_history": chat_history})


In [29]:
results = str(result['answer'])

In [30]:
print(type(results))

<class 'str'>


In [31]:
print(results)

I don't have enough information to generate answers for your specific questions.


In [32]:
import re

In [None]:
import ast
import re

# Split the string by the numbers
# sections = results.split('\n\n')

sections = re.split(r'\b(?:1\.|2\.|3\.|4\.)\s*', results)

# Remove empty parts
sections = [part.strip() for part in sections if part.strip()]

# Assign each section to a separate variable
answer = sections[0].strip()  # Remove the numbering and leading/trailing whitespace
bullet_points = ast.literal_eval(sections[1].strip())
test_question = sections[2].strip()
test_answer = sections[3].strip()  # Remove the "Test_answer: " prefix and leading/trailing whitespace

# Print the variables
print("Answer:", answer)
print("Bullet Points:", bullet_points)
print("Test Question:", test_question)
print("Test Answer:", test_answer)

# Print the parts
print(sections)

In [29]:
type(sections)

list

### Evaluate

In [None]:
# Retrieve the test answer from the session
user_answer = "built chekintel, lead ml engineer at Chekkit, 5+ working experience"

# Generate a prompt based on the user's answer and test answer
query = helper_func.set_eval_prompt_template(user_answer, test_answer)

chat_history = []
# Invoke the conversational retrieval process
result = pdf_qa.invoke(
    {"question": query, "chat_history": chat_history})


In [33]:
result["answer"]

'1. False\n2. 85%'

In [34]:
# Convert the answer from the result to a string
results = str(result["answer"])

In [35]:
print(results)

1. False
2. 85%


In [36]:
sections = re.split(r'\b(?:1\.|2\.|3\.|4\.)\s*', results)

# Remove empty parts
sections = [part.strip() for part in sections if part.strip()]

# Assign each section to a separate variable
knowledge_understood = bool(sections[0].strip())  # Remove the numbering and leading/trailing whitespace
knowledge_confidence = sections[1].strip()  # prefix and leading/trailing whitespace

In [37]:
# # Split the string by comma
# values = results.split(", ")

# # Convert knowledge_understood to bool
# knowledge_understood = bool(values[0])

# # Remove percentage sign and convert knowledge_confidence to int
# knowledge_confidence = int(values[1].rstrip("%"))

print(knowledge_understood)  # Output: True
print(knowledge_confidence)  # Output: 90

True
85%


In [78]:
type(knowledge_understood)

bool

In [85]:
bool(results)

True

In [89]:
values = results.split(", ")

In [90]:
values

['True']

In [88]:
results.split(", ")

['True']

In [91]:

try:
    # Split the string by comma
    if (results.split(", ")) == 2:
        values = results.split(", ")

        # Convert knowledge_understood to bool
        knowledge_understood = bool(values[0])

        # Remove percentage sign and convert knowledge_confidence to int
        knowledge_confidence = int(values[1].rstrip("%"))
    else:
        # Convert knowledge_understood to bool
        knowledge_understood = bool(results)

        knowledge_confidence = 0

except:
    # Assign default values
    knowledge_understood = False
    knowledge_confidence = 0

print(knowledge_understood)  # Output: True
print(knowledge_confidence)  # Output: 90


True
0


## Multiple Query

In [4]:
# Build a sample vectorDB
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load blog post
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

### Single use

In [12]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

question = "What are the approaches to Task Decomposition?"
llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)

ImportError: cannot import name 'convert_to_messages' from 'langchain_core.messages' (c:\Users\Arinz\anaconda3\envs\nurovant\lib\site-packages\langchain_core\messages\__init__.py)

In [13]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)
llm = ChatOpenAI(temperature=0)

# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

# Other inputs
question = "What are the approaches to Task Decomposition?"

ImportError: cannot import name 'convert_to_messages' from 'langchain_core.messages' (c:\Users\Arinz\anaconda3\envs\nurovant\lib\site-packages\langchain_core\messages\__init__.py)

In [1]:
from flask_sqlalchemy import SQLAlchemy