In [1]:
import os
import sys
from dotenv import load_dotenv
from flask import Flask, request, render_template, jsonify, session
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

import pickle
import shutil
import json
import psutil

import helper_func.helper_func as helper_func


In [2]:
documents = []
# Iterate through files in the "docs" directory for processing
for file in os.listdir("docs"):
    # Load PDF files
    if file.endswith(".pdf"):
        pdf_path = "./docs/" + file
        loader = PyPDFLoader(pdf_path)
        documents.extend(loader.load())
    # Load Word documents
    elif file.endswith('.docx') or file.endswith('.doc'):
        doc_path = "./docs/" + file
        loader = Docx2txtLoader(doc_path)
        documents.extend(loader.load())
    # Load Txt documents
    # elif file.endswith('.txt'):
    #     text_path = "./docs/" + file
    #     loader = TextLoader(text_path)
    #     documents.extend(loader.load())


In [3]:
# Split text into chunks for processing
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
documents = text_splitter.split_documents(documents)

# Create and persist a vector database for document retrieval
vectordb = Chroma.from_documents(documents, embedding=OpenAIEmbeddings(), persist_directory="./data")
vectordb.persist()

# Set up conversational retrieval with OpenAI model
pdf_qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo'),
    retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False
)


In [48]:
user_question = "Is this person qualified for senior machine learning engineer?"

# Generate a prompt based on the user question
query = helper_func.set_prompt_template(user_question)
# shot += 1
# session['shot'] = shot
# else:
#     query = user_question

chat_history = []
# Invoke the conversational retrieval process
result = pdf_qa.invoke(
    {"question": query, "chat_history": chat_history})


Number of requested results 6 is greater than number of elements in index 4, updating n_results = 4


In [49]:
results = str(result['answer'])

In [51]:
print(type(results))

<class 'str'>


In [52]:
print(results)

1. Yes, this person is qualified for a senior machine learning engineer position based on their 5+ years of experience deploying machine learning models, conducting data analysis, and software development. They have expertise in cloud computing, data science, MLOps, LLMOps, and algorithm development. Additionally, their successful projects in developing real-time business intelligence dashboards, computer vision solutions, and machine learning models demonstrate their proficiency.

2. ['5+ years of experience in deploying ML models', 'Expertise in cloud computing and algorithm development', 'Successful projects in business intelligence and computer vision', 'Proficient in Python, SQL, and diverse ML frameworks']

3. Random question: What are the key skills and technologies mentioned in the person's profile that showcase their expertise in machine learning engineering?

4. test_answer: The key skills and technologies that showcase the person's expertise in machine learning engineering i

In [54]:
import ast

# Split the string by the numbers
sections = results.split('\n\n')

# Assign each section to a separate variable
answer = sections[0][3:].strip()  # Remove the numbering and leading/trailing whitespace
bullet_points = ast.literal_eval(sections[1][3:].strip())
test_question = sections[2][3:].strip()
test_answer = sections[3][12:].strip()  # Remove the "Test_answer: " prefix and leading/trailing whitespace

# Print the variables
print("Answer:", answer)
print("Bullet Points:", bullet_points)
print("Test Question:", test_question)
print("Test Answer:", test_answer)

Answer: Yes, this person is qualified for a senior machine learning engineer position based on their 5+ years of experience deploying machine learning models, conducting data analysis, and software development. They have expertise in cloud computing, data science, MLOps, LLMOps, and algorithm development. Additionally, their successful projects in developing real-time business intelligence dashboards, computer vision solutions, and machine learning models demonstrate their proficiency.
Bullet Points: ['5+ years of experience in deploying ML models', 'Expertise in cloud computing and algorithm development', 'Successful projects in business intelligence and computer vision', 'Proficient in Python, SQL, and diverse ML frameworks']
Test Question: Random question: What are the key skills and technologies mentioned in the person's profile that showcase their expertise in machine learning engineering?
Test Answer: er: The key skills and technologies that showcase the person's expertise in mac

In [55]:
type(bullet_points)

list

### Evaluate

In [79]:
# Retrieve the test answer from the session
user_answer = "FastAPI, Modelling, Deployment and Monitoring, CI/CD pipelines (GitHub Action), Docker Container, Appservice"

# Generate a prompt based on the user's answer and test answer
query = helper_func.set_eval_prompt_template(user_answer, test_answer)

chat_history = []
# Invoke the conversational retrieval process
result = pdf_qa.invoke(
    {"question": query, "chat_history": chat_history})


Number of requested results 6 is greater than number of elements in index 4, updating n_results = 4


In [80]:
result["answer"]

'True'

In [81]:
# Convert the answer from the result to a string
results = str(result["answer"])

In [82]:
print(results)

True


In [75]:
# Split the string by comma
values = results.split(", ")

# Convert knowledge_understood to bool
knowledge_understood = bool(values[0])

# Remove percentage sign and convert knowledge_confidence to int
knowledge_confidence = int(values[1].rstrip("%"))

print(knowledge_understood)  # Output: True
print(knowledge_confidence)  # Output: 90

True
90


In [78]:
type(knowledge_understood)

bool

In [83]:

try:
    # Split the string by comma
    if results.split(", "):
        values = results.split(", ")

        # Convert knowledge_understood to bool
        knowledge_understood = bool(values[0])

        # Remove percentage sign and convert knowledge_confidence to int
        knowledge_confidence = int(values[1].rstrip("%"))
    else:
        # Convert knowledge_understood to bool
        knowledge_understood = bool(results)

        knowledge_confidence = 0

except:
    # Assign default values
    knowledge_understood = False
    knowledge_confidence = 0

print(knowledge_understood)  # Output: True
print(knowledge_confidence)  # Output: 90


False
0
