In [33]:
# Install the notebook's dependencies into the environment of the running kernel.
# `%pip` is used instead of `!pip` because the shell's `pip` may belong to a
# different Python installation than the one this kernel runs on.
# NOTE(review): versions are unpinned; pin them once a known-good set is established.
%pip install langchain_community
%pip install langchain_experimental
%pip install pypdf
%pip install cohere
%pip install chromadb
%pip install --upgrade langchain pydantic


Collecting pydantic
  Downloading pydantic-2.10.4-py3-none-any.whl.metadata (29 kB)
Collecting pydantic-core==2.27.2 (from pydantic)
  Downloading pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading pydantic-2.10.4-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.8/431.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydantic-core, pydantic
  Attempting uninstall: pydantic-core
    Found existing installation: pydantic_core 2.27.1
    Uninstalling pydantic_core-2.27.1:
      Successfully uninstalled pydantic_core-2.27.1
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.10.3
    Uninstalling pydantic-2.10.

In [43]:
from langchain.chains import RetrievalQA
from langchain.llms import Cohere
from langchain.vectorstores import Chroma
from langchain.embeddings import CohereEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [44]:
# Read the PCA walkthrough PDF (path is relative to the working directory)
PDF_PATH = "pca_explained_steps.pdf"
pdf_loader = PyPDFLoader(PDF_PATH)
documents = pdf_loader.load()

In [45]:
# Break the loaded documents into overlapping chunks before they are embedded
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
docs_split = text_splitter.split_documents(documents)

In [46]:
import os

# Read the Cohere API key from the environment so no credential lives in the notebook.
# Fail fast with a clear message instead of falling back to a placeholder string,
# which would only fail later, at the first call to the Cohere API.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "COHERE_API_KEY is not set. Export it in the environment before "
        "starting the kernel, then re-run this cell."
    )

# Cohere embedding model (used by the vector store) and completion model
embedding_function = CohereEmbeddings(
    model="embed-english-v2.0",
    cohere_api_key=cohere_api_key,
    user_agent="my-app",
)
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=cohere_api_key, temperature=0.0)

# Vector store persisted under ./chroma_db; the split chunks are added to it here.
# NOTE(review): because the store is persisted, re-running this cell presumably
# appends the same chunks again — confirm, and clear ./chroma_db or pass stable ids if so.
chroma_db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)
chroma_db.add_documents(docs_split)

['e13c09f0-097c-432e-a54e-de9f356046a3']

In [47]:
# Retriever that returns the 3 most similar chunks from the vector store
retriever = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Cohere LLM used by the QA chain. The API key is passed explicitly, matching the
# earlier construction, so this object does not silently depend on the
# COHERE_API_KEY environment variable being set.
llm = Cohere(model="command-xlarge-nightly", cohere_api_key=cohere_api_key, temperature=0.0)

In [52]:
# Wire the retriever and the LLM into a RetrievalQA chain ("stuff" chain type);
# it is queried below for the PCA steps described in the PDF
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=llm,
)

In [53]:
# Reference list of PCA steps that the retrieved answer is checked against
_PCA_STEPS = (
    "Standardize the data.",
    "Calculate the covariance matrix.",
    "Compute eigenvectors and eigenvalues.",
    "Sort eigenvectors by descending eigenvalues.",
    "Choose the top k eigenvectors as principal components.",
    "Transform the data using the selected components.",
)

# Numbered, newline-separated text with a leading and a trailing newline
expected_pca_steps = "\n" + "".join(
    f"{number}. {step}\n" for number, step in enumerate(_PCA_STEPS, start=1)
)

In [54]:
# Fields the verification answer must contain, as (name, description) pairs
_SCHEMA_FIELDS = (
    ("correct_steps", "List of correctly identified PCA steps."),
    ("incorrect_steps", "List of incorrect or missing PCA steps."),
    ("suggested_fixes", "Suggested corrections or additional steps needed."),
)

response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in _SCHEMA_FIELDS
]

# Parser built from the schemas; it also supplies the format instructions for the prompt
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [55]:
# Prompt asking the LLM to compare the retrieved steps with the expected ones.
# The parser's format instructions are bound as a partial variable so that only
# the two comparison inputs have to be supplied when formatting.
verification_prompt_template = """
You have been provided with the retrieved steps for PCA:
{retrieved_answer}

Compare these steps with the expected PCA steps:
{expected_steps}

Provide your assessment of the retrieved steps:
- Correct steps
- Incorrect or missing steps
- Suggested fixes

Format your output as follows:
{format_instructions}
"""

prompt = PromptTemplate(
    input_variables=["retrieved_answer", "expected_steps"],
    template=verification_prompt_template,
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

# Ask the retrieval chain for the PCA steps described in the PDF.
# `Chain.run` is deprecated; `invoke` takes the input under the chain's "query"
# key and returns a dict whose "result" entry holds the answer text.
query = "List the steps involved in Principal Component Analysis (PCA)."
retrieved_answer = qa_chain.invoke({"query": query})["result"]

# Fill the prompt with the retrieved answer and the reference steps
verification_input = prompt.format(retrieved_answer=retrieved_answer, expected_steps=expected_pca_steps)

In [56]:
# Send the verification prompt to the LLM. `invoke` replaces the deprecated
# direct call `llm(...)`, which emits a deprecation warning.
# Note: despite its name, `parsed_output` holds the raw LLM text; the name is kept
# because the fallback cell at the end of the notebook refers to it.
parsed_output = llm.invoke(verification_input)

# Parse the raw text into the fields declared in the response schemas
parsed_result = output_parser.parse(parsed_output)

# Display results
print("Correct Steps:", parsed_result["correct_steps"])
print("Incorrect Steps:", parsed_result["incorrect_steps"])
print("Suggested Fixes:", parsed_result["suggested_fixes"])

  parsed_output = llm(verification_input)


Correct Steps: 1. Standardize the data
2. Calculate the covariance matrix
Incorrect Steps: 3. Compute eigenvectors and eigenvalues (Missing). The retrieved step suggests choosing features instead, which is not a standard part of PCA.
4. Sort eigenvectors by descending eigenvalues (Incorrect). The retrieved step mentions sorting eigenvectors randomly, which is not the standard practice in PCA.
5. Choose the top k eigenvectors as principal components (Missing). Instead, the retrieved step mentions performing dimensionality reduction using principal components without specifying the selection of top components.
6. Transform the data using the selected components (Missing). The retrieved steps do not explicitly mention this transformation step.
Suggested Fixes: 3. Add the computation of eigenvectors and eigenvalues for the covariance matrix.
4. Correct the sorting step to sort eigenvectors based on descending eigenvalues, as this is the standard practice in PCA for identifying the most imp

In [None]:
# Fallback for when output_parser.parse() fails on the raw LLM text.
# The code below is disabled: it is wrapped in a string literal, so it does not run as-is.
# NOTE(review): OutputFixingParser is not imported in the visible cells; enabling this
# presumably needs `from langchain.output_parsers import OutputFixingParser` — confirm.
"""
try:
    print(output_parser.parse(parsed_output))
except Exception as e:
    print(f"Parsing failed: {e}")
    new_parser = OutputFixingParser.from_llm(parser=output_parser, llm=llm)
    print(new_parser.parse(parsed_output))
"""