In [1]:
# Import necessary libraries
import os
from google.colab import drive

In [None]:
# Install necessary packages
!pip install openai
!pip install langchain
!pip install pymupdf
!pip install chromadb


In [3]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.12 (from langchain-community)
  Downloading langchain-0.3.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain-community)
  Downloading langchain_core-0.3.25-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [4]:
!pip install langchain-chroma

Collecting langchain-chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Downloading langchain_chroma-0.1.4-py3-none-any.whl (10 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-0.1.4


In [5]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 24.4 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
# Make Google Drive available inside the Colab runtime
drive.mount('/content/drive')

# Annual-report PDFs keyed by report year.
# NOTE(review): these paths are relative to the current working directory, not
# to the Drive mount point above — confirm where the PDFs actually live.
files = dict([
    ("2018", "./Walmart_Form_10-K_01-31-2018.pdf"),
    ("2023", "./walmart-inc-2023-annual-report.pdf"),
])


In [None]:
# Import necessary modules after installation
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chat_models import ChatOpenAI

In [None]:
# Set OpenAI API Key
import openai
from getpass import getpass

# Never hard-code the key in the notebook. Reuse OPENAI_API_KEY when the
# environment already provides it; otherwise prompt for it without echoing so
# the model and embedding clients created in the next cell can authenticate.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Chat model used for the section summaries and for the final comparison
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
)

# Embedding model handed to the vector store when it is built
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

In [None]:
# Text splitter configuration
CHUNK_SIZE = 2000     # kept small so each piece is a modest model input
CHUNK_OVERLAP = 200   # amount shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [None]:
# Function to extract relevant sections
def _normalize_for_match(text):
    """Return ``text`` in a canonical form that is used only for keyword matching.

    Typographic quotes are mapped to their ASCII forms, the text is lower-cased
    and every run of whitespace (including line breaks) becomes one space.
    """
    for fancy, plain in (("\u2018", "'"), ("\u2019", "'"), ("\u201c", '"'), ("\u201d", '"')):
        text = text.replace(fancy, plain)
    return " ".join(text.lower().split())


def extract_sections(file_path, section_keywords):
    """Collect the text of every PDF page that mentions one of the keywords.

    Args:
        file_path: Path of the PDF, passed to ``PyMuPDFLoader``.
        section_keywords: Iterable of phrases to look for. A single string is
            treated as one phrase rather than as a sequence of characters.

    Returns:
        The original ``page_content`` of each matching page, in page order,
        joined with newlines. An empty string when no page matches.

    Matching is case-insensitive and also ignores differences between straight
    and typographic quotes and between spaces and line breaks, because text
    extracted from PDFs frequently contains "Management’s" or wraps a heading
    over two lines.
    """
    if isinstance(section_keywords, str):
        section_keywords = [section_keywords]
    # Normalise the keywords once instead of once per page.
    needles = [_normalize_for_match(keyword) for keyword in section_keywords]

    relevant_texts = []
    for page in PyMuPDFLoader(file_path).load():
        haystack = _normalize_for_match(page.page_content)
        if any(needle in haystack for needle in needles):
            # Keep the untouched page text; normalisation is for matching only.
            relevant_texts.append(page.page_content)
    return "\n".join(relevant_texts)


In [None]:
# Extract MD&A and Risk Factors sections
# For every report, gather the pages that mention each section heading
sections = {}
for year, path in files.items():
    mdna_text = extract_sections(path, ["Management's Discussion and Analysis"])
    risk_text = extract_sections(path, ["Risk Factors"])
    sections[year] = {"MD&A": mdna_text, "Risk Factors": risk_text}

In [None]:
# Summarize sections to reduce size
def _batch_chunks(chunks, max_chars):
    """Pack consecutive chunks into newline-joined batches of at most ``max_chars``."""
    batches, current, size = [], [], 0
    for chunk in chunks:
        # +1 accounts for the newline that joins a chunk to its predecessor.
        if current and size + len(chunk) + 1 > max_chars:
            batches.append("\n".join(current))
            current, size = [], 0
        current.append(chunk)
        size += len(chunk) + 1
    if current:
        batches.append("\n".join(current))
    return batches


def summarize_text(text, section_name, year, max_chars=12000):
    """Summarize one report section into a single summary string.

    Args:
        text: Raw section text (may span many pages).
        section_name: Section label inserted into the prompt, e.g. "MD&A".
        year: Report year inserted into the prompt.
        max_chars: Largest amount of text sent to the model in one request.
            The default is meant to stay well inside the chat model's context
            window (roughly 3k tokens at ~4 characters per token).

    Returns:
        The model's summary. When ``text`` is empty or whitespace, a short
        placeholder sentence is returned and the model is not called, so a
        missing section cannot turn into an invented summary.

    Text longer than ``max_chars`` is split with the notebook's
    ``text_splitter``, summarized batch by batch, and the partial summaries are
    summarized again until they fit into one request.
    """
    if not text or not text.strip():
        return f"No {section_name} content was found in the {year} annual report."

    def _summarize(body):
        prompt = f"Summarize the following {section_name} section from the {year} annual report:\n\n{body}"
        reply = llm.invoke(prompt)
        # Chat models return a message object; plain LLMs return a string.
        return getattr(reply, "content", reply)

    remaining = text
    while len(remaining) > max_chars:
        batches = _batch_chunks(text_splitter.split_text(remaining), max_chars)
        condensed = "\n\n".join(_summarize(batch) for batch in batches)
        if len(condensed) >= len(remaining):
            # The summaries did not shrink the text; stop instead of looping forever.
            break
        remaining = condensed
    return _summarize(remaining)

In [None]:
# Summarize MD&A and Risk Factors
# Condense each extracted section, keeping the same year -> section layout
summarized_sections = {}
for year in sections:
    raw_by_section = sections[year]
    summarized_sections[year] = {
        section_name: summarize_text(raw_by_section[section_name], section_name, year)
        for section_name in ("MD&A", "Risk Factors")
    }

In [None]:
# Create vector store
# One Document per (year, section) summary, tagged with both in its metadata
documents = []
for year, summaries in summarized_sections.items():
    documents.extend(
        Document(page_content=summaries[section_type], metadata={"year": year, "type": section_type})
        for section_type in ("MD&A", "Risk Factors")
    )

# Embed the summaries into a Chroma collection.
# NOTE(review): none of the cells visible here query `vectorstore` afterwards.
vectorstore = Chroma.from_documents(embedding=embeddings, documents=documents)


In [None]:
# Define comparison template
# Prompt text with four placeholders: {year_1}/{doc_1} describe the first
# section and {year_2}/{doc_2} the second; the model is asked to contrast them.
COMPARISON_TEMPLATE = """
You are an assistant tasked with comparing two documents. Below are two sections extracted from annual reports of a Fortune 500 company from different years.

Section 1 (Year {year_1}):
{doc_1}

Section 2 (Year {year_2}):
{doc_2}

Please provide a detailed comparison of the two sections. Highlight:
1. Changes in business outlook and opportunities.
2. Changes in the risks identified by the firm.
3. Overall evolution of the firm’s priorities and strategies.
"""

# Wrap the raw text as a chat prompt so it can be piped into the model
comparison_prompt = ChatPromptTemplate.from_template(COMPARISON_TEMPLATE)

In [None]:
# QA chain setup
def create_qa_chain():
    """Build the comparison chain: prompt -> chat model -> plain string.

    Returns:
        A runnable whose ``invoke`` takes a dict with the keys ``doc_1``,
        ``doc_2``, ``year_1`` and ``year_2`` and returns the model's answer as
        a string.

    The input dict is fed to the prompt directly. Mapping each key to a bare
    ``RunnablePassthrough()`` in front of the prompt would hand every
    placeholder the *entire* input dict rather than its own value, so the
    rendered prompt would contain the dict's repr four times.
    """
    return comparison_prompt | llm | StrOutputParser()

# Build the chain once; the comparison loop below reuses it for every section
qa_chain = create_qa_chain()


In [None]:
# Compare the 2018 and 2023 summaries, one section type at a time
YEAR_1, YEAR_2 = "2018", "2023"
for section in ("MD&A", "Risk Factors"):
    question = dict(
        doc_1=summarized_sections[YEAR_1][section],
        doc_2=summarized_sections[YEAR_2][section],
        year_1=YEAR_1,
        year_2=YEAR_2,
    )
    print(f"Comparison for {section} section:\n")
    comparison = qa_chain.invoke(question)
    print(comparison)