In [11]:
# Skip this cell unless you need to (re)generate the text files from the source PDFs.

import os
import pdfplumber

def extract_text_and_tables_from_pdf(pdf_path):
    """Extract the text and all tables from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        A ``(text, tables)`` tuple. ``text`` is the text of every page that
        yielded any, joined with a newline so the last line of one page does
        not run into the first line of the next. ``tables`` is a flat list of
        the tables returned by ``page.extract_tables()``, in page order.
    """
    page_texts = []
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without extractable text (e.g. a scanned image) may give
            # None or "" here; skip it instead of failing on `str + None`.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
            tables.extend(page.extract_tables() or [])
    return "\n".join(page_texts), tables

def save_text_and_tables_to_file(text, tables, output_file):
    """Write extracted text, followed by any tables, to a UTF-8 text file.

    The file starts with ``text``. If ``tables`` is non-empty, a ``Tables:``
    section follows in which every row is written as one tab-separated line
    and every table is terminated by a blank line.

    Args:
        text: Text to write at the top of the file.
        tables: Iterable of tables; each table is an iterable of rows and each
            row an iterable of cell values.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(text)
        if tables:
            f.write("\n\nTables:\n")
            for table in tables:
                for row in table:
                    # Missing cells can arrive as None; write them as empty
                    # fields rather than the literal string "None".
                    cells = ("" if cell is None else str(cell) for cell in row)
                    f.write("\t".join(cells) + "\n")
                f.write("\n")

def convert_pdfs_to_text_files(input_dir, output_dir):
    """Convert every PDF directly inside ``input_dir`` to a text file in ``output_dir``.

    Each ``<name>.pdf`` becomes ``<name>.txt`` containing the extracted text
    and tables. ``output_dir`` is created if it does not exist. Files are
    processed in sorted name order and one progress line is printed per file.

    Args:
        input_dir: Directory that is scanned (non-recursively) for PDF files.
        output_dir: Directory that receives the generated ``.txt`` files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for a reproducible run.
    for filename in sorted(os.listdir(input_dir)):
        # Case-insensitive match so files such as "REPORT.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(input_dir, filename)
        text, tables = extract_text_and_tables_from_pdf(pdf_path)
        output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + '.txt')
        save_text_and_tables_to_file(text, tables, output_file)
        print(f"Converted {filename} to {output_file}")

# Directory containing the source PDFs.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
input_dir = "/home/dolphin/llm_projects/prac/10k_heinz"
# Directory that receives one .txt file per PDF (created if it does not exist).
output_dir = "/home/dolphin/llm_projects/prac/test_folder_out"

# Runs the conversion as soon as this cell executes.
convert_pdfs_to_text_files(input_dir, output_dir)

Converted heinz_2016.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2016.txt
Converted heinz_2015.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2015.txt
Converted heinz_2021.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2021.txt
Converted heinz_2017.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2017.txt
Converted heinz_2019.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2019.txt
Converted heinz_2023.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2023.txt
Converted heinz_2022.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2022.txt
Converted heinz_2018.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2018.txt
Converted heinz_2020.pdf to /home/dolphin/llm_projects/prac/test_folder_out/heinz_2020.txt


In [29]:
import os

import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
from dotenv import load_dotenv

In [None]:
%pip install graphrag
%python -m graphrag.index --init --root ./ragtest
# then drag data folder into output

In [30]:
"""api_key = os.environ["GRAPHRAG_API_KEY"]"""
# Load variables from a local .env file (if one exists) so the API key never has
# to be written into the notebook.
load_dotenv()

# Read the key from the environment. The fallback keeps the previous placeholder
# value when GRAPHRAG_API_KEY is unset.
# NOTE(review): a literal `sk-...` key used to sit in a comment here; if it was
# ever valid it should be revoked, since it remains in the file's history.
api_key = os.environ.get("GRAPHRAG_API_KEY", "api_key")
# A non-OpenAI model can be used if its endpoint exposes an OpenAI-compatible API.
#api_key = "ollama"
#llm_model = "gemma2:9b-instruct-q8_0"
llm_model = "gpt-3.5-turbo"

# Chat client used by the search engine for its LLM calls.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Tokenizer handed to the context builder and search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

In [75]:
# Parquet tables produced by the GraphRAG indexing pipeline.
# NOTE(review): absolute path tied to one specific indexing run - update it to your own output folder.
INPUT_DIR = "/prac/ragtest/output/20240712-021432/artifacts" # point to the folder containing artifacts
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"

# Community level in the Leiden community hierarchy from which the community reports are loaded.
# A higher value uses reports from more fine-grained communities (at a higher computation cost).
COMMUNITY_LEVEL = 2

# ^ Try a different community level if the answers are poor.

In [76]:
# Read the three indexer artifacts from INPUT_DIR, in this order:
# nodes, community reports, entities.
entity_df, report_df, entity_embedding_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (ENTITY_TABLE, COMMUNITY_REPORT_TABLE, ENTITY_EMBEDDING_TABLE)
)

# Turn the raw frames into the report/entity objects the context builder takes,
# restricted by COMMUNITY_LEVEL.
# NOTE(review): the SettingWithCopyWarning lines shown in this cell's output quote
# code that is not in this notebook, so they appear to come from inside these
# graphrag adapter functions.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
# Bare last expression so the notebook renders the first rows of the report table.
report_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Report records: 314


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,317,# Kraft Heinz Company and its Affiliates\n\nTh...,3,8.0,Kraft Heinz Company and its Affiliates,The Kraft Heinz Company's extensive network of...,The Kraft Heinz Company is a multinational foo...,[{'explanation': 'The Kraft Heinz Company demo...,"{\n ""title"": ""Kraft Heinz Company and its A...",d3cab7d2-a82a-4063-a1ae-3f247eb756f0
1,318,# SEC Oversight of The Kraft Heinz Company\n\n...,3,7.0,SEC Oversight of The Kraft Heinz Company,The SEC's oversight of public companies like T...,This community examines the relationship betwe...,[{'explanation': 'The Securities and Exchange ...,"{\n ""title"": ""SEC Oversight of The Kraft He...",03c9e9ca-39ae-4385-aab3-1a91b178c816
2,319,# Kraft Heinz Merger Community\n\nThe Kraft He...,3,7.5,Kraft Heinz Merger Community,The Kraft Heinz merger represents a significan...,The Kraft Heinz community centers around the 2...,[{'explanation': 'The 2015 merger of Kraft Foo...,"{\n ""title"": ""Kraft Heinz Merger Community""...",91f91355-6184-454e-b929-9921183b8a99
3,320,# Kraft Heinz Merger Community\n\nThe Kraft He...,3,8.0,Kraft Heinz Merger Community,The Kraft Heinz merger represents a significan...,The Kraft Heinz community centers around the 2...,[{'explanation': 'The data reveals that Kraft ...,"{\n ""title"": ""Kraft Heinz Merger Community""...",05627823-4567-4d95-b4df-f4d09ffa2ed9
4,321,# Kraft Heinz APAC Leadership\n\nThis communit...,3,6.0,Kraft Heinz APAC Leadership,The community demonstrates a clear leadership ...,This community centers around Rodrigo Wickbold...,[{'explanation': 'Rodrigo Wickbold is identifi...,"{\n ""title"": ""Kraft Heinz APAC Leadership"",...",b42b7b3e-45c2-42a5-aa8c-537ce117406d


In [77]:
# Context builder that is handed to GlobalSearch; built from the loaded community
# reports and entities plus the shared tokenizer.
context_builder = GlobalCommunityContext(
    community_reports=reports,
    entities=entities,  # default to None if you don't want to use community weights for ranking
    token_encoder=token_encoder,
)

In [78]:
# Passed to GlobalSearch as `context_builder_params`.
context_builder_params = dict(
    use_community_summary=False,  # False -> full community reports; True -> community short summaries
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    max_tokens=8_000,  # size this to your model's token limit (for an 8k-limit model, 5000 is a good setting)
    context_name="Reports",
)

# Passed to GlobalSearch as `map_llm_params`.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format={"type": "json_object"},
)

# Passed to GlobalSearch as `reduce_llm_params`.
reduce_llm_params = dict(
    max_tokens=4000,  # size this to your model's token limit (for an 8k-limit model, 1000-1500 is a good setting)
    temperature=0.0,
)

In [79]:
# Assemble the global search engine from the LLM client, the context builder,
# the tokenizer and the parameter dicts defined in the earlier cells.
# NOTE(review): `json_mode` is False while `map_llm_params` still carries a
# "response_format" entry - confirm which of the two GlobalSearch honours.
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # 12_000 originally. change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # setting this to True adds an instruction encouraging the LLM to incorporate general knowledge in the response, which may increase hallucinations but can be useful in some use cases.
    json_mode=False,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free-form text describing the response type and format; can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [93]:
# Run the global search; `asearch` is awaited directly at the top level of the cell.
# NOTE(review): the parenthetical at the end of the question is sent to the LLM
# verbatim - confirm it is meant to be part of the query rather than a note to
# the notebook user.
result = await search_engine.asearch(
    "Is there a strong correlation between sales growth and capital expenditure growth over the last 3 years and compared to competitors? (this can be changed depending on the metrics that are being used – e.g., could swap sales and CAPEX for two other metrics)."
)

# Print the generated answer text.
print(result.response)

### Analysis of Sales Growth and Capital Expenditure Growth Correlation

Multiple analysts have delved into the relationship between sales growth and capital expenditure (CAPEX) growth within the context of Kraft Heinz's financial performance over the last three years and in comparison to competitors.

#### Sales Growth Importance:
Analysts have highlighted the significance of sales growth as a key metric influencing a company's financial health. The success of Kraft Heinz and its competitors is intricately tied to consumer preferences and product acceptance. Consumer preferences play a pivotal role in driving sales, with a direct link between satisfying evolving tastes and maintaining positive sales figures [Data: Reports (146)]. Moreover, net sales figures for companies like Mondelez International indicate strong performance in key segments, showcasing a correlation between sales growth and strategic focus on high-performing areas [Data: Reports (227)].

#### Capital Expenditure Grow

In [72]:
# Inspect the data used to build the context for the LLM responses.
# This shows whatever `result` currently holds, so re-run it after each new search.
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,content,rank
0,205,Employer-Executive Relationships,0.095977,# Employer-Executive Relationships\n\nThis com...,7.5
1,75,Newell Brands' Rubbermaid Product Portfolio,0.058621,# Newell Brands' Rubbermaid Product Portfolio\...,7.0
2,158,Newell Brands Portfolio Analysis,0.054598,# Newell Brands Portfolio Analysis\n\nThis rep...,7.5
3,228,Commercial Products Performance Analysis,0.052874,# Commercial Products Performance Analysis\n\n...,7.0
4,188,Newell Brands' Kitchen Product Ecosystem,0.048276,# Newell Brands' Kitchen Product Ecosystem\n\n...,7.0
...,...,...,...,...,...
172,169,Juan R. Figuereo's Corporate Network,0.013218,# Juan R. Figuereo's Corporate Network\n\nThis...,7.5
173,32,Public Company Accounting Oversight & Internal...,0.011494,# Public Company Accounting Oversight & Intern...,8.5
174,104,Employee Compensation Limits Community,0.009770,# Employee Compensation Limits Community\n\nTh...,6.5
175,157,COSO Internal Control Framework - Newell Brand...,0.008621,# COSO Internal Control Framework - Newell Bra...,7.5


In [73]:
# Inspect the number of LLM calls and the token usage of the last search.
# The figure labelled "LLM tokens" is `result.prompt_tokens`.
print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")

LLM calls: 11. LLM tokens: 86999
