In [None]:
#inspiration: https://medium.com/@jerryjliu98/how-unstructured-and-llamaindex-can-help-bring-the-power-of-llms-to-your-own-data-3657d063e30d

In [None]:
# !pip install llama-index
# !pip install pandas
# !pip install langchain


In [1]:
# Provide the OpenAI API key through the environment, not the notebook source.

import os
from getpass import getpass

# Keep a key that is already set in the environment; otherwise prompt for it
# (input is not echoed) so the secret is never saved in the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your API key here: ")

In [2]:
from llama_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path
from langchain.document_loaders import UnstructuredFileLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

# Base name (without the ".csv" extension) of the CSV file with the abstracts.
csvfile = 'Enter name of csv file here'

# Append the extension and read the whole file into a DataFrame.
filename = f"{csvfile}.csv"
data = pd.read_csv(filename)

In [4]:
# The data is divided into chunks of at most `chunks` rows each.

chunks = 20

# Ceiling division by `chunks` (not a hard-coded 20): a final, partially
# filled chunk is counted too, so trailing rows are not dropped when the row
# count is not an exact multiple of the chunk size.
total_files = -(-data.shape[0] // chunks)

total_files

5

In [5]:
# Write each consecutive slice of `chunks` rows to its own CSV file named
# <csvfile><i>.csv, leaving out the DataFrame index column.
for i in range(total_files):
    start = i * chunks
    datachunk = data.iloc[start:start + chunks]
    datachunk.to_csv(f"{csvfile}{i}.csv", index=False)


In [6]:
from llama_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path

In [7]:
# Obtain the "UnstructuredReader" loader class through llama_index's download_loader.
# refresh_cache=True is passed — presumably forces a fresh download instead of
# reusing a cached copy; confirm against the llama_index docs.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

In [8]:
# Load every chunk file and keep its documents in a dict keyed by chunk index.

loader = UnstructuredReader()
doc_set = {}
all_docs = []

# "years" are simply the chunk indices 0 .. total_files - 1.
years = list(range(total_files))

for year in years:
    file_path = Path(f"{csvfile}{year}.csv")
    year_docs = loader.load_data(file=file_path, split_documents=False)
    # tag each loaded document with the chunk index it came from
    for doc in year_docs:
        doc.extra_info = {"year": year}
    doc_set[year] = year_docs
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dries.faems\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dries.faems\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Set Up the Service Context

In [None]:
from llama_index import ServiceContext

# Service context used when building the per-chunk vector indices.
CHUNK_SIZE_LIMIT = 512
service_context = ServiceContext.from_defaults(chunk_size_limit=CHUNK_SIZE_LIMIT)

### Set Up a Vector Index for Each Document

In [None]:
# Build one simple vector index per chunk and persist each to index_<n>.json.
# NOTE: don't run this cell if the indices are already loaded!
index_set = {}
for year in years:
    cur_index = GPTSimpleVectorIndex.from_documents(
        doc_set[year],
        service_context=service_context,
    )
    index_set[year] = cur_index
    index_set[year].save_to_disk(f'index_{year}.json')

In [None]:
# Display the mapping from chunk index to the index object just built.
index_set

In [None]:
# Rebuild index_set from the index_<n>.json files saved on disk.
index_set = {}
for year in years:
    index_path = f'index_{year}.json'
    cur_index = GPTSimpleVectorIndex.load_from_disk(
        index_path,
        service_context=service_context,
    )
    index_set[year] = cur_index

In [None]:
# Display the mapping from chunk index to the index object loaded from disk.
index_set

### Composing a Graph to synthesize information across all patents

We want our queries to aggregate/synthesize information across all patents. 

In [None]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph

In [None]:
# Give every chunk index a short summary label ("Abstract part <n>").
summaries = {year: "Abstract part " + str(year) for year in years}


In [None]:
# Display the summary label assigned to each chunk index.
summaries

In [None]:
# LLM used for the graph; max_tokens sets the number of output tokens.
llm = OpenAI(temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=llm)
# NOTE(review): this rebinds service_context, replacing the one created
# earlier for the per-chunk indices — confirm that is intended.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [None]:
# Compose the per-chunk indices under a list index, pairing each child index
# with its summary label in the same order as `years`.
child_indices = [index_set[y] for y in years]
child_summaries = [summaries[y] for y in years]
graph = ComposableGraph.from_indices(
    GPTListIndex,
    child_indices,
    child_summaries,
    service_context=service_context,
)

In [None]:
# Persist the composed graph to graph.json (relative path).
graph.save_to_disk('graph.json')

In [None]:
# Reload the graph from graph.json, passing the current service_context.
graph = ComposableGraph.load_from_disk('graph.json', service_context=service_context)

### Setting Up the Query

We query the composed graph built over the indexed patent abstracts.

In [None]:
# Prompt sent to the graph: asks for a short business model and the patent
# numbers it draws on.
risk_query_str = (
    "Describe a novel business model on the provided patents, using maximum 150 words. "
    "Indicate the numbers of the patents that are used."
)

In [None]:
# Run the query string against the composed graph and keep the result.
response = graph.query(risk_query_str)

In [None]:
# Print the response returned by the graph query.
print(response)