In [1]:
import os

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [3]:
# Build a loader for the local copy of the PDF; nothing is read until .load().
# To pull the same guide over HTTP instead, swap in:
#   OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")
loader = UnstructuredPDFLoader(
    "../data/field-guide-to-data-science.pdf"
)

In [4]:
# Parse the PDF into a list of documents (the summary cell below reports one).
# The recorded run also printed NLTK resource downloads and a notice that
# detectron2 is missing, so partitioning fell back to the "fast" strategy.
data = loader.load()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cinde\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cinde\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [5]:
# Sanity check on the load: document count, then the text length of the first
# document (indexing data[0] assumes at least one document was returned).
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 200014 characters in your document


### Chunk your data up into smaller documents

In [6]:
# Break the loaded document into smaller pieces for embedding: a target size
# of 1000 characters per chunk, with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [7]:
# Report how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

Now you have 248 documents


### Create embeddings of your documents to get ready for semantic search

In [9]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# NOTE: a pasted dotenv (`.env`) file used to follow these imports. It was
# removed for two reasons:
#   1. Unquoted `KEY=value` dotenv lines are not valid Python, so this cell
#      (including the imports above) could not execute on a fresh kernel.
#   2. It embedded live OpenAI and Pinecone API keys in the notebook. Any key
#      that was ever saved here must be treated as compromised and rotated in
#      the provider's console.
# The other pasted settings (temperature, model names, token limits, memory
# backend) are not referenced by any cell visible in this notebook.
# Supply credentials through environment variables or a git-ignored `.env`
# file rather than pasting them into a cell.

In [10]:
# Credentials are read from the environment so that no secret is stored in the
# notebook or its history. Set OPENAI_API_KEY and PINECONE_API_KEY in the shell
# that launches Jupyter; a missing variable raises KeyError naming it, which
# fails fast here instead of deep inside a later API call.
# The keys that were previously hardcoded in this cell were exposed and must be
# rotated.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
# The Pinecone environment (region) is not a secret, so the value this notebook
# used before is kept as the default when PINECONE_API_ENV is not set.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "us-east-1-aws")

In [11]:
# Embedding client used both to index the chunks and to embed search queries.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [12]:
# initialize pinecone
# Name of the Pinecone index the chunks are written to and searched in.
# NOTE(review): presumably this index must already exist in the Pinecone
# project -- confirm before running on a new account.
index_name = "longtext"

# Configure the Pinecone client. Both values are shown in the console at
# app.pinecone.io (the environment is listed next to the API key).
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

In [13]:
# Embed the text of every chunk and build a vector store on the index named
# above. Only `page_content` is handed over, so chunk metadata is not passed.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [14]:
# First retrieval: fetch the chunks most similar to the question.
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

### Query those docs to get your answer back

In [15]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [16]:
# Completion model for answering; a low temperature (0.2) is used here.
llm = OpenAI(
    temperature=0.2,
    openai_api_key=OPENAI_API_KEY,
)
# NOTE(review): the "stuff" chain type presumably places all retrieved
# documents into a single prompt -- confirm against the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
# Question answered in the next cell; `query` and `docs` from the earlier
# retrieval cell are overwritten here.
# Alternative question to try:
#   query = "What is the collect stage of data maturity?"
query = "What is the most important data science skills and tools?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [22]:
# Answer `query` using the retrieved chunks as context. As the last expression
# in the cell, the returned answer string is shown as the cell output.
chain.run(input_documents=docs, question=query)

' The most important data science skills and tools are curiosity, creativity, focus, attention to detail, and flexibility. Useful tools include feature hashing, wrapper methods, sensitivity analysis, self organizing maps, deduplication, normalization, format conversion, fast Fourier transform (FFT), discrete wavelet transform, and coordinate transform.'