In [1]:
# pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [1]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [4]:
loader = PyPDFLoader(r"C:\Users\user2\Desktop\GitRepos\Learning-NLP\Builing a Language Model\sample-BRICS.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [5]:
data = loader.load()

In [17]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {sum([len(d.page_content) for d in data])} characters in your document')

You have 3 document(s) in your data
There are 5986 characters in your document


### Chunk your data up into smaller documents

In [18]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [46]:
texts

[Document(page_content="TEHRAN – Iran is among the six countries that on Thursday were invited to fully join the \nBRICS group of emerging economies.   \nDuring a BRICS summit held in Johannesburg, South African President Cyril Ramaphosa \nannounced the BRICS member states have agreed to admit Iran, Argentina, Egypt, Ethiopia, \nthe UAE and Saudi Arabia as full members. That means the bloc currently consisting of \nBrazil, Russia, India, China and South Africa, will double in the number of members as of the \nbeginning of next year.  \nIranian President Ebrahim Raisi who traveled to South Africa to attend the summit has \ncalled the advantages of Iran's membership in the  bloc “history -making”.   \n“Strategic cooperation between Iran and BRICS members in the fields of transit, energy, \nand trade, will support the BRICS global agenda. The Islamic Republic of Iran strongly \nsupports the successful efforts of BRICS in the path of de-dollarization of economic \nrelations between members

In [19]:
print (f'Now you have {len(texts)} documents')

Now you have 5 documents


### Create embeddings of your documents to get ready for semantic search

In [21]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [22]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
# OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '5e01a21c-b757-4537-8044-c76e016bbb92')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-west4-gcp-free')
# You may need to switch with your env

In [24]:
from chromadb.utils import embedding_functions
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embeddings = embedding_functions.DefaultEmbeddingFunction()

In [26]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchaintest" # put in the name of your pinecone index here

In [45]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

MaxRetryError: HTTPSConnectionPool(host='controller.us-west4-gcp-free.pinecone.io', port=443): Max retries exceeded with url: /databases (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000026918C014D0>, 'Connection to controller.us-west4-gcp-free.pinecone.io timed out. (connect timeout=None)'))

In [14]:
# query = "What are examples of good data science teams?"
query = "how important is Iran's membership in BRICS?"
docs = docsearch.similarity_search(query)

In [17]:
# Here's an example of the first document that was returned
print(docs[0].page_content[:450])

Intelligence and cloud infrastructure development  
work. We saw the need for a  
new approach to distill value 
from our clients’ data. We 
approached the problem 
with a multidisciplinary 
team of computer scientists, 
mathematicians and domain 
experts. They immediately 
produced new insights and 
analysis paths, solidifying the 
validity of the approach. Since 
that time, our Data Science  
team has grown to 250 staff 
supporting dozens of cl


### Query those docs to get your answer back

In [40]:
# from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [41]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

In [50]:
# llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="D:\LLM Files\llama-2-7b-chat.ggmlv3.q4_0.bin",
    # model_path="D:\LLM Files\wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,
)
chain = load_qa_chain(llm, chain_type="stuff")

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [43]:
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

NameError: name 'docsearch' is not defined

In [53]:
query = "how important is Iran's membership in BRICS?"
chain.run(input_documents=texts, question=query)

ValueError: Requested tokens (1642) exceed context window of 512