In [1]:
!pip -q install chromadb openai langchain tiktoken

In [2]:
# Print the installed chromadb package metadata (version, location, dependencies).
!pip show chromadb

Name: chromadb
Version: 1.3.4
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: c:\users\hli\appdata\roaming\python\python310\site-packages
Requires: bcrypt, build, grpcio, httpx, importlib-resources, jsonschema, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-sdk, orjson, overrides, posthog, pybase64, pydantic, pypika, pyyaml, rich, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [None]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
!unzip -q new_articles.zip -d new_articles

In [None]:
# set up environment
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader,TextLoader

# Pull variables from a local .env file (if one exists) into the process
# environment; load_dotenv was imported above but never called.
load_dotenv()
# Fail early with a readable message. The previous self-assignment
# (os.environ[...] = os.getenv(...)) raised a TypeError when the key was unset.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; export it or put it in a .env file")

# load data
# Relative path: the archive was unzipped into ./new_articles, so this no longer
# depends on the working directory being /content.
loader=DirectoryLoader("./new_articles/",glob="./*.txt",loader_cls=TextLoader) # to load tons of .txt files
document=loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded articles into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
text = text_splitter.split_documents(document)

# create db
from langchain import embeddings
persist_directory='db'
embedding=OpenAIEmbeddings()
# Embed every chunk and store it in a Chroma collection backed by persist_directory.
# The handle is named `vectordb` consistently; the earlier `verctordb` spelling
# left `vectordb` undefined at the persist() call below.
vectordb=Chroma.from_documents(documents=text,embedding=embedding,persist_directory=persist_directory)
# make the db persist to disk
vectordb.persist()
# Drop the in-memory handle (plain assignment; `None()` is not callable) so the
# next statement rebuilds it from persist_directory.
vectordb=None
# now we can load the persistent database from disk and use it as normal
vectordb=Chroma(persist_directory=persist_directory,embedding_function=embedding)

# make a retriever
query="How much money did Microsoft raise?"
retriever=vectordb.as_retriever()
docs=retriever.get_relevant_documents(query)
# A notebook cell only echoes its final expression, so the values being
# inspected are printed explicitly instead of left as bare expressions.
print(len(docs))
print(retriever.search_kwargs)
print(retriever.search_type)

# make a chain
# RetrievalQA was referenced as "RetrivalQA" and never imported in this cell.
from langchain.chains import RetrievalQA

llm=OpenAI()
# create a chain to answer the question
# chain_type="stuff" is passed through unchanged; return_source_documents=True
# keeps the retrieved documents in the response so they can be cited.
qa_chain=RetrievalQA.from_chain_type(llm=llm,  # reuse the model created above
                                     chain_type="stuff",
                                     retriever=retriever,
                                     return_source_documents=True)
# cite sources
def process_llm_response(llm_response):
    """Print the answer, then one line per cited source.

    ``llm_response`` is indexed with the keys 'result' and
    'source_documents'. Each source document must expose a ``metadata``
    mapping that contains a 'source' entry. Nothing is returned.
    """
    print(llm_response['result'])
    # Lazy generator: each source is looked up right before it is printed.
    cited_paths = (doc.metadata['source'] for doc in llm_response['source_documents'])
    for path in cited_paths:
        print(path)

# Run the retrieval chain on the question (passed under the chain's "query"
# input key) and print the answer together with its sources.
llm_response = qa_chain({"query": query})
process_llm_response(llm_response)

# deleting the db
# Pure-Python replacements for `zip -r` / `rm -rf`, which are not available on
# every platform this notebook is run on.
import shutil

# Archive the persisted store as db.zip (entries are stored under db/).
shutil.make_archive("db", "zip", root_dir=".", base_dir=persist_directory)
vectordb.delete_collection()
vectordb.persist()

# delete the directory
# ignore_errors mirrors `rm -f`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Pinecone
!pip install langchain
!pip install pinecone-client
!pip install pypdf
from langchain.document_loader import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrieveQA
from langchain.prompts import PromptTemplate
import os

# load file
!mkdir pdfs
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

# extract the text from the pdfs
loader=PyPDFDirectoryLoader("pdfs")
data=loader.load()
# split the texts into chunks
text_splitter=RecursiveCharacterTextSplitter(chunk_size=00,chunk_overlap=20)
text_chunks=text_splitter.split_documents(data)

import os
from getpass import getpass

# Keep a key that is already configured; only prompt when none is set.
# The previous assignment overwrote the variable with an empty string.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")
embeddings=OpenAIEmbeddings()
# Smoke test: embed one short query to confirm the key and model work.
result=embeddings.embed_query("how are you?")

# init pinecone
# The API key is read from the environment only; no credential-shaped default
# is kept in the notebook source.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set")
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

import pinecone
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "test" # put in the name of your pinecone index here

# create embeddings for each of the text chunk
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks],embeddings,index_name=index_name)
# if you already have an index, we can load it like this
docsearch=Pinecone.from_existing_index(index_name,embeddings)
# similarity search
query="YOLOv7 outperforms which models"
docs=docsearch.similarity_search(query,k=3)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(docs)
# create a llm model wrapper
llm=OpenAI()
qa=RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=docsearch.as_retriever())
# Show the answer; the return value of run() was previously discarded.
print(qa.run(query))

# Interactive question loop: type "exit" to stop; empty input is ignored.
while True:
    user_input=input("input:")
    if user_input=="exit":
        # break ends the loop without raising SystemExit inside the kernel
        break
    if user_input=="":
        continue
    result=qa({'query':user_input})
    print(f"answer:{result['result']}")

In [None]:
# Weaviate
!pip install weaviate-client
!pip install langchain
!pip install openai
OPENAI_API_KEY = ""
WEAVIATE_API_KEY = ""
WEAVIATE_CLUSTER = "https://test-njh6t5hm.weaviate.network"
!mkdir data
!pip install unstructured
!pip install "unstructured[pdf]"

from langchain.document_loaders import DirectoryLoader
# OpenAIEmbeddings is used below but was not imported in this section.
from langchain.embeddings import OpenAIEmbeddings

# Load every PDF found (recursively) under ./data.
# NOTE(review): ./data is created empty above - copy the PDFs in before running.
loader = DirectoryLoader("./data",glob = "**/*.pdf")
data = loader.load()
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks (size 1000, overlap 20).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

import weaviate
from langchain.vectorstores import Weaviate

# Open an API-key-authenticated connection to the Weaviate cluster.
WEAVIATE_URL = WEAVIATE_CLUSTER
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=auth_config,
    # the OpenAI key is sent to Weaviate as a request header
    additional_headers={"X-OpenAI-Api-key": OPENAI_API_KEY},
    startup_period=10,
)
client.is_ready()

# define input structure
# WARNING: delete_all() wipes the whole schema of the connected instance.
client.schema.delete_all()
client.schema.get()
# "classes" must be a list of class definitions, each named by its "class" key.
# The earlier literal had a bare string here and unbalanced closing brackets.
schema={
    "classes":[
        {
            "class":"Chatbot",
            "description":"Document for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai":{"model":"ada","type":"text"}},
            "properties":[
                {
                    "dataType":["text"],
                    "description":"The content of the paragraph",
                    "moduleConfig":{
                        # must name the class's vectorizer module (was "text1vec-api")
                        "text2vec-openai":{
                            "skip":False,
                            "vectorizePropertyName":False,
                        }
                    },
                    "name":"content",
                },
            ],
        },
    ],
}
client.schema.create(schema)
# text_key is the property name as a string; the metadata fields to return are
# passed through the `attributes` keyword (not `attribute`).
vectorstore=Weaviate(client,"Chatbot","content",attributes=["source"])

# load text into the vectorstore
text_meta_pair=[(doc.page_content,doc.metadata) for doc in docs]
# Build the two parallel lists directly: unpacking list(zip(*pairs)) into two
# names raises ValueError when there are no documents at all.
texts=[content for content,_ in text_meta_pair]
meta=[metadata for _,metadata in text_meta_pair]
if texts:
    vectorstore.add_texts(texts,meta)
else:
    print("No documents to add - check that ./data contains PDF files")

# similarity measurement
query="what is a yolo?"
# The number of hits is controlled by `k`; `top_k` is not a similarity_search
# parameter and had no effect.
# NOTE(review): k=4 is the documented default the call was effectively using.
# Raise it with care - the "stuff" chain below puts every hit into one prompt.
docs=vectorstore.similarity_search(query,k=4)

# chain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
chain=load_qa_chain(OpenAI(openai_api_key=OPENAI_API_KEY,temperature=0),chain_type="stuff")
chain.run(input_documents=docs,question=query) # create answer