In [30]:
!pip install openai langchain tiktoken chromadb

Collecting grpcio>=1.58.0 (from chromadb)
  Using cached grpcio-1.59.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
Installing collected packages: grpcio
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.57.0
    Uninstalling grpcio-1.57.0:
      Successfully uninstalled grpcio-1.57.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
google-cloud-pubsublite 1.8.2 requires overrides<7.0.0,>=6.0.1, but you have overrides 7.3.1 which is incompatible.
ray 2.5.1 requires grpcio<=1.51.3,>=1.42.0; python_version >= "3.10" and sys_platform != "darwin", but you have grpcio 1.59.0 which is incompatible.
Successfully installed grpcio-1.59.0


In [32]:
import os
from getpass import getpass

# Keep the secret out of the notebook source: reuse a key that the environment
# already provides, otherwise prompt for it without echoing the input.
# For non-interactive runs, export OPENAI_API_KEY before starting the kernel.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your api key: ")

## Document loader

In [None]:
from langchain.document_loaders import PyPDFLoader

# Point the loader at the course handout PDF from the Kaggle input dataset.
loader = PyPDFLoader(
    "/kaggle/input/cs224u-contextreps/cs224u-contextualreps-2023-handout.pdf"
)
# `docs` holds the loaded documents that the splitting step consumes.
docs = loader.load()

## Splitting into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks overlap by 150 so text cut at a boundary still appears whole in one
# of the neighbouring chunks (sizes are presumably in characters -- confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Break the loaded documents into chunks and show how many were produced.
splits = text_splitter.split_documents(docs)
len(splits)

## Embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector store below; presumably authenticates
# with the OPENAI_API_KEY environment variable set earlier -- confirm.
embedding = OpenAIEmbeddings()

## Vector store

In [None]:
from langchain.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store.
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

# Sanity check: number of entries in the underlying collection.
# `_collection` is a private attribute, so this may break across versions.
print(vectordb._collection.count())

In [None]:
# Query text that gets embedded and matched against the stored chunks.
question = "What are the static vector representations of words?"
# NOTE: this rebinds `docs` from the loaded PDF documents to the search hits,
# so re-run the loader cell before re-running the splitting cell.
docs = vectordb.similarity_search(question, k=3)

In [None]:
# Number of hits returned; the similarity search above asked for k=3.
len(docs)

In [None]:
# Text of the first hit returned by the similarity search.
docs[0].page_content

## Model + prompt

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# Chat model used to answer questions. temperature=0 presumably requests the
# least random output.
# NOTE(review): "gpt-3.5-turbo-0301" is a dated snapshot -- confirm it is still served.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0301", temperature=0)

# Prompt with two placeholders: {context} for the retrieved text and
# {question} for the user's query.
template = """Given the following context answer the question at the end. If you don't know the answer return can't answer
{context}
Question: {question}
Answer:"""
qa_chain_prompt = PromptTemplate.from_template(template)

## Run Chain

In [None]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain: fetch chunks from the vector store, then answer
# with the custom prompt.
# chain_type is spelled out ("stuff" is also the library default) because the
# `prompt` kwarg with {context}/{question} placeholders is specific to that
# chain type; other chain types expect differently named prompt arguments, so
# the chain type cannot be swapped without also changing chain_type_kwargs.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": qa_chain_prompt},
)

In [None]:
question = "What are the static vector representations of words?"

# The chain takes its input under "query" and exposes the answer under "result".
result = qa_chain({"query": question})
result["result"]

## Wrap everything together

In [33]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA



# Loader: read the course handout PDF from the Kaggle input dataset.
loader = PyPDFLoader("/kaggle/input/cs224u-contextreps/cs224u-contextualreps-2023-handout.pdf")
docs = loader.load()


# Splitting into chunks: overlapping chunks so text cut at a boundary still
# appears whole in a neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)
splits = text_splitter.split_documents(docs)


# Embeddings
embedding = OpenAIEmbeddings()


# Vector store
# NOTE(review): no collection name or persist directory is passed, so Chroma's
# defaults are used. If a store was already built from the same chunks in this
# kernel, the chunks may end up indexed twice -- confirm against the installed
# chromadb version.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
)


# Model + Prompt
# NOTE(review): "gpt-3.5-turbo-0301" is a dated snapshot -- confirm it is still served.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0301", temperature=0)
template = """Given the following context answer the question at the end. If you don't know the answer return can't answer
{context}
Question: {question}
Answer:"""
qa_chain_prompt = PromptTemplate.from_template(template)


# Chain
# chain_type is spelled out ("stuff" is also the library default) because the
# `prompt` kwarg with {context}/{question} placeholders is specific to that
# chain type; other chain types expect differently named prompt arguments, so
# the chain type cannot be swapped without also changing chain_type_kwargs.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": qa_chain_prompt},
)

In [34]:
question = "What are the static vector representations of words?"

# The chain takes its input under "query" and exposes the answer under "result".
result = qa_chain({"query": question})
result["result"]

'Static vector representations of words are various methods of representing words as vectors, including feature-based methods, count-based methods, classical dimensionality reduction, and learned dimensionality reduction. Examples of learned dimensionality reduction methods include autoencoders, word2vec, and GloVe.'

## Let's use memory

In [49]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Conversation history is kept under the "chat_history" key; return_messages=True
# presumably stores it as message objects rather than one string -- confirm.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational variant of the retrieval chain: same LLM and retriever as
# before, plus the memory object so follow-up questions can use earlier turns.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectordb.as_retriever(),
    memory=memory,
)

In [50]:
# First turn of the conversation; this chain reads its input from "question"
# and exposes the reply under "answer".
question = "Is GPT one of the topics mentioned in the class context?"
result = qa(dict(question=question))
result["answer"]

'Yes, GPT is mentioned in the class context.'

In [51]:
# Follow-up turn: "those topics" is only resolvable through the chat history
# that the memory object carried over from the previous question.
question = "What are those topics?"
result = qa(dict(question=question))
result["answer"]

'The class context mentions guiding ideas, transformer, positional encoding, GPT, BERT, RoBERTa, ELECTRA, seq2seq, distillation, and contextual word representations. It also includes a brief history of contextual representation, which mentions GPT as being introduced in June 2018.'