In [1]:
import getpass
import os

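# Uncomment to enter the key interactively when the OPENAI_API environment
# variable is not already set (the embedding cell below reads it via os.getenv):
# os.environ["OPENAI_API"] = getpass.getpass()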

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Resolve the OpenAI key first so that a fresh kernel fails fast with an
# actionable message, instead of erroring only after the PDF has been parsed.
# "OPENAI_API" is the variable this notebook uses; "OPENAI_API_KEY" is the
# name the OpenAI SDK conventionally reads, so it is accepted as a fallback.
openai_api_key = os.getenv("OPENAI_API") or os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "No OpenAI API key found: set the OPENAI_API (or OPENAI_API_KEY) "
        "environment variable before running this cell."
    )

# Load the PDF into LangChain documents.
loader = PyPDFLoader("../data/Week-7.pdf")
documents = loader.load()

# Split the loaded documents into chunks for embedding.
# NOTE(review): CharacterTextSplitter is documented to split on "\n\n" by
# default, while the text extracted from this PDF appears to contain a single
# "\n" between words (see the printed page_content further down). If so,
# chunk_size may not be enforced here -- confirm, and consider passing an
# explicit separator.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Embed every chunk with OpenAI and build the FAISS index from the result.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)
db = FAISS.from_documents(docs, embeddings)

# Number of vectors stored in the index.
print(db.index.ntotal)

12


In [5]:
# Free-text question to look up in the indexed PDF chunks.
query = "What is the deliverables"

# Retrieve the chunks most similar to the query; the result is a list of
# Document objects (note: this rebinds `docs` from the chunk list above).
docs = db.similarity_search(query=query)

In [6]:
# Show the raw text of the first search result.
top_hit = docs[0]
print(top_hit.page_content)

Deliverables
NOTE:
Document
should
be
a
PDF
stored
in
google
drive
or
published
blog
link.
DO
NOT
SUBMIT
A
LINK
as
PDF!
If
you
want
to
submit
pdf
document,
it
should
be
the
content
of
your
report
not
a
link.
Interim
Submission
-
Wednesday
8pm
UTC
●
Link
to
your
code
in
GitHub
○
Repository
where
you
will
be
using
to
complete
the
tasks
in
this
week's
challenge.
A
minimum
requirement
is
that
you
have
a
well
structured
repository
and
some
coding
progress
is
made.
●
A
review
report
of
your
reading
and
understanding
of
Task
1
and
any
progress
you
made
in
other
tasks.
Feedback
You
may
not
receive
detailed
comments
on
your
interim
submission
but
will
receive
a
grade.
Final
Submission
-
Saturday
8pm
UTC
●
Link
to
your
code
in
GitHub
○
Complete
work
for
Automatic
prompt
generation
○
Complete
work
for
Automatic
evaluation
○
Complete
work
for
Evaluation
Data
Generation
●
A
blog
post
entry
(which
you
can
submit
for
example
to
Medium
publishing)
or
a
pdf
report.
Feedback
You
will
receive
comments/fe

In [7]:
# Same lookup as before, but each hit comes back as a (Document, score) pair.
# NOTE(review): per the LangChain FAISS docs the score is a distance, so
# lower means more similar -- confirm for the installed version.
docs_and_scores = db.similarity_search_with_score(query=query)

In [8]:
# Display the first entry of the scored search results.
docs_and_scores[0]

(Document(page_content="Deliverables\nNOTE:\nDocument\nshould\nbe\na\nPDF\nstored\nin\ngoogle\ndrive\nor\npublished\nblog\nlink.\nDO\nNOT\nSUBMIT\nA\nLINK\nas\nPDF!\nIf\nyou\nwant\nto\nsubmit\npdf\ndocument,\nit\nshould\nbe\nthe\ncontent\nof\nyour\nreport\nnot\na\nlink.\nInterim\nSubmission\n-\nWednesday\n8pm\nUTC\n●\nLink\nto\nyour\ncode\nin\nGitHub\n○\nRepository\nwhere\nyou\nwill\nbe\nusing\nto\ncomplete\nthe\ntasks\nin\nthis\nweek's\nchallenge.\nA\nminimum\nrequirement\nis\nthat\nyou\nhave\na\nwell\nstructured\nrepository\nand\nsome\ncoding\nprogress\nis\nmade.\n●\nA\nreview\nreport\nof\nyour\nreading\nand\nunderstanding\nof\nTask\n1\nand\nany\nprogress\nyou\nmade\nin\nother\ntasks.\nFeedback\nYou\nmay\nnot\nreceive\ndetailed\ncomments\non\nyour\ninterim\nsubmission\nbut\nwill\nreceive\na\ngrade.\nFinal\nSubmission\n-\nSaturday\n8pm\nUTC\n●\nLink\nto\nyour\ncode\nin\nGitHub\n○\nComplete\nwork\nfor\nAutomatic\nprompt\ngeneration\n○\nComplete\nwork\nfor\nAutomatic\nevaluation\n○\nCom

In [9]:
# Embed the query text ourselves, then search the index with the raw vector
# instead of the query string.
embedding_vector = embeddings.embed_query(text=query)

# NOTE(review): similarity_search_by_vector is documented to return Documents
# only (no scores), so the name `docs_and_scores` is misleading here and it
# overwrites the scored results from the earlier cell. The name is kept
# because later cells may reference it -- confirm before renaming.
docs_and_scores = db.similarity_search_by_vector(embedding=embedding_vector)