# Chapter 2. Indexing: Preparing Your Documents for LLMs

In [27]:
# Install the required packages
%pip install pypdf -q
%pip install faiss-cpu -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Import modules
import os
from dotenv import load_dotenv
from uuid import uuid4
from langchain_openai import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

# Load the files

In [16]:
# Load the txt file
# Use the Teamspace tab, find "this_studio" and upload the file to that folder; the uploads folder can't be found by this studio
loader = TextLoader("/teamspace/studios/this_studio/TeachingwithGenerativeAI.txt")
loader.load()

[Document(metadata={'source': '/teamspace/studios/this_studio/TeachingwithGenerativeAI.txt'}, page_content="\ufeffTeaching with Generative AI\n\nStudents and faculty report growing use of generative AI—tools that produce human-like writing (e.g ChatGPT), images (e.g. MidJourney), code (e.g. Microsoft Co-Pilot) and the like. The flexibility of these tools mean that there is no current default for acceptable vs. unacceptable use of these tools in coursework, and student adoption is moving faster than faculty adaptation. Many students are using AI without clear directions from their instructors about which uses are acceptable.\xa0\nFaculty should explain to students what is and is not allowed\xa0around AI use in their classes. (We know some faculty believe students are not using AI in their classes. If this is you, we assure you you are wrong.) The Provost’s Office recommends that faculty adopt the following three principles for student use of AI:\n* When a student uses these tools, they 

In [17]:
# Load a webpage with HTML content
loader = WebBaseLoader("https://ai.google/discover/blogs/")
loader.load()

[Document(metadata={'source': 'https://ai.google/discover/blogs/', 'title': 'Blogs – Google AI', 'description': 'At Google, we think the impact of AI will be most powerful when everyone can use it. Explore our tools.', 'language': 'en'}, page_content="\n\n\n\n\n\n\nBlogs – Google AI\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAI\n\n\n\n\n\n\n\n\n\n\n\n\n\nAI\n\n\n\n              Why AI\n              \n\n\n\n\n\n\n              Responsibility\n              \n\n\n\n\n\n\n\n\n\n\n\nResponsibility\n\n\n\nAI Principles\nObjectives for building beneficial AI\n\n\n\n\nResponsible AI Practices\nGuiding responsible AI development\n\n\n\n\nAI Governance & Operations\nOur review and approval process\n\n\n\n\nSocial Good\nAddressing societal challenges with AI\n\n\n\n\nPolicy\nOur contributions to AI Governance\n\n\n\n\n\n\n\nFEATURED RESEARCH\n\nAI for the benefit of humanity\n\n\n\n\nEXAMPLES OF OUR WORK\n\nImproving skin tone evaluation in 

In [18]:
# Load a PDF file (using relative path)
loader = PyPDFLoader("FallConference.pdf")
pages = loader.load()

# Split the files into chunks

In [19]:
# Split the document into chunks
loader = TextLoader("TeachingwithGenerativeAI.txt")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splitted_docs = splitter.split_documents(docs) # split_document accepts a list of documents
print(len(splitted_docs))
print(splitted_docs[0])
print()
print(splitted_docs[1])
print()
print(splitted_docs[2])

6
page_content='﻿Teaching with Generative AI' metadata={'source': 'TeachingwithGenerativeAI.txt'}

page_content='Students and faculty report growing use of generative AI—tools that produce human-like writing (e.g ChatGPT), images (e.g. MidJourney), code (e.g. Microsoft Co-Pilot) and the like. The flexibility of these tools mean that there is no current default for acceptable vs. unacceptable use of these tools in coursework, and student adoption is moving faster than faculty adaptation. Many students are using AI without clear directions from their instructors about which uses are acceptable. 
Faculty should explain to students what is and is not allowed around AI use in their classes. (We know some faculty believe students are not using AI in their classes. If this is you, we assure you you are wrong.) The Provost’s Office recommends that faculty adopt the following three principles for student use of AI:
* When a student uses these tools, they should acknowledge that use
* The studen

In [20]:
# Split python code into chunks
python_code = """
# Define a function to calculate the sum, product, and difference of two numbers
def calculate(a, b):
    return a + b, a * b, a - b
# Call the function with sample values
result1, result2, result3 = calculate(1, 2)
print(result1, result2, result3)
"""

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, 
    chunk_size=50, 
    chunk_overlap=0
)

python_docs = python_splitter.create_documents([python_code]) # create_documents accepts a list of strings
print(python_docs[0].page_content)
print()
print(python_docs[1].page_content)
print()
print(python_docs[2].page_content)

# Define a function to calculate the sum,

product, and difference of two numbers

def calculate(a, b):


In [21]:
# split a markdown text into chunks
markdown_text = """
# Welcome to the Studio
This is a Markdown file. You can use it to write and organize your thoughts.
## Create a new file
To create a new file, click on the "New File" button in the top right corner of the screen.
"""

md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, 
    chunk_size=60, 
    chunk_overlap=0
)
md_docs = md_splitter.create_documents([markdown_text], [{"source": "www.example.com"}])
print(md_docs[0].page_content)
print()
print(md_docs[1].page_content)
print()
print(md_docs[2].page_content)
print()
print(md_docs[3].metadata)

# Welcome to the Studio

This is a Markdown file. You can use it to write and

organize your thoughts.

{'source': 'www.example.com'}


# Create embeddings for your documents

In [22]:
# Innitiate the OpenAI embedding model
embed_model = OpenAIEmbeddings()

# Embed a list of strings
embeddings = embed_model.embed_documents([
    "Good morning!",
    "I love you!",
    "Thank you!",
    "Bye!",
    "You are welcome!",
])
print(embeddings)
print()
print(f"Embeddings shape: {len(embeddings), len(embeddings[0])}")

[[-0.00695313885807991, -0.015867900103330612, -0.0007344369660131633, -0.009008469060063362, -0.02391429990530014, 0.01881658099591732, -0.013094141148030758, 0.0015430595958605409, 0.010570270009338856, -0.006459609605371952, 0.03306020796298981, 0.007184285204857588, -0.024026749655604362, -0.027187835425138474, 0.006640778388828039, -0.01796695962548256, 0.03261040896177292, -0.004076301120221615, 0.02771260030567646, -0.0073717013001441956, -0.006846936419606209, 0.01078892219811678, -0.0029315007850527763, -0.012844252400100231, -0.008439972996711731, -0.0014915200881659985, 0.009258356876671314, -0.015443089418113232, 0.011513598263263702, -0.018091903999447823, 0.009439526125788689, -0.011813463643193245, -0.012107082642614841, 0.004438638687133789, 0.0007766056223772466, -0.030361413955688477, -0.0077277920208871365, -0.007621589582413435, -0.006222215946763754, -0.015443089418113232, 0.010532787069678307, -4.436491508386098e-05, 0.01144487876445055, -0.03171081095933914, -0.0

In [23]:
# A complete example of embedding a document (from file loading to chunking and embedding)

## Load the document 
loader = TextLoader("TeachingwithGenerativeAI.txt")
doc = loader.load()

## Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
chunks = text_splitter.split_documents(doc)

## Generate embeddings
embed_model = OpenAIEmbeddings()
embeddings = embed_model.embed_documents([chunk.page_content for chunk in chunks])
print(f"Embeddings shape: {len(embeddings), len(embeddings[0])}")

Embeddings shape: (6, 1536)


# Storing embeddings in a vector store

In [24]:
# Indexing to build a vector database
vector_db = FAISS.from_documents(
    documents = chunks, 
    embedding = embed_model)

In [25]:
# Use similarity search to find similar documents to a query
similar_docs = vector_db.similarity_search("How does a professor assess the appropriateness of a student's work?", k=2)
print(similar_docs[0].page_content)
print()
print(similar_docs[1].page_content)

* External Patterns: Writing does not match a student's previous work  (particularly work produced in class), lack of rough drafts or evidence of editing, footnotes or references not related to the body of the text, footnotes or references pointing to work that does not exist.
* Ask the student if they used generative AI on the assignment in inappropriate or unacknowledged ways, given the evidence. One possible response if they say Yes (and if it is in line with your school’s policies) is to require them to redo the work, providing evidence of editing
* If they deny using these tools but you continue to suspect that they used them, involve your school administration.
While there are a number of products that purport to positively identify AI-generated writing, they have high error rates, especially for students for whom English is an additional language. NYU does not license or endorse use of any of these tools.

Detecting and adjudicating inappropriate use of generative AI is harder t

In [26]:
# Add more documents to the index vector database

# Create new Document objects
document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

# Add the Documents to a list
documents = [
    document_1,
    document_2,
    document_3,
]

# Create a UUID for each Document
uuids = [str(uuid4()) for _ in range(len(documents))]

# Add the Documents to the vector database
vector_db.add_documents(documents=documents, ids=uuids)

['2f62ed51-64dd-4c3a-982f-36ec2a917f12',
 '6f26ca4a-33a8-4aa6-b479-53483f5c775f',
 '0d74e7fb-93c9-4994-972e-192fd1e4ca0c']

In [27]:
# Use similarity search to find similar documents to a query
similar_docs = vector_db.similarity_search("How is the weather tomorrow", k=1)
print(similar_docs[0].page_content)

The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.
