This notebook generally serves as a prototype to ingest split documents/data into smaller chunks of text and embed them into the pinecone database.

Below is all of the installs used

In [3]:
%pip install pymupdf4llm
%pip install langchain
%pip install langchain_community
%pip install langchain_openai
%pip install langchain_core
!pip install sentence_transformers
!pip install -qU\
pinecone-client==3.0.0
%pip install langchain_pinecone

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.24-py3-none-any.whl (28 kB)
Collecting pymupdf>=1.25.5
  Using cached pymupdf-1.25.5-cp39-abi3-macosx_10_9_x86_64.whl (19.4 MB)
Installing collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.25.5 pymupdf4llm-0.0.24
You should consider upgrading via the '/Users/ehimenagbonkhese/Downloads/220540069FPY/envv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/ehimenagbonkhese/Downloads/220540069FPY/envv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/ehimenagbonkhese/Downloads/220540069FPY/envv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/Users/ehimenagbonkhese/Downloads/2205

#needed API keys - deleted during submission, dont hesitate to ask for any of them

In [1]:
from dotenv import load_dotenv
import os
load_dotenv('secrets.env')

PINECONE_APIKEY = os.getenv('PINECONE_APIKEY')
OPEN_AI_APIKEY = os.getenv('OPEN_AI_APIKEY')

In [2]:
import pymupdf4llm
import pathlib
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter, MarkdownTextSplitter
from langchain_core.documents import Document

EECSHB = pymupdf4llm.to_markdown("QMUL_EECS_student_handbook_2024-25.pdf")

pathlib.Path("handbook.md").write_bytes(EECSHB.encode())
header = MarkdownHeaderTextSplitter(headers_to_split_on=[("##", 'header1'),("###",'header2')], strip_headers = False)

ModuleNotFoundError: No module named 'pymupdf4llm'

#chunks are split into headers

In [None]:
chunks = header.split_text(EECSHB)
print(len(chunks))


39


#Measuring token lengths and finding chunks with token lengths higher than 250

In [None]:
import tiktoken

def measure_token_length(text):
    encoding = tiktoken.encoding_for_model("gpt-4")
    tokens = encoding.encode(text)
    return len(tokens)
x = 0
for i in range(len(chunks)):
  b = measure_token_length(chunks[i].page_content)
  if b > x :
    x = b

print(x)

#this finds chunks of text where thetoken size exceeds 250
def chunks_overflow(chunks:list[Document]):
  largechunks = []
  for i in range(len(chunks)):
    if measure_token_length(chunks[i].page_content) > 250:
      largechunks.append(i)

  return largechunks

largechunk = chunks_overflow(chunks)
for chunk in largechunk:
  print(chunk)

print(len)


1330
2
8
9
10
11
12
13
14
15
16
17
18
19
21
22
24
25
26
27
28
30
32
34
35
36
<built-in function len>


In [None]:
def reduce_chunk_size(chunks, largechunks):
  splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 80)
  tempchunks = []
  for index in largechunks:
    metadata = chunks[index].metadata
    tempchunks = splitter.split_text(chunks[index].page_content)
    tempchunks1 = set(tempchunks)
    chunks.remove(chunks[index])
    for i in tempchunks1:
      chunks.append(Document(page_content=i, metadata=metadata))
  return chunks

def measure_max_token_length(chunks):
  x = 0
  for i in range(len(chunks)):
    b = measure_token_length(chunks[i].page_content)

    if b > x :
      x = b
  return x

def reduce_chunk_size2(chunks,largechunk):
  x = measure_max_token_length(chunks)
  while (x > 300):
    chunks = reduce_chunk_size(chunks, largechunk)
    x = measure_max_token_length(chunks)
  return chunks



chunks = reduce_chunk_size2(chunks, largechunk)
print(measure_max_token_length(chunks))
print(len(chunks))




192
217


# Initializing the Pinecone database

In [6]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
load_dotenv('../secrets.env')

PINECONE_APIKEY = os.getenv('PINECONE_APIKEY')

pc = Pinecone(api_key=PINECONE_APIKEY)
#vectorz = pc.create_index(name="qmuldocs",dimension=384, metric = "cosine", spec = ServerlessSpec(cloud = 'aws', region = 'us-east-1'))
vectors = pc.Index("qmuldocs")

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings

Embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  Embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from langchain_pinecone import PineconeVectorStore
vectors = PineconeVectorStore(index=vectors,embedding=Embedder)

In [None]:
EECS_ids = []
def create_ids(chunks,name):
  ids = []
  for i in range(len(chunks)):
    ids.append(name + str(i))

  return ids

EECS_ids = create_ids(chunks, "EECS_Student_handbook_")


EECS_link = "https://qmplus.qmul.ac.uk/mod/book/tool/print/index.php?id=2692210"

EECS_metadata = []
def create_metadata_with_link(chunks,name, link):
  metadata = []
  for chunk in chunks:
    chunk.metadata["link"] = link
    chunk.metadata["name"] = name
    metadata.append(chunk.metadata)
  return metadata

EECS_metadata = create_metadata_with_link(chunks,"EECS Student handbook",EECS_link)

print(len(EECS_metadata))




217


In [None]:
#cleaning the database due to re-adding chunks after fixing  chunking function
vectors.delete(delete_all = True, namespace="General")

In [None]:
vectors.add_documents(documents=chunks, ids=EECS_ids, namespace = "General")

['EECS_Student_handbook_0',
 'EECS_Student_handbook_1',
 'EECS_Student_handbook_2',
 'EECS_Student_handbook_3',
 'EECS_Student_handbook_4',
 'EECS_Student_handbook_5',
 'EECS_Student_handbook_6',
 'EECS_Student_handbook_7',
 'EECS_Student_handbook_8',
 'EECS_Student_handbook_9',
 'EECS_Student_handbook_10',
 'EECS_Student_handbook_11',
 'EECS_Student_handbook_12',
 'EECS_Student_handbook_13',
 'EECS_Student_handbook_14',
 'EECS_Student_handbook_15',
 'EECS_Student_handbook_16',
 'EECS_Student_handbook_17',
 'EECS_Student_handbook_18',
 'EECS_Student_handbook_19',
 'EECS_Student_handbook_20',
 'EECS_Student_handbook_21',
 'EECS_Student_handbook_22',
 'EECS_Student_handbook_23',
 'EECS_Student_handbook_24',
 'EECS_Student_handbook_25',
 'EECS_Student_handbook_26',
 'EECS_Student_handbook_27',
 'EECS_Student_handbook_28',
 'EECS_Student_handbook_29',
 'EECS_Student_handbook_30',
 'EECS_Student_handbook_31',
 'EECS_Student_handbook_32',
 'EECS_Student_handbook_33',
 'EECS_Student_handbook_

In [50]:
question = "what modules are in  year 1 of computer science Bsc(hons)"

x = vectors.similarity_search_with_score(question, k = 4, namespace = "General")
#print(x[0][1])
Generalcontext = vectors.similarity_search_with_score(question, k = 10, namespace = "General")

allcontext = Generalcontext
allcontext = [item for item in allcontext if item[1] > 0.5]
allcontext = sorted(allcontext, key = lambda x: x[1],reverse = True)
context = []
if len(allcontext) > 0:
    if 4 > len(allcontext):
        context = [allcontext[i][0] for i in range(len(allcontext))]
    else:
        context = [allcontext[i][0] for i in range(len(4))]


print(len(context))

print(context[0],[1])


1
page_content='overall pass for the module, but each module has its own specific assessment requirement.  
Project modules are usually assessed by means of a project report and an oral examination (viva), including a demonstration of system  
software where appropriate. Students should refer to the module web pages or ask the module organiser for details of the way in which  
different coursework elements in a module contribute to the final module assessment.  
**Module assessment hurdles**' metadata={'header1': '6. Writing and Assessment (web)', 'header2': '6.1. Assessment types', 'link': 'https://qmplus.qmul.ac.uk/mod/book/tool/print/index.php?id=2692210', 'name': 'EECS Student handbook'} [1]


Problem: duplicates were produced during the chunking process. This was due to a temporary store being persistent throughout the process. The problem was solved by clearing the store every pass

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
temp = "please answer the following question {question} using the context {context}. You are an assistant in Queen mary's university of london and you are talking to a student, please use a friendly demeanor."
prompt = ChatPromptTemplate.from_template(temp)


In [None]:
from langchain_openai import ChatOpenAI

llmodel = ChatOpenAI(model_name = 'gpt-4o', api_key=OPEN_AI_APIKEY)
chain = prompt | llmodel | StrOutputParser()

b = chain.invoke({"context":x, "question":question})
print(b + "\n more information at: " + x[0].metadata["link"])