In [75]:
import io
import re
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Step 1 - Extract Text from a PDF Opened in Binary Mode

In [76]:
# Read the PDF's raw bytes and parse them from an in-memory buffer, so `reader`
# does not depend on the file handle once the `with` block has closed it.
with open('sample1.pdf', 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_file.read()))

# Extract each page in order and join once, rather than indexing pages by
# number and growing the string with `+=` on every iteration.
# Pages are concatenated with no separator between them.
text = ''.join(page.extract_text() for page in reader.pages)
text



In [77]:
# Swap every tab and newline character for a single space.
# Each match is replaced individually, so runs of them become runs of spaces.
whitespace_pattern = re.compile(r'[\t\n]')
cleaned_text = whitespace_pattern.sub(' ', text)

cleaned_text




In [78]:
# Number of characters in the cleaned text, shown as a quick size check.
len(cleaned_text)

10525

# Step 2 - Chunk the Text Data

In [79]:
# Splitter settings kept together so they are easy to spot and tune.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm against the LangChain documentation.
splitter_settings = {
    "chunk_size": 1000,    # upper bound on the size of each chunk
    "chunk_overlap": 20,   # amount shared between neighbouring chunks
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the cleaned text into a list of string chunks.
chunks = text_splitter.split_text(cleaned_text)

# Display the chunks together with how many were produced.
(chunks, len(chunks))

(['Future Vignette Dixit FUTURE VIGNETTE Aryan Dixit From BANGALORE, INDIA Date 10/10/2040 Hi Lex! How have you been? I’m sorry I haven’ t been in SD for a while: I’ve run out of flight carbon credits for this year. The weather ’s also been weird. It’s been torrential rain for the past few weeks, and without the Metro, I’d have been cut off from the world for a good while. Good thing I’ve been working from home anyway ever since they returned to online school for my son. He’s doing excellent - he loves online school given that it’s his first time. But my wife is getting us flashbacks from the 2020 pandemic. We’re hoping to take this extended climate leave to visit her parents but as you know the carbon credit market has become really expensive after the Chinese reforms. Down here, I’ve been trying to get commitments from politicians and businessmen for the Rural Coast Protection Alliance. Not every exposed settlement in India is a megapolis like Mumbai - and the deflected sea has pushe

# Step 3 - Create vector embeddings for the chunks

In [80]:
# Identifier of the sentence-transformers model used to embed the chunks.
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
transformer_model = SentenceTransformer(embedding_model_name)

# Encode every chunk in one call; the result is displayed below.
embeddings = transformer_model.encode(chunks)

embeddings

array([[ 0.01991415,  0.12412882, -0.02452128, ...,  0.03452208,
        -0.04589315, -0.05247775],
       [ 0.01163632,  0.06817073, -0.01341491, ...,  0.03701388,
        -0.02111758, -0.05861896],
       [-0.00487314,  0.08376364, -0.01711119, ...,  0.02850831,
        -0.03659141, -0.03194457],
       ...,
       [-0.0424286 ,  0.07058449,  0.0071744 , ..., -0.02234123,
        -0.05378294, -0.00617511],
       [-0.01390471,  0.1200045 , -0.01092025, ..., -0.01008429,
         0.00384011, -0.01601044],
       [-0.00042852,  0.0714843 , -0.00043344, ...,  0.02423266,
        -0.01087992,  0.00616978]], dtype=float32)