In [35]:
import PyPDF2
import re
import pickle

## Extract the content -- helper function

In [36]:
# helper function to extract text and store metadata about the book and page number
# Note: page_number is the page's 1-based position in the PDF file (what a PDF viewer shows), not the page number printed in the book.
def extract_text_from_pdf(pdf_path,book_name):
    """Read a PDF and return one record per page.

    Args:
        pdf_path: Path of the PDF file to open.
        book_name: Label stored in every record's 'book_name' field.

    Returns:
        A list of dicts with the keys 'book_name', 'page_number' (1-based
        position of the page within the file) and 'text' (the extracted
        text, or "" when extraction yields nothing).
    """
    with open(pdf_path,"rb") as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Pages are read while the file handle is still open.
        return [
            {'book_name': book_name,
             'page_number': position,
             'text': pdf_page.extract_text() or ""}
            for position, pdf_page in enumerate(pdf_pages, start=1)
        ]





    

## textbook extraction

In [37]:
# Relative paths of the three source PDFs.
text_path_1 = "Textbooks/Introduction to Autonomous Mobile Robots book.pdf"
text_path_2 = "Textbooks/Introduction-to-Robotics-3rd-edition.pdf"
text_path_3 = "Textbooks/mataric-primer.pdf"
# Extract the per-page records for the first textbook; the other two are
# extracted in the following cells.
textbook_1 = extract_text_from_pdf(text_path_1,"Introduction to Autonomous Mobile Robots book")

In [38]:
# Number of page records extracted from the first textbook.
print(len(textbook_1))

336


In [39]:
# Extract the second textbook and report how many page records it produced.
textbook_2 = extract_text_from_pdf(text_path_2,"Introduction-to-Robotics-3rd-edition")
print(len(textbook_2))

408


In [40]:
# Extract the third textbook and report how many page records it produced.
textbook_3 = extract_text_from_pdf(text_path_3,"mataric-primer")
print(len(textbook_3))

323


## Data chunking

In [41]:
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import nltk
# Download the Punkt tokenizer data needed by the NLTK tokenizers imported above.
# NOTE(review): newer NLTK releases appear to load the 'punkt_tab' resource
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/haridevaraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:

#helper function to clean text
def clean_testbook(text):
    """Normalize the whitespace of a string.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is removed. Punctuation and
    all other characters are left untouched.

    Args:
        text: The raw string to clean.

    Returns:
        The whitespace-normalized string.
    """
    # A single pass is enough: once each whitespace run has become one space,
    # no run of two or more spaces can remain to be collapsed.
    return re.sub(r'\s+',' ',text).strip()

#function to chunk text with metadata
def textbook_chunking(textbook,chunk_size=100):
    """Split page records into sentence-aligned chunks.

    Each page's text is cleaned with ``clean_testbook`` and split into
    sentences with ``sent_tokenize``. Consecutive sentences are grouped for as
    long as their combined ``word_tokenize`` token count stays within
    ``chunk_size``. Pages are processed independently, so a chunk never spans
    two pages. A single sentence longer than ``chunk_size`` is kept whole as
    its own chunk.

    Args:
        textbook: Iterable of dicts with the keys 'book_name', 'page_number'
            and 'text'.
        chunk_size: Token budget per chunk.

    Returns:
        A list of dicts with the keys 'book_name', 'page_number' and
        'text_chunk', in reading order.
    """
    chunks_with_metadata = []

    def flush(book, page, parts):
        # Record the gathered sentences as one chunk; skip when nothing was gathered.
        if parts:
            chunks_with_metadata.append({
                'book_name': book,
                'page_number': page,
                'text_chunk': " ".join(parts).strip()
            })

    for entry in textbook:
        book_name, page_number = entry['book_name'], entry['page_number']
        cleaned = clean_testbook(entry['text'])

        pending = []        # sentences gathered for the chunk being built
        pending_tokens = 0  # token count of those sentences
        for sentence in sent_tokenize(cleaned):
            n_tokens = len(word_tokenize(sentence))
            if pending_tokens + n_tokens <= chunk_size:
                # still within budget: keep growing the current chunk
                pending.append(sentence)
                pending_tokens += n_tokens
                continue
            # budget would be exceeded: close the current chunk and start a
            # new one with this sentence
            flush(book_name, page_number, pending)
            pending = [sentence]
            pending_tokens = n_tokens

        # whatever is left at the end of the page becomes the last chunk
        flush(book_name, page_number, pending)

    return chunks_with_metadata
                
        
        
    
    
    







In [43]:
# Chunk each textbook separately, using the default chunk_size of 100.
textbook_chunk_1 = textbook_chunking(textbook_1)
textbook_chunk_2 = textbook_chunking(textbook_2)
textbook_chunk_3 = textbook_chunking(textbook_3)
    


In [44]:
# Concatenate the chunk lists of the three textbooks into a single list.
combined_textbook_chunk = textbook_chunk_1 + textbook_chunk_2 + textbook_chunk_3

In [None]:
# Total number of chunks across the three textbooks.
len(combined_textbook_chunk)

4497

In [None]:
#helper function to embed text chunks with metadata
def embed_text_chunks(textbook_chunk_metadata,model):
    """Attach an embedding to every chunk record, in place.

    Args:
        textbook_chunk_metadata: Iterable of dicts, each holding a
            'text_chunk' entry.
        model: Object exposing ``encode(text)``; whatever it returns is stored
            unchanged under the 'embedding' key.

    Returns:
        The same ``textbook_chunk_metadata`` object that was passed in, with
        each record now carrying an 'embedding' entry.
    """
    for record in textbook_chunk_metadata:
        record['embedding'] = model.encode(record['text_chunk'])
    return textbook_chunk_metadata

## save the textbook chunks as pickle files 

In [None]:
#pickle file helper functions
def save_chunk_to_pickle(embedding_with_metadata,file_path):
    """Serialize ``embedding_with_metadata`` to ``file_path`` with pickle.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten.
    """
    with open(file_path,'wb') as out_file:
        pickle.dump(embedding_with_metadata,out_file)

def load_chunk_to_pickle(file_path):
    """Load and return the object pickled at ``file_path``.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.
    """
    with open(file_path,'rb') as in_file:
        return pickle.load(in_file)

In [45]:
# Display the combined chunk records.
# NOTE(review): this renders the entire list; consider showing only a slice
# (e.g. the first few records) to keep the notebook output small.
combined_textbook_chunk

[{'book_name': 'Introduction to Autonomous Mobile Robots book',
  'page_number': 1,
  'text_chunk': 'Autonomous Mobile RobotsIntroduction toRoland Illah R.SIEGWART NOURBAKHSH Autonomous Mobile Robots SIEGWART and NOURBAKHSHIntroduction to Introduction to Autonomous Mobile Robots Roland Siegwart and Illah R. Nourbakhsh Mobile robots range from the teleoperated Sojourner on the Mars Pathfinder mission to cleaning robots in the Paris Metro. Introduction to Autonomous Mobile Robots offers students and other interested readers an overview of the technology of mobility—the mechanisms that allow a mobile robot to movethrough a real world environment to perform its tasks—including locomotion,sensing, localization, and motion planning.'},
 {'book_name': 'Introduction to Autonomous Mobile Robots book',
  'page_number': 1,
  'text_chunk': 'It discusses all facets of mobile robotics,including hardware design, wheel design, kinematics analysis, sensors and per-ception, localization, mapping, and ro

In [46]:

# Write the combined chunk list to a pickle file at a relative path.
save_chunk_to_pickle(combined_textbook_chunk,'combined_textbook_chunk_metadata.pkl')

