In [1]:
!pip install PyPDF2



In [2]:
import os
from PyPDF2 import PdfReader
import re

In [3]:
# Step 1: Extract text from the PDF

In [4]:
# Step 1: Extracting text from the PDF
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Extraction is best-effort and never raises: problems are reported with
    ``print`` and whatever text could be collected is still returned.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts concatenated in page order with no separator. An empty
        string when the file cannot be opened or no page yields any text.
    """
    page_texts = []
    try:
        reader = PdfReader(file_path)
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                # A page without a text layer may yield nothing; treat that as "".
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                # One unreadable page must not discard the pages that follow it.
                print(f"Error reading page {page_number} of {file_path}: {e}")
    except Exception as e:
        # Opening the file or walking its page list failed.
        print(f"Error reading {file_path}: {e}")
    # Joining once avoids repeated string concatenation inside the loop.
    return "".join(page_texts)

# Specify PDF file path (relative, so it is looked up from the notebook's working directory)
pdf_file_path = "Ng_MachineLearningYearning.pdf"
# pdf_text holds the raw extracted text; later cells read it under this name
pdf_text = extract_text_from_pdf(pdf_file_path)

# Confirmation message: shows at most the first 50 characters of the path, always followed by "..."
# NOTE: this prints even when extraction failed, because extract_text_from_pdf
# reports errors itself and still returns a string
print(f"Extracted text from {pdf_file_path[:50]}...")

Extracted text from Ng_MachineLearningYearning.pdf...


In [5]:
# Report how much text was extracted, then show either a warning or a short sample
print(f"Length of extracted text: {len(pdf_text)}")
if not pdf_text:
    print("No text was extracted. The PDF may not contain extractable text.")
else:
    # Only the first 500 characters are shown
    print(f"Preview of extracted text: {pdf_text[:500]}")


Length of extracted text: 161121
Preview of extracted text:  
 
 
 
 
 
 
Machine Learning Yearning is a
 
 
deeplearning.ai project.
 
 
 
 
 
 
 
 
 
 
 
© 2018 Andrew Ng. All Rights Reserved.
 
 
 
 
Page 2
Machine Learning Yearning-Draft
Andrew Ng
  
Table of Contents
 
 
1 Why Machine Learning Strategy
 
2 How to use this book to help your team
 
3 Prerequisites and Notation
 
4 Scale drives machine learning progress
 
5 Your development and test sets
 
6 Your dev and test sets should come from the same distribution
 
7 How large do the dev/test set


The text extraction step is complete. 
However, the preview reveals that there may be unnecessary whitespace or extra line breaks in the text.
This can be cleaned up during the preprocessing step.


In [7]:
import re

def preprocess_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    and leading/trailing whitespace is removed. Non-whitespace characters are
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        The whitespace-normalized string.
    """
    # Collapse first, then trim the single space that may remain at either end
    return re.sub(r"\s+", " ", text).strip()

# Preprocess the extracted text
# The result gets its own name, so the raw pdf_text stays available for comparison
cleaned_pdf_text = preprocess_text(pdf_text)

# Check the cleaned text: total length, then only the first 500 characters
print(f"Length of cleaned text: {len(cleaned_pdf_text)}")
print(f"Preview of cleaned text: {cleaned_pdf_text[:500]}")


Length of cleaned text: 153889
Preview of cleaned text: Machine Learning Yearning is a deeplearning.ai project. © 2018 Andrew Ng. All Rights Reserved. Page 2 Machine Learning Yearning-Draft Andrew Ng Table of Contents 1 Why Machine Learning Strategy 2 How to use this book to help your team 3 Prerequisites and Notation 4 Scale drives machine learning progress 5 Your development and test sets 6 Your dev and test sets should come from the same distribution 7 How large do the dev/test sets need to be? 8 Establish a single-number evaluation metric for you


The text is now more readable: line breaks, tabs, and repeated spaces have been collapsed into single spaces. Other characters (for example © or invisible zero-width spaces) are not removed by this step.
The content is structured, starting with the book's introduction and table of contents, and is now ready for chunking and creating embeddings.

In [9]:
# Step 3: Chunk the Text into Manageable Pieces

In [10]:
def chunk_text(text, chunk_size=500, overlap=0):
    """Split ``text`` into fixed-size character chunks.

    Chunks are cut purely by character count, so a boundary can fall in the
    middle of a word. Setting ``overlap`` lets neighbouring chunks share some
    characters, so text near a boundary appears intact in at least one chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the one before
            it. Must satisfy ``0 <= overlap < chunk_size``. The default of 0
            produces back-to-back chunks.

    Returns:
        A list of chunk strings in document order. The last chunk may be
        shorter than ``chunk_size``. An empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    # Distance between the start positions of two consecutive chunks
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a chunk reaches the end of the text; with an overlap, any
        # further chunk would only repeat characters already covered.
        if start + chunk_size >= len(text):
            break
    return chunks

# Split the cleaned text into chunks
chunk_size = 500  # characters per chunk (not words or tokens)
# text_chunks is the list later cells embed and look results up in
text_chunks = chunk_text(cleaned_pdf_text, chunk_size)

# Display the number of chunks and a preview of the first chunk
print(f"Total chunks created: {len(text_chunks)}")
print(f"First chunk preview: {text_chunks[0]}")


Total chunks created: 308
First chunk preview: Machine Learning Yearning is a deeplearning.ai project. © 2018 Andrew Ng. All Rights Reserved. Page 2 Machine Learning Yearning-Draft Andrew Ng Table of Contents 1 Why Machine Learning Strategy 2 How to use this book to help your team 3 Prerequisites and Notation 4 Scale drives machine learning progress 5 Your development and test sets 6 Your dev and test sets should come from the same distribution 7 How large do the dev/test sets need to be? 8 Establish a single-number evaluation metric for you


In [11]:
# Step 4: Generate Embeddings for Text Chunks

In [12]:
!pip install transformers faiss-cpu torch




In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Load pre-trained model and tokenizer
# Both are module-level names: the embedding and query functions below read them directly
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # checkpoint identifier passed to both loaders
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to generate embeddings for each chunk
def generate_embeddings(text_chunks, batch_size=32):
    """Embed each text chunk with the module-level ``tokenizer`` and ``model``.

    Chunks are encoded in batches instead of one forward pass per chunk. A
    chunk's embedding is the mean of its token embeddings; padding positions
    are excluded through the attention mask. The result therefore matches
    encoding each chunk on its own, up to floating-point rounding.

    Args:
        text_chunks: Iterable of strings to embed.
        batch_size: Number of chunks per forward pass; must be at least 1.

    Returns:
        A float32 NumPy array of shape ``(number_of_chunks, hidden_size)``.
        For empty input the shape is ``(0, model.config.hidden_size)``, so
        callers can still read ``.shape[1]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    chunks = list(text_chunks)
    if not chunks:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    batch_embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state  # (batch, tokens, hidden)
            # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden axis
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            # Use the mean of the (non-padding) token embeddings as the chunk embedding
            pooled = (hidden * mask).sum(dim=1) / token_counts
        batch_embeddings.append(pooled.numpy())

    # FAISS expects float32 input
    return np.concatenate(batch_embeddings, axis=0).astype(np.float32, copy=False)

# Generate embeddings for all chunks
# Row i of `embeddings` belongs to text_chunks[i], because chunks are embedded in list order
embeddings = generate_embeddings(text_chunks)

# Sanity check: the row count should equal the number of chunks;
# the second dimension is the embedding width
print(f"Generated embeddings for {len(embeddings)} chunks.")
print(f"Shape of embeddings: {embeddings.shape}")

Generated embeddings for 308 chunks.
Shape of embeddings: (308, 384)


In [14]:
# Step 5: Build a FAISS Index

In [15]:
# We'll use the FAISS library to create an index for storing the embeddings. 
# FAISS allows us to perform fast similarity searches over the vector embeddings.

In [16]:
# Initialize FAISS index
dim = embeddings.shape[1]  # 384 dimensions per embedding
# NOTE(review): FAISS documents IndexFlatL2 results as squared L2 distances — confirm before interpreting the values
index = faiss.IndexFlatL2(dim)  # L2 distance (Euclidean distance)

# Add embeddings to the FAISS index
# Vectors are added in chunk order, so a returned id i refers to text_chunks[i]
index.add(embeddings)

# Check the number of vectors in the index (should equal the number of chunks)
print(f"Number of vectors in FAISS index: {index.ntotal}")

Number of vectors in FAISS index: 308


In [18]:
# Saving FAISS index
def save_faiss_index(index, chunks, filename="faiss_index.pkl"):
    """Pickle the FAISS index together with its text chunks into one file.

    The file contains a dict with two keys: ``"index"`` (the index object)
    and ``"chunks"`` (the chunk list), so the ids returned by a search can be
    mapped back to text after reloading.

    Args:
        index: The FAISS index to persist.
        chunks: The text chunks the index was built from, in the same order.
        filename: Destination path; an existing file is overwritten.

    Security note: the result is a pickle file. Only load it again from a
    location you trust, because unpickling can execute arbitrary code.
    """
    # Imported here because no cell of the notebook imports pickle; without
    # this the function raises NameError on a fresh kernel.
    import pickle

    data = {
        "index": index,       # The FAISS index
        "chunks": chunks,     # The text chunks
    }
    with open(filename, "wb") as f:
        pickle.dump(data, f)
    print(f"FAISS index and metadata saved to {filename}")

# Persist the index together with the chunks it was built from (default file: faiss_index.pkl)
save_faiss_index(index, text_chunks)

FAISS index and metadata saved to faiss_index.pkl


In [None]:
#  Step 6: Querying the FAISS Index for Answers

Now we'll proceed with:
- Converting the question into an embedding.
- Searching the FAISS index for the top-k most similar chunks.
- Returning those chunks, together with their distances, as the answer.

In [24]:
def query_faiss(question, index, top_k=3, chunks=None):
    """Return the chunks whose embeddings are closest to ``question``.

    The question is embedded with ``generate_embeddings`` (the same routine
    used for the document chunks) and looked up in ``index``.

    Args:
        question: The query string.
        index: FAISS index holding one vector per chunk, in chunk order.
        top_k: Maximum number of chunks to return.
        chunks: Sequence that maps index ids to text. Defaults to the
            module-level ``text_chunks``.

    Returns:
        A tuple ``(similar_chunks, distances)``: the matching chunks, best
        match first, and a NumPy array with their distances (smaller means
        more similar). Both are shorter than ``top_k`` when the index holds
        fewer vectors, and empty when ``top_k`` is not positive or the index
        is empty.
    """
    if chunks is None:
        chunks = text_chunks

    # FAISS pads missing results with id -1, and chunks[-1] would silently
    # return the last chunk, so never ask for more neighbours than exist.
    k = min(int(top_k), int(index.ntotal))
    if k <= 0:
        return [], np.empty(0, dtype=np.float32)

    # Step 1: Generate embedding for the question (shape: 1 x dim)
    question_embedding = np.ascontiguousarray(
        generate_embeddings([question]), dtype=np.float32
    )

    # Step 2: Search the FAISS index for the k most similar chunks
    distances, indices = index.search(question_embedding, k)

    # Step 3: Retrieve the chunks, dropping any padded (-1) slots
    valid = indices[0] >= 0
    similar_chunks = [chunks[i] for i in indices[0][valid]]

    return similar_chunks, distances[0][valid]

# Test the query
question = "How to establish a single-number evaluation metric?"
top_k = 3
# results and distances are parallel: distances[i] belongs to results[i]
results, distances = query_faiss(question, index, top_k)

# Print every hit with its rank and distance; "..." is appended to each preview regardless of chunk length
print(f"Top {top_k} similar chunks for the question: '{question}'")
for i, (chunk, distance) in enumerate(zip(results, distances)):
    print(f"\nRank {i+1} (Distance: {distance:.2f}):\n{chunk[:300]}...")  # Preview first 300 characters


Top 3 similar chunks for the question: 'How to establish a single-number evaluation metric?'

Rank 1 (Distance: 17.63):
cy is an example of a ​ single-number evaluation metric ​ : You run your classifier on the dev set (or test set), and get back a single number about what fraction of examples it classified correctly. According to this metric, if classifier A obtains 97% accuracy, and classifier B obtains 90% accurac...

Rank 2 (Distance: 19.76):
valuation metrics makes it harder to compare algorithms. Suppose your algorithms perform as follows: Classifier Precision Recall A 95% 90% B 98% 85% Here, neither classifier is obviously superior, so it doesn’t immediately guide you toward picking one. Classifier Precision Recall F1 score A 95% 90% ...

Rank 3 (Distance: 20.95):
ne of the most common ways to combine multiple metrics into one. 4 If you want to learn more about the F1 score, see ​ https://en.wikipedia.org/wiki/F1_score ​ . It is the “harmonic mean” between Precision and Recall, 

The question is converted to an embedding using the same model, and FAISS will compare this question embedding to the embeddings of our document chunks.
This allows us to retrieve the most relevant chunks that are semantically close to the question, which we can use for further processing or answering.