In [2]:
import fitz  # PyMuPDF
import pandas as pd

In [3]:
def extract_text_from_pdf(pdf_path: str) -> pd.DataFrame:
    """
    Extract text from each page of a PDF file using PyMuPDF and return a pandas DataFrame.

    Pages whose extracted text is empty or whitespace-only are skipped, so the
    'Page Number' column may contain gaps. Page numbers are 1-based and the
    text of each page is stripped of leading/trailing whitespace.

    Errors are handled best-effort: the exception message is printed and an
    empty DataFrame is returned instead of raising.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        pd.DataFrame: DataFrame with two columns: 'Page Number' and 'Text'.
            Both columns are always present; the frame has zero rows when no
            page contained text or when an error occurred.
    """
    columns = ['Page Number', 'Text']
    try:
        pages_text = []
        # The context manager closes the document even when loading or
        # extracting a page raises part-way through the loop.
        with fitz.open(pdf_path) as pdf_document:
            for page_num in range(len(pdf_document)):
                text = pdf_document.load_page(page_num).get_text().strip()
                if text:  # Keep only pages that contain non-empty text
                    pages_text.append({'Page Number': page_num + 1, 'Text': text})

        # Passing `columns` keeps the schema stable even for an empty result,
        # so a CSV round-trip of the frame still has a header row.
        return pd.DataFrame(pages_text, columns=columns)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # frame with the documented columns rather than propagating.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=columns)

In [4]:
# Input PDF file path
pdf_file_path = "./bc_notes.pdf"

# Extract text and create a DataFrame
text_dataframe = extract_text_from_pdf(pdf_file_path)

# extract_text_from_pdf reports failures only by printing a message and
# returning an empty frame. Stop here in that case instead of overwriting the
# CSV with an empty file that would fail later when it is read back.
if text_dataframe.empty:
    raise ValueError(
        f"No text was extracted from {pdf_file_path!r}; extracted_text.csv was not written."
    )

# Save the DataFrame to a CSV file for reference
text_dataframe.to_csv("extracted_text.csv", index=False)

# Display the DataFrame
print(text_dataframe)

    Page Number                                               Text
0             1  Introduction to Blockchain - Day III + Day IV\...
1             2  Problem\nIn ScroogeCoin, suppose Mallory has b...
2             3  Challenges of DCP in Bitcoin\nThere are number...
3             4  How do nodes in P2P Network receive transactio...
4             5  Ordering I\nBlockchain transactions (or blocks...
..          ...                                                ...
64           65  Incentivization of Bitcoin\nTransaction Fees\n...
65           66  Problem with selection of node\nProblems to re...
66           67  Problem with selection of node\nProblems to re...
67           68  Problem with selection of node\nProblems to re...
68           69  Problem with selection of node\nProblems to re...

[69 rows x 2 columns]


In [7]:
# Read the saved CSV back from disk and list the text of every extracted page.
# NOTE(review): this prints the full text of all pages; consider a preview
# (e.g. .head()) if the output becomes too long to read.
df = pd.read_csv('./extracted_text.csv')
df.loc[:, "Text"].tolist()

['Introduction to Blockchain - Day III + Day IV\n8.01.2025/ 13.01.2025',
 'Problem\nIn ScroogeCoin, suppose Mallory has banking id of Alice and she\ncan login using that banking id provided she gets hold of the\npassword of Alice. Consider that password of Alice is an 8 digit\nnumber, each digit can be between 0 to 9. She tries generating the\npassword until it matches the password of Alice. How long will it\ntake before she succeeds, on average? What will happen if she can\ncrack the password?',
 'Challenges of DCP in Bitcoin\nThere are number of technical problems with the approach\nConsensus in general is a hard problem since nodes might\ncrash or be outright malicious\nIn the Bitcoin context, P2P network is highly imperfect.\nNot all pairs of nodes are connected to each other.\nThere could be faults in the network because of poor Internet\nconnectivity\nA lot of latency in the system because it’s distributed all over\nthe Internet.',
 'How do nodes in P2P Network receive transactio