Extract Text from PDF and Segment by ToC

In [3]:
import fitz  # PyMuPDF
import re

# Function to detect headings and segment the PDF
def extract_pdf_text_by_headings(pdf_path, heading_pattern=r'(CHAPTER|SECTION)\s+\d+'):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Dictionary to store the sections and their corresponding text
    sections = {}
    current_section = None
    current_text = []
    
    # Set to track seen headings and avoid duplicates
    seen_headings = set()

    # Compile the heading pattern
    heading_regex = re.compile(heading_pattern, re.IGNORECASE)

    # Iterate through each page of the document
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text = page.get_text()

        # Split text into lines and check for headings
        for line in text.splitlines():
            line = line.strip()
            
            # If a heading is found, start a new section
            if heading_regex.match(line) and line not in seen_headings:
                # Save previous section text
                if current_section and current_text:
                    sections[current_section] = ' '.join(current_text)
                    current_text = []

                # Mark the heading as seen and start the new section
                current_section = line
                seen_headings.add(line)  # Add to the seen set
                print(f"Found unique heading: {current_section}")
            else:
                current_text.append(line)
    
    # Save the last section
    if current_section and current_text:
        sections[current_section] = ' '.join(current_text)
    
    doc.close()
    return sections




In [4]:
# Example usage
pdf_path = 'practice-standard-project-risk-management.pdf'  # Path to your PDF
sections = extract_pdf_text_by_headings(pdf_path)

# Print extracted sections
for section_title, section_content in sections.items():
    print(f"Extracted section: {section_title[:30]}...")  # Print first 30 characters of title for brevity

Found unique heading: CHAPTER 1 -  INTRODUCTION ...................................................................................................1
Found unique heading: CHAPTER 2 -  PRINCIPLES AND CONCEPTS ..............................................................................9
Found unique heading: CHAPTER 3 -  INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES .....................13
Found unique heading: CHAPTER 4 -  PLAN RISK MANAGEMENT ...............................................................................19
Found unique heading: CHAPTER 5 -  IDENTIFY RISKS ................................................................................................25
Found unique heading: CHAPTER 6 -  PERFORM QUALITATIVE RISK ANALYSIS ..........................................................31
Found unique heading: CHAPTER 7 -  PERFORM QUANTITATIVE RISK ANALYSIS .......................................................37
Found unique heading: CHAPTER 8 -  PLAN RISK RESPONSES ...

In [5]:
import pandas as pd
# Convert the sections dictionary into a Pandas DataFrame
df = pd.DataFrame(list(sections.items()), columns=['Section Title', 'Section Content'])

# Display the DataFrame
print(df)

# Optionally, save the DataFrame to a CSV file
df.to_csv('extracted_sections.csv', index=False)

                                        Section Title  \
0   CHAPTER 1 -  INTRODUCTION .......................   
1   CHAPTER 2 -  PRINCIPLES AND CONCEPTS ............   
2   CHAPTER 3 -  INTRODUCTION TO PROJECT RISK MANA...   
3   CHAPTER 4 -  PLAN RISK MANAGEMENT ...............   
4   CHAPTER 5 -  IDENTIFY RISKS .....................   
5   CHAPTER 6 -  PERFORM QUALITATIVE RISK ANALYSIS...   
6   CHAPTER 7 -  PERFORM QUANTITATIVE RISK ANALYSI...   
7   CHAPTER 8 -  PLAN RISK RESPONSES ................   
8   CHAPTER 9 -  MONITOR AND CONTROL RISKS ..........   
9                                           CHAPTER 1   
10                           CHAPTER 1 − INTRODUCTION   
11  Chapter 11 of the  PMBOK  ®  Guide –  Fourth E...   
12                                          CHAPTER 2   
13                CHAPTER 2 − PRINCIPLES AND CONCEPTS   
14                                          CHAPTER 3   
15  CHAPTER 3 − INTRODUCTION TO PROJECT RISK MANAG...   
16                             

In [6]:
print(df.describe())

                                            Section Title  \
count                                                  29   
unique                                                 29   
top     CHAPTER 1 -  INTRODUCTION .......................   
freq                                                    1   

                                          Section Content  
count                                                  29  
unique                                                 29  
top     Project Management Institute PRACTICE STANDARD...  
freq                                                    1  


In [7]:
print(df.head())

                                       Section Title  \
0  CHAPTER 1 -  INTRODUCTION .......................   
1  CHAPTER 2 -  PRINCIPLES AND CONCEPTS ............   
2  CHAPTER 3 -  INTRODUCTION TO PROJECT RISK MANA...   
3  CHAPTER 4 -  PLAN RISK MANAGEMENT ...............   
4  CHAPTER 5 -  IDENTIFY RISKS .....................   

                                     Section Content  
0  Project Management Institute PRACTICE STANDARD...  
1  2.1 Introduction ................................  
2  3.1 Project Risk Management and Project Manage...  
3  4.1 Purpose and Objectives of the Plan Risk Ma...  
4  5.1 Purpose and Objectives of the Identify Ris...  


In [10]:
print(sections.keys())

dict_keys(['CHAPTER 1 -  INTRODUCTION ...................................................................................................1', 'CHAPTER 2 -  PRINCIPLES AND CONCEPTS ..............................................................................9', 'CHAPTER 3 -  INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES .....................13', 'CHAPTER 4 -  PLAN RISK MANAGEMENT ...............................................................................19', 'CHAPTER 5 -  IDENTIFY RISKS ................................................................................................25', 'CHAPTER 6 -  PERFORM QUALITATIVE RISK ANALYSIS ..........................................................31', 'CHAPTER 7 -  PERFORM QUANTITATIVE RISK ANALYSIS .......................................................37', 'CHAPTER 8 -  PLAN RISK RESPONSES ...................................................................................43', 'CHAPTER 9 -  MONITOR AND CONTROL RISKS ..................

In [11]:
import spacy
import nltk
from nltk.tokenize import word_tokenize

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Example list of known expressions related to project management
known_expressions = ["project risk", "risk management", "risk assessment", "stakeholder", "project schedule"]

# Tokenization function based on expression
def tokenize_expressions(text, expressions=known_expressions):
    # Convert text to lower case
    text = text.lower()
    
    # Replace known multi-word expressions with a single token (joining words by '_')
    for expr in expressions:
        text = text.replace(expr, expr.replace(' ', '_'))
    
    # Apply standard spaCy NLP pipeline
    doc = nlp(text)
    
    # Extract meaningful tokens (noun chunks, proper nouns, verbs)
    tokens = [chunk.text for chunk in doc.noun_chunks]  # Expression-based tokenization
    tokens += [token.lemma_ for token in doc if token.pos_ in ['VERB', 'NOUN']]
    
    return tokens




In [12]:
# Example: Tokenize one of the sections
section_text = sections['CHAPTER 1 -  INTRODUCTION ...................................................................................................1']  # Replace with actual section title
tokens = tokenize_expressions(section_text)

# Print tokenized expressions
print(f"Tokenized expressions: {tokens}")

Tokenized expressions: ['project management institute practice standard', 'project_risk_management isbn', ':  project management institute', 'inc', '14 campus boulevard  newtown square', 'pennsylvania 19073-3299 usa', 'phone', '+610-356-4600  fax', '+610-356-4647  e', '-', 'mail', 'customercare@pmi.org internet', 'inc', 'pmi', 'the pmi logo', 'the pmp logo', 'pmbok', 'pgmp”, “project management journal', '“pm network', 'the pmi today logo', 'registered marks', 'project management institute', 'inc', 'the quarter globe design', 'a trademark', 'the project management institute', 'inc', '.', 'a comprehensive list', 'pmi marks', 'the pmi legal department', 'pmi publications', 'corrections', 'comments', 'its books', 'comments', 'formatting', 'other errors', 'a copy', 'the relevant page', 'the book', 'the error', 'it', 'book editor', 'pmi publications', '14 campus boulevard', 'newtown square', 'discounts', 'resale or educational purposes', 'the pmi book service center', 'ga', 'the u.s', '.', 

In [13]:
# Combine text extraction and tokenization for all sections
all_tokens = {}

for section_title, section_text in sections.items():
    print(f"Processing section: {section_title}")
    tokens = tokenize_expressions(section_text)
    all_tokens[section_title] = tokens

# Print the tokenized expressions for each section
for section_title, tokens in all_tokens.items():
    print(f"Section: {section_title}")
    print(f"Tokens: {tokens[:10]}...")  # Displaying only first 10 tokens for brevity


Processing section: CHAPTER 1 -  INTRODUCTION ...................................................................................................1
Processing section: CHAPTER 2 -  PRINCIPLES AND CONCEPTS ..............................................................................9
Processing section: CHAPTER 3 -  INTRODUCTION TO PROJECT RISK MANAGEMENT PROCESSES .....................13
Processing section: CHAPTER 4 -  PLAN RISK MANAGEMENT ...............................................................................19
Processing section: CHAPTER 5 -  IDENTIFY RISKS ................................................................................................25
Processing section: CHAPTER 6 -  PERFORM QUALITATIVE RISK ANALYSIS ..........................................................31
Processing section: CHAPTER 7 -  PERFORM QUANTITATIVE RISK ANALYSIS .......................................................37
Processing section: CHAPTER 8 -  PLAN RISK RESPONSES ...................