## Using PyPDF2

In [10]:
import PyPDF2

def extract_pdf_metadata(pdf_path):
    """Read the document-information metadata of a PDF with PyPDF2.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        dict: A plain-dict copy of ``reader.metadata`` with the keys and
        values exactly as PyPDF2 exposes them. An empty dict is returned
        when the reader reports no metadata at all.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # ``reader.metadata`` is None when the PDF carries no document
        # information dictionary; calling ``.items()`` on it would raise
        # AttributeError, so report "no metadata" as an empty dict instead.
        metadata = reader.metadata
        if metadata is None:
            return {}

        # Copy into a regular dictionary while the file is still open.
        metadata_dict = {key: value for key, value in metadata.items()}

    return metadata_dict

# Example usage: extract the metadata of one local PDF with the PyPDF2-based
# helper defined in this cell and print each entry as a "key: value" line.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# point it at a PDF that exists locally before re-running this cell.
pdf_path = r"D:\Buku Pembelajaran Artificial Intelligence\Chip Huyen - Designing Machine Learning Systems_ An Iterative Process for Production-Ready Applications-O'Reilly Media (2022).pdf"
metadata = extract_pdf_metadata(pdf_path)
for key, value in metadata.items():
    print(f'{key}: {value}')


/Author: Huyen, Chip;
/CreationDate: D:20220517135306Z
/Creator: AH CSS Formatter V7.1 MR2 for Linux64 : 7.1.3.50324 (2021-04-26T09:47+09)
/ModDate: D:20220517104004-04'00'
/Producer: Antenna House PDF Output Library 7.1.1639
/Title: Designing Machine Learning Systems
/EBX_PUBLISHER: O'Reilly Media, Incorporated


## Using PyMuPDF

In [9]:
import fitz  # PyMuPDF

def extract_pdf_metadata(pdf_path):
    """Read the metadata dictionary of a document with PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the document to open.

    Returns:
        dict: A copy of ``document.metadata``. An empty dict is returned
        when PyMuPDF reports no metadata (``None`` or an empty mapping), so
        callers can always iterate over ``.items()``.
    """
    # The context manager closes the document on every exit path, including
    # when reading the metadata raises.
    with fitz.open(pdf_path) as document:
        metadata = document.metadata
        # Copy so the returned mapping is independent of the closed document.
        return dict(metadata) if metadata else {}

# Example usage: extract the metadata of one local PDF with the PyMuPDF-based
# helper defined in this cell and print each entry as a "key: value" line.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# point it at a PDF that exists locally before re-running this cell.
pdf_path = r"D:\Buku Pembelajaran Artificial Intelligence\Chip Huyen - Designing Machine Learning Systems_ An Iterative Process for Production-Ready Applications-O'Reilly Media (2022).pdf"
metadata = extract_pdf_metadata(pdf_path)
for key, value in metadata.items():
    print(f'{key}: {value}')


format: PDF 1.6
title: Designing Machine Learning Systems
author: Huyen, Chip;
subject: 
keywords: 
creator: AH CSS Formatter V7.1 MR2 for Linux64 : 7.1.3.50324 (2021-04-26T09:47+09)
producer: Antenna House PDF Output Library 7.1.1639
creationDate: D:20220517135306Z
modDate: D:20220517104004-04'00'
trapped: 
encryption: None


## Using PDFMiner

In [11]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

def _decode_pdf_text(raw):
    """Decode a raw PDF metadata byte string to ``str`` without raising.

    Strings starting with the UTF-16BE byte-order mark (``FE FF``) are decoded
    as UTF-16BE. Everything else is tried as UTF-8 first, which keeps the
    result unchanged for values that already decoded cleanly, and falls back
    to Latin-1, which accepts any byte sequence.
    NOTE(review): Latin-1 is only an approximation of PDFDocEncoding.
    """
    if raw.startswith(b'\xfe\xff'):
        return raw[2:].decode('utf-16-be', errors='replace')
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')


def extract_pdf_metadata(pdf_path):
    """Read the document-information metadata of a PDF with pdfminer.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        dict: The first entry of ``document.info`` with every ``bytes`` value
        decoded to ``str``; non-bytes values are passed through unchanged.
        An empty dict is returned when ``document.info`` is empty.
    """
    with open(pdf_path, 'rb') as file:
        parser = PDFParser(file)
        document = PDFDocument(parser)

        # ``document.info`` is a list of metadata dictionaries, usually with
        # a single entry. It is empty when the file has none, in which case
        # indexing ``[0]`` would raise IndexError.
        if not document.info:
            return {}
        metadata = document.info[0]

        # Convert to a regular dictionary, decoding byte strings defensively
        # so a non-UTF-8 value cannot abort the whole extraction.
        metadata_dict = {
            key: _decode_pdf_text(value) if isinstance(value, bytes) else value
            for key, value in metadata.items()
        }

    return metadata_dict

# Example usage: extract the metadata of one local PDF with the pdfminer-based
# helper defined in this cell and print each entry as a "key: value" line.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# point it at a PDF that exists locally before re-running this cell.
pdf_path = r"D:\Buku Pembelajaran Artificial Intelligence\Chip Huyen - Designing Machine Learning Systems_ An Iterative Process for Production-Ready Applications-O'Reilly Media (2022).pdf"
metadata = extract_pdf_metadata(pdf_path)
for key, value in metadata.items():
    print(f'{key}: {value}')


Author: Huyen, Chip;
CreationDate: D:20220517135306Z
Creator: AH CSS Formatter V7.1 MR2 for Linux64 : 7.1.3.50324 (2021-04-26T09:47+09)
ModDate: D:20220517104004-04'00'
Producer: Antenna House PDF Output Library 7.1.1639
Title: Designing Machine Learning Systems
EBX_PUBLISHER: O'Reilly Media, Incorporated


## Parse to Dictionary

In [14]:
import re

# Sample outline used as parser input: numbered lines ("1. ...") are topics,
# and the indented "* ..." lines beneath each one are its sub-topics.
text = """
1. Introduction Machine Learning 
   * pengenalan konsep machine learning
   * Pembelajaran lanjut tentang machine learning
2. Topic 2
   * Sub topic 1 from topic 2
   * Sub topic 2 from topic 2
3. Topic 3
   * Sub topic 1 from topic 3
   * Sub topic 2 from topic 3
4. Topic 4
   * sub topik 4   
"""

def parse_text_to_dict(text):
    """Turn a numbered outline into a ``{topic: [sub-topics]}`` dictionary.

    Each line is stripped of surrounding whitespace before matching.
    A line of the form ``<digits>. <title>`` starts a new topic with an empty
    sub-topic list (replacing any earlier topic with the same title). A line
    of the form ``* <item>`` is appended to the most recent topic. Bullets
    that appear before any topic, and lines matching neither form, are
    ignored.

    Args:
        text: The outline as a single newline-separated string.

    Returns:
        dict: Topic titles, in order of appearance, mapped to their lists of
        sub-topic strings.
    """
    heading_re = re.compile(r'^\d+\.\s+(.*)$')
    bullet_re = re.compile(r'^\*\s+(.*)$')

    outline = {}
    active_heading = None

    for raw_line in text.strip().split('\n'):
        stripped = raw_line.strip()

        # A numbered heading opens a fresh topic and ends handling of the line.
        heading = heading_re.match(stripped)
        if heading is not None:
            active_heading = heading.group(1)
            outline[active_heading] = []
            continue

        # Bullets only count once some topic has been opened.
        bullet = bullet_re.match(stripped)
        if bullet is not None and active_heading:
            outline[active_heading].append(bullet.group(1))

    return outline

# Parse the sample outline held in `text` and print the resulting
# {topic: [sub-topics]} dictionary.
topics = parse_text_to_dict(text)
print(topics)

{'Introduction Machine Learning': ['pengenalan konsep machine learning', 'Pembelajaran lanjut tentang machine learning'], 'Topic 2': ['Sub topic 1 from topic 2', 'Sub topic 2 from topic 2'], 'Topic 3': ['Sub topic 1 from topic 3', 'Sub topic 2 from topic 3'], 'Topic 4': ['sub topik 4']}


## CrewAI PDF Search

In [2]:
import os
from crewai import Agent, Task, Crew, Process
from crewai_tools import PDFSearchTool

# Set up environment variables (Replace with your actual keys)
# os.environ["SERPER_API_KEY"] = "Your Serper API Key"
# os.environ["OPENAI_API_KEY"] = "Your OpenAI API Key"

from crewai_tools import PDFSearchTool



# Initialize the tool with a specific PDF path for exclusive search within that document.
# The recorded output of this cell shows an "Inserting batches in chromadb"
# progress bar, so the PDF content is indexed when the cell runs.
# NOTE(review): the path is an absolute, machine-specific Windows path; adjust
# it before re-running on another machine.
tool = PDFSearchTool(pdf=r'D:\Project Multimedika\Documents-Summarizer-and-Question-Makers\resources\file.pdf')

# Create the agent: a single "PDF Researcher" persona that is given the PDF
# search tool. `verbose=True` matches the step-by-step trace visible in the
# recorded output.
# NOTE(review): presumably an LLM API key must already be set in the
# environment (the assignments near the imports are commented out) - confirm.
pdf_researcher = Agent(
    role='PDF Researcher',
    goal='Extract specific information from a PDF document.',
    verbose=True,
    memory=True,
    backstory=(
        "You have a keen eye for detail and excel at finding specific information "
        "within lengthy documents. Your expertise is in navigating and extracting "
        "relevant data from PDFs."
    ),
    tools=[tool]
)

# Create the task: search the PDF for the term 'AI advancements' and summarize
# the matching sections. The same tool instance is passed here as well as to
# the agent above.
pdf_search_task = Task(
    description=(
        "Search for the term 'AI advancements' within the provided PDF document. "
        "Extract and summarize the sections where this term appears. "
        "Your final output should be a concise summary of these sections."
    ),
    expected_output='A summary of the sections containing the term "AI advancements".',
    tools=[tool],
    agent=pdf_researcher
)

# Form the crew: one agent, one task, run with the sequential process.
crew = Crew(
    agents=[pdf_researcher],
    tasks=[pdf_search_task],
    process=Process.sequential
)

# Kick off the process and print whatever `kickoff()` returns.
result = crew.kickoff()
print(result)


Inserting batches in chromadb: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]




[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mTo accomplish the task of searching for the term 'AI advancements' within the provided PDF document, I need to use the "Search a PDF's content" tool. My next step is to perform the search and review the results.

Action: Search a PDF's content
Action Input: {"query": "AI advancements"}[0m[95m 

Relevant Content:
real world , Deloitte, 2017. 4 The state of AI in 2020 , McKinsey, 2020.

products and technologies.

Executive summary Across industries, DevOps and DataOps have been widely adopted as methodologies to improve quality and re - duce the time to market of software engineering and data engineering initiatives. With the rapid growth in machine learning (ML) systems, similar approaches need to be developed in the context of ML engineering, which handle the unique complexities of the practical applications of ML. This is the domain of MLOps. MLOps is a set of standard - ized processes and technology capabilities for 