In [1]:
!pip install crewai pypdf langchain langchain_community openai

Collecting crewai
  Downloading crewai-0.108.0-py3-none-any.whl.metadata (33 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting appdirs>=1.4.4 (from crewai)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting auth0-python>=4.7.1 (from crewai)
  Downloading auth0_python-4.9.0-py3-none-any.whl.metadata (9.0 kB)
Collecting chromadb>=0.5.23 (from crewai)
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting instructor>=1.3.3 (from crewai)
  Downloading instructor-1.7.8-py3-none-any.whl.metadata (22 kB)
Collecting json-repair>=0.25.2 (from crewai)
  Downloading json_repair-0.40.0-py3-none-any.whl.metadata (11 kB)
Collecting json5>=0.10.0 (from crewai)
  Downloading json5-0.11.0-py3-none-any.whl.metadata (35 kB)
Collecting jsonref>=1.1.0 (from crewai)
  Downloading jsonref-1.1.0-py3-none-any.whl.met

In [2]:
import os
from getpass import getpass

# Do not hardcode the key (or a placeholder that would overwrite a real one).
# Reuse a key already present in the environment; otherwise ask for it
# interactively without echoing it into the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [3]:
from crewai import Agent, Task, Crew, Process, LLM
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [4]:
# === Load PDF and Split ===
def load_pdf_content(path, chunk_size=1000, chunk_overlap=0):
    """Load a PDF and return its text as a single string.

    The file is read with ``PyPDFLoader``, the resulting documents are split
    with ``RecursiveCharacterTextSplitter``, and the chunk texts are joined
    with blank lines.

    Args:
        path: Location of the PDF; passed unchanged to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter. Defaults
            to 0 because the chunks are concatenated back together here, so
            any overlap would repeat text at every chunk boundary.

    Returns:
        str: The ``page_content`` of every chunk, separated by ``"\\n\\n"``.
    """
    pages = PyPDFLoader(path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)
    return "\n\n".join(chunk.page_content for chunk in chunks)

In [5]:
# === PDF Loading ===
# Read the source PDF once; later cells build the task prompt from pdf_text.
pdf_text = load_pdf_content(
    path="/content/PDF_JSON.pdf",
)



In [6]:
# Setup LLM
# Model handle shared by the agent defined below.
openai_llm = LLM(model="gpt-4o")

In [7]:
# === Define Agent ===
# Single agent responsible for turning the PDF text into JSON.
json_agent = Agent(
    llm=openai_llm,
    verbose=True,
    role="PDF Information Extractor",
    goal="Convert PDF data into structured JSON format",
    backstory="You're an expert in extracting structured data from messy documents.",
)

In [8]:
# === Define Task ===
# Only the first MAX_PROMPT_CHARS characters of the PDF text go into the
# prompt (presumably to bound prompt size -- confirm the intended limit).
# Report when a longer document is cut instead of dropping its tail silently.
MAX_PROMPT_CHARS = 4000
pdf_excerpt = pdf_text[:MAX_PROMPT_CHARS]
if len(pdf_text) > MAX_PROMPT_CHARS:
    print(
        f"Warning: PDF text is {len(pdf_text)} characters; only the first "
        f"{MAX_PROMPT_CHARS} are included in the task prompt."
    )

# The task embeds the excerpt directly in its description and asks for
# plain JSON with no markdown wrapping.
json_task = Task(
    description=f"""Extract structured data from this text and return a valid JSON.
Text:
{pdf_excerpt}

Output must be:
- Strict JSON (no markdown, no backticks)
- Format based on content (e.g., name, date, department if available)
""",
    expected_output="Valid JSON structure with key-value pairs from the text.",
    agent=json_agent
)

In [9]:
# === Define Crew ===
# One agent, one task, run with the sequential process.
json_crew = Crew(
    process=Process.sequential,
    tasks=[json_task],
    agents=[json_agent],
    verbose=True,
)

In [10]:
# === Run Agent ===
# Execute the crew defined above and keep whatever kickoff() returns so the
# next cell can display it. verbose=True on the crew and agent is what
# produces the trace printed below this cell.
json_result = json_crew.kickoff()

[1m[95m# Agent:[00m [1m[92mPDF Information Extractor[00m
[95m## Task:[00m [92mExtract structured data from this text and return a valid JSON.
Text:
Employee  Name:  Alice  Johnson    Position:  Software  Engineer    Joining  Date:  January  2021    Department:  Engineering     Employee  Name:  Brian  Smith    Position:  HR  Specialist    Joining  Date:  March  2020    Department:  Human  Resources     Employee  Name:  Catherine  Lee    Position:  Financial  Analyst    Joining  Date:  May  2019    Department:  Finance     Employee  Name:  David  Wright    Position:  Product  Manager    Joining  Date:  June  2022    Department:  Product     Employee  Name:  Emily  Davis    Position:  UX  Designer    Joining  Date:  February  2023    Department:  Design     Employee  Name:  Faisal  Khan    Position:  Data  Scientist    Joining  Date:  August  2021    Department:  AI  &  Analytics     Employee  Name:  Grace  Thomas    Position:  Legal  Advisor    Joining  Date:  October  2020    D



[1m[95m# Agent:[00m [1m[92mPDF Information Extractor[00m
[95m## Final Answer:[00m [92m
{
  "employees": [
    {
      "name": "Alice Johnson",
      "position": "Software Engineer",
      "joining_date": "January 2021",
      "department": "Engineering"
    },
    {
      "name": "Brian Smith",
      "position": "HR Specialist",
      "joining_date": "March 2020",
      "department": "Human Resources"
    },
    {
      "name": "Catherine Lee",
      "position": "Financial Analyst",
      "joining_date": "May 2019",
      "department": "Finance"
    },
    {
      "name": "David Wright",
      "position": "Product Manager",
      "joining_date": "June 2022",
      "department": "Product"
    },
    {
      "name": "Emily Davis",
      "position": "UX Designer",
      "joining_date": "February 2023",
      "department": "Design"
    },
    {
      "name": "Faisal Khan",
      "position": "Data Scientist",
      "joining_date": "August 2021",
      "department": "AI & Analytic

In [11]:
# Show the crew's final result; print() renders json_result via str().
print("\n JSON Output:\n", json_result)


 JSON Output:
 {
  "employees": [
    {
      "name": "Alice Johnson",
      "position": "Software Engineer",
      "joining_date": "January 2021",
      "department": "Engineering"
    },
    {
      "name": "Brian Smith",
      "position": "HR Specialist",
      "joining_date": "March 2020",
      "department": "Human Resources"
    },
    {
      "name": "Catherine Lee",
      "position": "Financial Analyst",
      "joining_date": "May 2019",
      "department": "Finance"
    },
    {
      "name": "David Wright",
      "position": "Product Manager",
      "joining_date": "June 2022",
      "department": "Product"
    },
    {
      "name": "Emily Davis",
      "position": "UX Designer",
      "joining_date": "February 2023",
      "department": "Design"
    },
    {
      "name": "Faisal Khan",
      "position": "Data Scientist",
      "joining_date": "August 2021",
      "department": "AI & Analytics"
    },
    {
      "name": "Grace Thomas",
      "position": "Legal Advisor",
 