# JP Morgan Data Extractor

## Install Libraries and Imports

In [1]:
!pip install PyPDF2==3.0.1

Collecting PyPDF2==3.0.1
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
import pandas as pd
import re
import PyPDF2

## Helper Functions

Helper function to extract the text from a PDF file

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Any exception raised while opening or reading the file is caught and
    printed rather than propagated; whatever text had been gathered up to
    that point (possibly the empty string) is returned.
    """
    collected = ""
    try:
        with open(pdf_path, 'rb') as pdf_file:
            # Walk the pages in order, appending each page's text as we go.
            for page in PyPDF2.PdfReader(pdf_file).pages:
                collected += page.extract_text()
    except Exception as e:
        print(f"Error reading PDF: {e}")
    return collected

Helper function to parse the Q&A section into a pandas dataframe with the columns:

- question_answer_group_id: The questions and answers are grouped together for a specific analyst, so the group id is the identifier of the question and answer group.
- speaker: The name of the speaker
- role: The role of the speaker, e.g. Analyst, Chief Executive
- content: The content spoken

In [4]:
def parse_q_and_a_section(full_text):
    """
    Parses the Q&A section of a transcript into one row per speaking turn.

    The text is split into blocks on runs of ten or more dots. Within each
    block, blank lines and digit-only lines (page numbers) are dropped; the
    first remaining line is the speaker name, the second is the role (with a
    trailing standalone 'Q' / 'A' marker removed) and the rest is the content.
    A block whose first line starts with 'Operator :' (space before the colon
    optional) is not emitted as a row; it increments the group id instead.

    Args:
        full_text (str): The Q&A section text. A leading
            'QUESTION AND ANSWER SECTION' heading line is removed if present.

    Returns:
        pd.DataFrame: A DataFrame with 'question_answer_group_id', 'speaker',
            'role', and 'content' columns. The columns are present even when
            no speaking turns are found.
    """
    columns = ["question_answer_group_id", "speaker", "role", "content"]

    # Drop the section heading line so it is not mistaken for a speaker block
    processed_text = re.sub(r"^\s*QUESTION AND ANSWER SECTION\s*\n+", "", full_text, flags=re.MULTILINE).strip()

    # Split on the dotted separator only. Surrounding whitespace is stripped
    # per line below, so the pattern must not consume any character after the
    # dots (doing so could eat the first letter of a speaker name).
    blocks = re.split(r"\.{10,}", processed_text)

    entries = []
    # Initialise the question group index
    question_group_index = 0

    for block in blocks:
        lines = [line.strip() for line in block.split('\n')]
        # Drop empty lines and lines holding only a page number
        lines = [line for line in lines if line and not line.isdigit()]

        if not lines:
            continue  # Skip empty blocks

        # An operator block introduces the next analyst: start a new group
        if re.match(r"Operator\s*:", lines[0]):
            question_group_index += 1
            continue

        # Standard speaker: Name on line 1, Role on line 2, Text after
        speaker_name = lines[0]
        role_name = "N/A"
        if len(lines) >= 2:
            # Remove the optional Q/A marker only when it stands alone, so a
            # role that merely ends in 'A' or 'Q' (e.g. '... USA') is kept intact
            role_name = re.sub(r'(?:^|\s+)[QA]$', '', lines[1]).strip()
        text_content = "\n".join(lines[2:]).strip()

        entries.append({
            "question_answer_group_id": question_group_index,
            "speaker": speaker_name,
            "role": role_name,
            "content": text_content
        })

    # Passing columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(entries, columns=columns)

In [20]:
def parse_transcript_to_dataframes(raw_text):
    """Parses a raw transcript text into presentation and Q&A DataFrames.

    The text is split on the 'MANAGEMENT DISCUSSION SECTION' and
    'QUESTION AND ANSWER SECTION' headers. The Q&A content is parsed with
    parse_q_and_a_section and tagged with the quarter and year read from the
    start of the transcript, which is expected to begin with a token such as
    '4Q24' (one quarter digit, 'Q', two year digits).

    Args:
        raw_text (str): The full transcript text.

    Returns:
        tuple: (df_presentation, df_q_and_a).
            df_presentation is always an empty DataFrame here, because the
            management discussion section is not parsed by this function.
            df_q_and_a holds the Q&A turns plus 'quarter' and 'year' columns
            (strings, or None when the transcript does not start with a
            quarter/year token); it is an empty DataFrame when there is no
            Q&A section. If the Q&A header occurs more than once, only the
            content after the last occurrence is kept.
    """
    section_headers = ("MANAGEMENT DISCUSSION SECTION", "QUESTION AND ANSWER SECTION")

    # Initial Cleaning
    # Remove backslashes
    text = re.sub(r'\\', '', raw_text)
    # Condense multiple newlines
    text = re.sub(r'\n{2,}', '\n', text).strip()

    # Section Segmentation (the capturing group keeps the headers in the result)
    sections = re.split(r'(MANAGEMENT DISCUSSION SECTION|QUESTION AND ANSWER SECTION)', text)
    current_section_name = "INTRO"  # Default for anything before first section header

    # Get the quarter and year from the leading token, e.g. '4Q24' -> ('4', '24').
    # Empty or unprefixed text (for instance after a failed PDF read) yields
    # None values instead of raising an IndexError.
    period = re.match(r'(\d)Q(\d{2})', sections[0])
    quarter, year = period.groups() if period else (None, None)

    df_q_and_a = pd.DataFrame()
    df_presentation = pd.DataFrame()

    # Iterate through the sections and their content
    for part in sections:
        # skip if the part is empty
        if not part.strip():
            continue

        # Store the section name then move onto the content of the section
        if part in section_headers:
            current_section_name = part
            continue

        # If we're looking at the Q&A section, then parse to the Q&A dataframe
        if current_section_name == 'QUESTION AND ANSWER SECTION':
            df_q_and_a = parse_q_and_a_section(part)
            df_q_and_a['quarter'] = quarter
            df_q_and_a['year'] = year

    return df_presentation, df_q_and_a

## Main Extraction Code

In [21]:
# Location of the transcript PDF to process; change this to point at another file.
PDF_PATH = '/content/q4_2024.pdf'

# Read the PDF text, then split it into the presentation and Q&A DataFrames.
pdf_text = extract_text_from_pdf(PDF_PATH)
df_presentation, df_q_and_a = parse_transcript_to_dataframes(pdf_text)

In [22]:
# Print a heading, then show the first 20 rows of the parsed Q&A DataFrame.
print('Question and Answer Dataframe')
display(df_q_and_a.head(20))

Question and Answer Dataframe


Unnamed: 0,question_answer_group_id,speaker,role,content,quarter,year
0,1,John McDonald,"Analyst, Truist Securities, Inc.","Hi. Good morning. Jeremy, I wanted to ask abou...",4,24
1,1,Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Yeah. Good question, John, and welcome back, b...",4,24
2,1,John McDonald,"Analyst, Truist Securities, Inc.",Thank you.,4,24
3,1,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase","Welcome back, John. Read your piece the other ...",4,24
4,1,John McDonald,"Analyst, Truist Securities, Inc",Thanks.,4,24
5,1,Jeremy Barnum,"Chief Financial Officer, JPMorganChase","So, yeah, you've noted all the points that we ...",4,24
6,1,John McDonald,"Analyst, Truist Securities, Inc","Okay. Thanks, Jeremy. And then just as a follo...",4,24
7,1,Jeremy Barnum,"Chief Financial Officer, JPMorganChase",Sure. The truth is – and I guess this is a goo...,4,24
8,1,John McDonald,"Analyst, Truist Securities, Inc.",Very helpful. Thank you.,4,24
9,1,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase",Efficiently said.,4,24


# TEST AREA

In [23]:
import pandas as pd
import re

text = """
QUESTION AND ANSWER SECTION

Operator : Thank you. Please stand by. Our first question comes from John McDonald with Truist Securities. You may proceed.
........................................................................................................................................................................................................................................................................................
John McDonald
Analyst, Truist Securities, Inc. Q
Hi. Good morning. Jeremy, I wanted to ask about capital, and I know you get this question a lot about the kind of high-class dilemma of your
growing capital base and your perspective of that as earnings in store. So, I guess, what's the framework for thinking about the opportunity
cost of sitting on the growing base of capital and how high you might let that go versus your patience in waiting for more attractive deployment
opportunities?
........................................................................................................................................................................................................................................................................................
Jeremy Barnum
Chief Financial Officer, JPMorganChase  A
Yeah. Good question, John, and welcome back, by the way.
........................................................................................................................................................................................................................................................................................
John McDonald
Analyst, Truist Securities, Inc.  Q
Thank you.
........................................................................................................................................................................................................................................................................................
Jamie Dimon
Chairman & Chief Executive Officer, JPMorganChase A
Welcome back, John. Read your piece the other day. It took me quite a while, but it was good work.
........................................................................................................................................................................................................................................................................................

  4
John McDonald
Analyst, Truist Securities, Inc  Q
Thanks.
"""

def parse_q_and_a(full_text):
    """
    Parses a Q&A transcript into one row per speaking turn, assuming the
    speaker name is line 1 of a block, the role is line 2, and the spoken
    text is on the subsequent lines.

    Blocks are delimited by runs of ten or more dots. Blank lines and
    digit-only lines (page numbers) are ignored. A block whose first line
    starts with 'Operator :' (space before the colon optional) produces no
    row; it increments the question/answer group id instead.

    Args:
        full_text (str): The complete Q&A transcript text. A leading
            'QUESTION AND ANSWER SECTION' heading line is removed if present.

    Returns:
        pd.DataFrame: A DataFrame with 'question_answer_group_id', 'speaker',
            'role', and 'content' columns. The columns are present even when
            no speaking turns are found.
    """
    columns = ["question_answer_group_id", "speaker", "role", "content"]

    # Drop the section heading line so it is not mistaken for a speaker block
    processed_text = re.sub(r"^\s*QUESTION AND ANSWER SECTION\s*\n+", "", full_text, flags=re.MULTILINE).strip()

    # Split on the dots alone: consuming an extra character after the
    # separator could remove the first letter of the following speaker name.
    blocks = re.split(r"\.{10,}", processed_text)

    entries = []
    question_number = 0

    for block in blocks:
        lines = [line.strip() for line in block.split('\n')]
        # Drop empty lines and lines holding only a page number
        lines = [line for line in lines if line and not line.isdigit()]

        if not lines:
            continue  # Skip empty blocks

        # An operator block introduces the next analyst: start a new group
        if re.match(r"Operator\s*:", lines[0]):
            question_number += 1
            continue

        # Standard speaker: Name on line 1, Role on line 2, Text after
        speaker_name = lines[0]
        role_name = "N/A"
        if len(lines) >= 2:
            # Strip the optional Q/A marker only when it is a standalone
            # token, so roles ending in 'A' or 'Q' keep their last letter
            role_name = re.sub(r'(?:^|\s+)[QA]$', '', lines[1]).strip()
        text_content = "\n".join(lines[2:]).strip()

        entries.append({
            "question_answer_group_id": question_number,
            "speaker": speaker_name,
            "role": role_name,
            "content": text_content
        })

    # Passing columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(entries, columns=columns)

# Run the parser on the sample transcript above; as the last expression in
# the cell, the resulting DataFrame is rendered as the cell output.
parse_q_and_a(text)

Unnamed: 0,question_answer_group_id,speaker,role,content
0,1,John McDonald,"Analyst, Truist Securities, Inc.","Hi. Good morning. Jeremy, I wanted to ask abou..."
1,1,Jeremy Barnum,"Chief Financial Officer, JPMorganChase","Yeah. Good question, John, and welcome back, b..."
2,1,John McDonald,"Analyst, Truist Securities, Inc.",Thank you.
3,1,Jamie Dimon,"Chairman & Chief Executive Officer, JPMorganChase","Welcome back, John. Read your piece the other ..."
4,1,John McDonald,"Analyst, Truist Securities, Inc",Thanks.
