# Setup, Constants, and Imports

In [1]:
import os
import sys
import logging

## Notebook Configs

In [2]:
# Set to True when running in Google Colab; this gates the Colab-only
# clone/pull/install cell further down.
IS_COLAB = False
# When True, the extracted DataFrames are written to CSV at the end of the notebook.
OUTPUT_PROCESSED_FILES = True

## Constants

In [3]:
# Git remote and the branch to check out (change the branch if needed).
REPO_URL = "https://github.com/EErlando/Quarterly-Bytes.git"
REPO_BRANCH = "pdf_extraction_and_restructuring"
# Directory name the repository is cloned into.
REPO_NAME = "src"
# Directory handed to the transcript PDF extractor.
ALL_TRANSCRIPTS_PATH = "data/raw/JP Morgan/Transcripts"
# Name of the directory this notebook lives in; update it to match your layout.
NOTEBOOK_DIR = "1_data_extraction_and_processing"

## Clone and Pull Latest from Repository - Colab Specific

In [4]:
if IS_COLAB:
    !git config pull.rebase false
    if os.path.exists(REPO_NAME):
        print(f"Directory '{REPO_NAME}' already exists. Pulling latest changes...")
        %cd {REPO_NAME}
        !git pull origin {REPO_BRANCH} --quiet
        %cd ..
    else:
        print(f"Cloning repository into '{REPO_NAME}'...")
        !git clone --quiet --branch {REPO_BRANCH} {REPO_URL} {REPO_NAME}
        print("Clone complete.")

    sys.path.append('/content/src/')
    %cd /content/src/
    !pip install -r requirements.txt
else:
    if os.path.basename(os.getcwd()) == NOTEBOOK_DIR:
        os.chdir('../../')

logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

## Local Imports

In [5]:
from src.utils.pdf_utils import extract_transcripts_pdf_df_from_dir, BankType

# Get Transcript Files

In [6]:
# Run the JPMorgan transcript extractor over ALL_TRANSCRIPTS_PATH; the result
# unpacks into a Q&A frame and a discussion frame.
qna_df, discussion_df = extract_transcripts_pdf_df_from_dir(
    ALL_TRANSCRIPTS_PATH,
    BankType.JPMORGAN,
)

In [14]:
# Preview the first five Q&A rows.
qna_df.head(5)

Unnamed: 0,question_order,question_answer_group_id,speaker,role,company,content,year,quarter
0,0,0,John E. McDonald,Analyst,Autonomous Research,"Thank you. Morning, Jeremy. Was wondering abou...",2022,1
1,1,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Good morning, John. Good question. Yeah, look,...",2022,1
2,2,0,John E. McDonald,Analyst,Autonomous Research,"Okay. And as my follow up, could you give us s...",2022,1
3,3,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,Yeah. I guess I would direct you to my comment...,2022,1
4,4,0,John E. McDonald,Analyst,Autonomous Research,Okay. Thanks.,2022,1


In [8]:
# Number of Q&A rows per year.
qna_df["year"].value_counts()

year
2022    375
2023    343
2024    280
2025    108
Name: count, dtype: int64

In [9]:
# Preview the first five discussion rows.
discussion_df.head(5)

Unnamed: 0,speaker,role,company,content,year,quarter
0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thanks, operator. Good morning, everyone. The ...",2022,1
1,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thanks, operator. Good morning, everyone. The ...",2022,2
2,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thank you very much. Good morning, everyone. A...",2022,3
3,Jamie Dimon,"Chairman, Chief Executive Officer",JPMorgan Chase & Co.,"Yeah, Jeremy, thank you very much. Hello, ever...",2022,3
4,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Yeah. Thanks, Jamie. Let's go ahead and open u...",2022,3


In [10]:
if OUTPUT_PROCESSED_FILES:
    # Write both extracted frames as CSV (without the index column) under the
    # processed-data folder.
    processed_dir = 'data/processed/JP Morgan'
    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing.
    os.makedirs(processed_dir, exist_ok=True)
    qna_df.to_csv(f'{processed_dir}/qna_df.csv', index=False)
    discussion_df.to_csv(f'{processed_dir}/discussion_df.csv', index=False)
    print("Processed files saved successfully.")

Processed files saved successfully.
