# Setup, Constants, and Imports

In [1]:
import os
import sys
import logging

## Notebook Configs

In [7]:
# Environment switch read by the setup cell: True selects the Colab branch
# (clone/pull the repository and work from /content/src/), False selects the
# local branch.
IS_COLAB = False # Update this if running in Google Colab
# When True, the final cell writes qna_df and discussion_df to CSV files
# under data/processed/JP Morgan/.
OUTPUT_PROCESSED_FILES = True

## Constants

In [None]:
# Git remote that the Colab setup cell clones.
REPO_URL = "https://github.com/EErlando/Quarterly-Bytes.git"
# Directory name the repository is cloned into; the setup cell also checks
# whether this directory already exists to decide between clone and pull.
REPO_NAME = "src" 
# Branch that is cloned / pulled in Colab.
REPO_BRANCH = "KEW_jpmorgan_eda" # Change this if needed
# Relative directory handed to extract_transcripts_pdf_df_from_dir; it is
# resolved against the working directory chosen by the setup cell.
ALL_TRANSCRIPTS_PATH = "data/raw/JP Morgan/Transcripts"
# Folder name compared against the basename of the current working directory;
# on a match the local setup branch moves two levels up.
NOTEBOOK_DIR = "1_data_extraction_and_processing" # Update this to your notebook directory

## Clone and Pull Latest from Repository - Colab Specific

In [11]:
if IS_COLAB:
    !git config pull.rebase false
    if os.path.exists(REPO_NAME):
        print(f"Directory '{REPO_NAME}' already exists. Pulling latest changes...")
        %cd {REPO_NAME}
        !git pull origin {REPO_BRANCH} --quiet
        %cd ..
    else:
        print(f"Cloning repository into '{REPO_NAME}'...")
        !git clone --quiet --branch {REPO_BRANCH} {REPO_URL} {REPO_NAME}
        print("Clone complete.")

    sys.path.append('/content/src/')
    %cd /content/src/
    %pip install -r requirements.txt
else:
    %pip install -r requirements.txt
    if os.path.basename(os.getcwd()) == NOTEBOOK_DIR:
        os.chdir('../../')

logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

Collecting PyPDF2==3.0.1 (from -r requirements.txt (line 1))
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pandas>=2.0.0 (from -r requirements.txt (line 2))
  Downloading pandas-2.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn>=1.0.0 (from -r requirements.txt (line 3))
  Downloading scikit_learn-1.7.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting nltk>=3.0.0 (from -r requirements.txt (line 4))
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting spacy>=3.0.0 (from -r requirements.txt (line 5))
  Downloading spacy-3.8.7-cp313-cp313-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting matplotlib (from -r requirements.txt (line 6))
  Downloading matplotlib-3.10.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting seaborn (from -r requirements.txt (line 7))
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting PyYAML (from -r requirements.txt (line 8))
  Downloading 

## Local Imports

In [24]:
from src.utils.pdf_utils import extract_transcripts_pdf_df_from_dir, BankType

# Get Transcript Files

In [25]:
# Build the two working tables from the files under ALL_TRANSCRIPTS_PATH.
# The helper returns a pair: the first element is used as the Q&A table and the
# second as the discussion table (both expose .head() and .to_csv() below).
# NOTE(review): presumably BankType.JPMORGAN selects a JPMorgan-specific PDF
# parsing layout - confirm in src/utils/pdf_utils.
qna_df, discussion_df = extract_transcripts_pdf_df_from_dir(ALL_TRANSCRIPTS_PATH, BankType.JPMORGAN)

In [17]:
# Preview the first 10 Q&A rows to sanity-check the extracted columns.
qna_df.head(10)

Unnamed: 0,question_order,question_answer_group_id,speaker,role,company,content,year,quarter
0,0,0,John E. McDonald,Analyst,Autonomous Research,"Thank you. Morning, Jeremy. Was wondering abou...",2022,1
1,1,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Good morning, John. Good question. Yeah, look,...",2022,1
2,2,0,John E. McDonald,Analyst,Autonomous Research,"Okay. And as my follow up, could you give us s...",2022,1
3,3,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,Yeah. I guess I would direct you to my comment...,2022,1
4,4,0,John E. McDonald,Analyst,Autonomous Research,Okay. Thanks.,2022,1
5,5,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thanks, John .",2022,1
6,6,0,Ken Usdin,Analyst,Jefferies LLC,"Hi. Thanks. Good morning. Jeremy, just wanted ...",2022,1
7,7,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,Yeah. Thanks. So let me just give some high le...,2022,1
8,8,0,Ken Usdin,Analyst,Jefferies LLC,"And just a follow up there too, is there anyth...",2022,1
9,9,0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Yeah. I think – I guess in general, we haven't...",2022,1


In [15]:
# Number of Q&A rows per year.
qna_df.year.value_counts()

year
2022    375
2023    343
2024    280
2025    108
Name: count, dtype: int64

In [20]:
# Number of Q&A rows per speaker role.
# NOTE(review): the output here labels the role "Chairman & Chief Executive Officer",
# while the discussion_df preview shows "Chairman, Chief Executive Officer" -
# the two tables may need the role labels normalised before being combined.
qna_df.role.value_counts()

role
Analyst                               504
Chief Financial Officer               366
Chairman & Chief Executive Officer    236
Name: count, dtype: int64

In [None]:
# Preview the first rows of the discussion table.
discussion_df.head()

Unnamed: 0,speaker,role,company,content,year,quarter
0,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thanks, operator. Good morning, everyone. The ...",2022,1
1,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thanks, operator. Good morning, everyone. The ...",2022,2
2,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Thank you very much. Good morning, everyone. A...",2022,3
3,Jamie Dimon,"Chairman, Chief Executive Officer",JPMorgan Chase & Co.,"Yeah, Jeremy, thank you very much. Hello, ever...",2022,3
4,Jeremy Barnum,Chief Financial Officer,JPMorgan Chase & Co.,"Yeah. Thanks, Jamie. Let's go ahead and open u...",2022,3


In [None]:
# Persist both tables as CSV (without the index column) when exporting is enabled.
if OUTPUT_PROCESSED_FILES:
    processed_dir = 'data/processed/JP Morgan'
    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the destination exists (a fresh checkout may not contain it).
    os.makedirs(processed_dir, exist_ok=True)
    qna_df.to_csv(f'{processed_dir}/qna_df.csv', index=False)
    discussion_df.to_csv(f'{processed_dir}/discussion_df.csv', index=False)
    print("Processed files saved successfully.")

Processed files saved successfully.
