# Setup, Constants, and Imports

In [1]:
import os
import sys
import logging

## Notebook Configs

In [7]:
# Execution environment switch: True when running on Google Colab (the setup
# cell then clones the repository and installs requirements.txt); set to False
# when running from a local checkout.
IS_COLAB: bool = True
# When True, the final cell writes the extracted DataFrames to CSV files.
OUTPUT_PROCESSED_FILES: bool = True

## Constants

In [8]:
# Git source of the project code; these three values are only used by the
# Colab branch of the setup cell.
REPO_URL: str = "https://github.com/EErlando/Quarterly-Bytes.git"
REPO_NAME: str = "src"  # directory the repository is cloned into
REPO_BRANCH: str = "pdf_extraction_and_restructuring"  # change this if needed

# Directory handed to the transcript extraction helper.
ALL_TRANSCRIPTS_PATH: str = "data/raw/Goldman Sachs/Transcripts"
# Name of the directory that contains this notebook; update it to match your
# layout (the local, non-Colab setup steps up two levels from it).
NOTEBOOK_DIR: str = "1_data_extraction_and_processing"

## Clone and Pull Latest from Repository - Colab Specific

In [9]:
if IS_COLAB:
    !git config pull.rebase false
    if os.path.exists(REPO_NAME):
        print(f"Directory '{REPO_NAME}' already exists. Pulling latest changes...")
        %cd {REPO_NAME}
        !git pull origin {REPO_BRANCH} --quiet
        %cd ..
    else:
        print(f"Cloning repository into '{REPO_NAME}'...")
        !git clone --quiet --branch {REPO_BRANCH} {REPO_URL} {REPO_NAME}
        print("Clone complete.")

    sys.path.append('/content/src/')
    %cd /content/src/
    !pip install -r requirements.txt
else:
    if os.path.basename(os.getcwd()) == NOTEBOOK_DIR:
        os.chdir('../../')

logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

fatal: not in a git directory
Cloning repository into 'src'...
Clone complete.
/content/src
Collecting PyPDF2==3.0.1 (from -r requirements.txt (line 1))
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


## Local Imports

In [10]:
from src.utils.pdf_utils import extract_transcripts_pdf_df_from_dir
from src.constants import BankType

In [12]:
# Sanity check that the local package imported correctly: display the string
# value carried by the GOLDMAN_SACHS enum member.
BankType.GOLDMAN_SACHS.value

'Goldman Sachs'

# Get Transcript Files

In [11]:
# Run the extraction helper over the transcript directory for Goldman Sachs.
# It yields two DataFrames: the Q&A exchanges and the prepared discussion.
qna_df, discussion_df = extract_transcripts_pdf_df_from_dir(
    ALL_TRANSCRIPTS_PATH,
    BankType.GOLDMAN_SACHS,
)



In [None]:
# Preview the first rows of the Q&A DataFrame to eyeball the extracted columns.
qna_df.head()

Unnamed: 0,question_order,question_answer_group_id,speaker,role,company,content_type,content,quarter,year
0,0,0,Glenn Schorr,,Evercore,question,"so, trading question, i mean, markets busines...",3,2024
1,1,0,David Solomon,"Chairman, Chief Executive Ofﬁcer",Goldman Sachs,answer,"i appreciate the question, glenn, and i mean,...",3,2024
2,2,0,Glenn Schorr,,Evercore,question,i appreciate that. this one will be a short f...,3,2024
3,3,0,Denis Coleman,Chief Financial Ofﬁcer,Goldman Sachs,answer,"glenn, its denis. i guess what i would sugges...",3,2024
4,0,1,Ebrahim Poonawala,,Bank of America,question,i just had a follow-up ﬁrst on trading and ma...,3,2024


In [None]:
# Preview the first rows of the discussion DataFrame to eyeball the extracted columns.
discussion_df.head()

Unnamed: 0,speaker,role,company,content,quarter,year
0,David Solomon,"Chairman, Chief Executive Ofﬁcer",Goldman Sachs,"Thank you, operator , good morning, everyone....",3,2024
1,Denis Coleman,Chief Financial Ofﬁcer,Goldman Sachs,"Thank you, David. Good morning. Let's start w...",3,2024
2,David Solomon,"CEO, Chairman",Goldman Sachs,"Thank you, operator , and good morning, every...",4,2024
3,Denis Coleman,CFO,Goldman Sachs,"Thank you, David, and good morning. Let's sta...",4,2024
4,Carey Halio,"Chief Strategy Ofﬁcer, Head of Investor Relations",Goldman Sachs,"Thank you. Good morning. This is Carey Halio,...",2,2023


In [None]:
# Persist both extracted DataFrames as CSV (without the index column) when
# OUTPUT_PROCESSED_FILES is enabled. Paths are relative to the current working
# directory established by the setup cell.
if OUTPUT_PROCESSED_FILES:
    processed_dir = 'data/processed/Goldman Sachs'
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target exists; exist_ok keeps re-runs harmless.
    os.makedirs(processed_dir, exist_ok=True)
    qna_df.to_csv(f'{processed_dir}/qna_df.csv', index=False)
    discussion_df.to_csv(f'{processed_dir}/discussion_df.csv', index=False)
    print("Processed files saved successfully.")

Processed files saved successfully.
