# Setup, Constants, and Imports

In [10]:
import os
import sys
import logging

## Notebook Configs

In [11]:
# Runtime toggles for this notebook.
IS_COLAB: bool = False  # flip to True when executing inside Google Colab
OUTPUT_PROCESSED_FILES: bool = True  # when True, the export cell writes the processed CSV files

## Constants

In [12]:
# Remote repository and relative on-disk locations used by the cells below.
REPO_URL: str = "https://github.com/EErlando/Quarterly-Bytes.git"
REPO_NAME: str = "src"  # directory name the repository is cloned into (Colab only)
ALL_TRANSCRIPTS_PATH: str = "data/raw/Goldman Sachs/Transcripts"  # relative to the working directory
NOTEBOOK_DIR: str = "1_data_extraction_and_processing"  # change to match your notebook directory

## Clone and Pull Latest from Repository - Colab Specific

In [13]:

if IS_COLAB:
    if os.path.exists(REPO_NAME):
        print(f"Directory '{REPO_NAME}' already exists. Pulling latest changes...")
        %cd {REPO_NAME}
        !git pull --quiet
        %cd ..
    else:
        print(f"Cloning repository into '{REPO_NAME}'...")
        !git clone --quiet {REPO_URL} {REPO_NAME}
        print("Clone complete.")

    sys.path.append('/content/src/')
else:
    if os.path.basename(os.getcwd()) == NOTEBOOK_DIR:
        os.chdir('../../')

logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

## Local Imports

In [14]:
from src.utils.pdf_utils import extract_transcripts_pdf_df_from_dir, BankType

# Get Transcript Files

In [15]:
# Read the Goldman Sachs transcripts found under ALL_TRANSCRIPTS_PATH and
# unpack the two returned tables: Q&A rows and discussion rows.
(qna_df, discussion_df) = extract_transcripts_pdf_df_from_dir(
    ALL_TRANSCRIPTS_PATH,
    BankType.GOLDMAN_SACHS,
)

In [16]:
# Preview the first five Q&A rows as a quick sanity check of the extraction.
qna_df.head(5)

Unnamed: 0,question_order,question_answer_group_id,speaker,role,content_type,content,quarter,year
0,0,0,Glenn Schorr,question,question,"so, trading question, i mean, markets busines...",3,2024
1,1,0,David Solomon,answer,answer,"i appreciate the question, glenn, and i mean,...",3,2024
2,2,0,Glenn Schorr,question,question,i appreciate that. this one will be a short f...,3,2024
3,3,0,Denis Coleman,answer,answer,"glenn, its denis. i guess what i would sugges...",3,2024
4,0,1,Ebrahim Poonawala,question,question,i just had a follow-up ﬁrst on trading and ma...,3,2024


In [17]:
# Preview the first five discussion rows as a quick sanity check of the extraction.
discussion_df.head(5)

Unnamed: 0,speaker,role,content,quarter,year
0,David Solomon,"[Chairman, Chief Executive Ofﬁcer]","Thank you, operator , good morning, everyone....",3,2024
1,Denis Coleman,[Chief Financial Ofﬁcer],"Thank you, David. Good morning. Let's start w...",3,2024
2,David Solomon,"[Chairman, CEO]","Thank you, operator , and good morning, every...",4,2024
3,Denis Coleman,[CFO],"Thank you, David, and good morning. Let's sta...",4,2024
4,Carey Halio,"[Head of Investor Relations, Chief Strategy Of...","Thank you. Good morning. This is Carey Halio,...",2,2023


In [18]:
if OUTPUT_PROCESSED_FILES:
    # Write both tables as CSV, omitting the row index.
    processed_dir = 'data/processed/Goldman Sachs'
    # DataFrame.to_csv does not create missing parent folders, so make sure the
    # target directory exists first (no-op when it is already there).
    os.makedirs(processed_dir, exist_ok=True)
    qna_df.to_csv(f'{processed_dir}/qna_df.csv', index=False)
    discussion_df.to_csv(f'{processed_dir}/discussion_df.csv', index=False)
    print("Processed files saved successfully.")

Processed files saved successfully.
