# Setup, Constants, and Imports

In [42]:
import os
import sys
import logging

## Notebook Configs

In [43]:
IS_COLAB = 'google.colab' in sys.modules
OUTPUT_PROCESSED_FILES = False # TODO: Use this if you want to output save files (optional - see below)

if IS_COLAB:
    from google.colab import userdata
    GITHUB_USERNAME = userdata.get('github_user')
    GITHUB_TOKEN = userdata.get('github_token')
    GITHUB_EMAIL = userdata.get('github_email')

## Constants

In [44]:
REPO_URL = "https://github.com/EErlando/Quarterly-Bytes.git"
REPO_NAME = "src"
REPO_BRANCH = "LP_topic_modelling_extended" # TODO: UPDATE THIS TO YOU BRANCH - DEFAULT TO MAIN
NOTEBOOK_DIR = "3_modelling" # TODO: UPDATE THIS TO YOUR NOTEBOOK DIRECTORY (e.g. 1_data_extraction_and_processing)

## Clone and Pull Latest from Repository - Colab Specific

In [45]:
if IS_COLAB:
    !git config pull.rebase false
    if os.path.exists(REPO_NAME):
        print(f"Directory '{REPO_NAME}' already exists. Pulling latest changes...")
        %cd {REPO_NAME}
        !git pull origin {REPO_BRANCH} --quiet
        %cd ..
    else:
        print(f"Cloning repository into '{REPO_NAME}'...")
        !git clone --quiet --branch {REPO_BRANCH} {REPO_URL} {REPO_NAME}
        print("Clone complete.")

    sys.path.append('/content/src/')
    %cd /content/src/
    !pip install -r requirements.txt
else:
    if os.path.basename(os.getcwd()) == NOTEBOOK_DIR:
        os.chdir('../../') # TODO: UPDATE THIS TO ROOT OF REPO

    !pip install -r requirements.txt

logging.basicConfig(level=logging.ERROR, format='%(levelname)s: %(message)s')

Directory 'src' already exists. Pulling latest changes...
/content/src/src
/content/src
/content/src


## Post Install Imports

In [46]:
import pandas as pd
import numpy as np
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import os
!pip install bertopic

from bertopic import BERTopic




In [47]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Local Imports

In [48]:
from src.utils.common_helpers import read_yaml_file, read_list_from_text_file

## Helpers

In [49]:
def group_df(df, group_by_columns, agg_column='content'):
    """
    Groups the DataFrame by specified columns and aggregates the content column.

    Parameters:
    - df: DataFrame to group
    - group_by_columns: List of columns to group by
    - agg_column: Column to aggregate (default is 'content')

    Returns:
    - Grouped DataFrame with aggregated content
    """
    return df.groupby(group_by_columns, as_index=False).agg({agg_column: ' '.join})


## Load Data

In [50]:
gs_discussion_df = pd.read_csv('data/processed/Goldman Sachs/discussion_df.csv')
gs_qna_df = pd.read_csv('data/processed/Goldman Sachs/qna_df.csv')
jp_discussion_df = pd.read_csv('data/processed/JP Morgan/discussion_df.csv')
jp_qna_df = pd.read_csv('data/processed/JP Morgan/qna_df.csv')


# Goldman Sachs
grouped_gs_discussion_df = group_df(gs_discussion_df, ['quarter', 'year'])
grouped_gs_qna_df = group_df(gs_qna_df, ['question_answer_group_id', 'quarter', 'year'])

# JP Morgan
grouped_jp_discussion_df = group_df(jp_discussion_df, ['quarter', 'year'])
grouped_jp_qna_df = group_df(jp_qna_df, ['question_answer_group_id', 'quarter', 'year'])


# Topic Modelling

In [51]:
gs_stopwords = set(read_list_from_text_file('src/data_processing/goldman_sachs_topic_modelling_stopwords.txt'))
abbreviations = read_yaml_file('src/abbreviations.yaml')

https://arxiv.org/pdf/2504.15683
Use FinTextSim

In [52]:


try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except OSError:
    print("SpaCy 'en_core_web_sm' model not found. Please run: python -m spacy download en_core_web_sm")
    exit()

gs_stopwords = nlp.Defaults.stop_words.union(gs_stopwords)

def preprocess_text(text: str, stop_words: set, abbreviations: dict) -> str:
    if not isinstance(text, str):
        return ""

    processed_text = text.lower()
    processed_text = re.sub(r'[-_]+', ' ', processed_text).strip()

    sorted_phrases = sorted(abbreviations.items(), key=lambda item: len(item[1]), reverse=True)

    for abbrev, phrase in sorted_phrases:
        processed_text = re.sub(r'\b' + re.escape(phrase.lower()) + r'\b', abbrev.lower(), processed_text)

    processed_text = re.sub(r'\b\d+\b', '', processed_text).strip()

    doc = nlp(processed_text)

    tokens = []
    for token in doc:
        if token.text not in stop_words or token.text in abbreviations.keys():
            tokens.append(token.lemma_) # Lemmatize the token (abbreviations won't change)

    return " ".join(tokens)


In [58]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
import os
from bertopic import BERTopic
from umap import UMAP
import hdbscan
from sentence_transformers import SentenceTransformer


# --- Mock Preprocessing Function (replace with your actual preprocess_text) ---
# def preprocess_text(text, stop_words=None, abbreviations=None):
#     """
#     A mock preprocessing function to clean and prepare text.
#     In a real application, this would include tokenization, stemming/lemmatization,
#     punctuation removal, number handling, etc.
#     """
#     text = str(text).lower()
#     # Simple tokenization and stop word removal for demonstration
#     words = text.split()
#     if stop_words:
#         words = [word for word in words if word not in stop_words]
#     # Simple abbreviation handling (example: 'ai' -> 'artificial intelligence')
#     if abbreviations:
#         for abbr, full in abbreviations.items():
#             words = [full if word == abbr else word for word in words]
#     return " ".join(words)


def preprocess_text(text: str, stop_words: set, abbreviations: dict) -> str:
    if not isinstance(text, str):
        return ""

    processed_text = text.lower()
    processed_text = re.sub(r'[-_]+', ' ', processed_text).strip()

    sorted_phrases = sorted(abbreviations.items(), key=lambda item: len(item[1]), reverse=True)

    for abbrev, phrase in sorted_phrases:
        processed_text = re.sub(r'\b' + re.escape(phrase.lower()) + r'\b', abbrev.lower(), processed_text)

    processed_text = re.sub(r'\b\d+\b', '', processed_text).strip()

    doc = nlp(processed_text)

    tokens = []
    for token in doc:
        if token.text not in stop_words or token.text in abbreviations.keys():
            tokens.append(token.lemma_) # Lemmatize the token (abbreviations won't change)

    return " ".join(tokens)

# --- Custom Transformer for Text Preprocessing ---
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    A custom scikit-learn transformer to apply text preprocessing.
    It wraps the 'preprocess_text' function.
    """
    def __init__(self, stop_words=None, abbreviations=None):
        self.stop_words = stop_words
        self.abbreviations = abbreviations

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print("Starting Phase 1: Preprocessing...")
        preprocessed_X = [preprocess_text(text, self.stop_words, self.abbreviations) for text in X]
        print("Preprocessing complete.")
        return pd.Series(preprocessed_X)


# --- Custom Estimator for BERTopic Modeling ---
class BERTopicWrapper(BaseEstimator, TransformerMixin):
    """
    A custom scikit-learn estimator that wraps BERTopic.
    """
    def __init__(self, embedding_model='all-MiniLM-L6-v2', umap_args=None, hdbscan_args=None,
                 vectorizer_args=None, nr_topics="auto", calculate_probabilities=True, **bertopic_kwargs):

        self.embedding_model_name = embedding_model
        self.umap_args = umap_args if umap_args is not None else {}
        self.hdbscan_args = hdbscan_args if hdbscan_args is not None else {}
        self.vectorizer_args = vectorizer_args if vectorizer_args is not None else {}
        self.nr_topics = nr_topics
        self.calculate_probabilities = calculate_probabilities
        self.bertopic_kwargs = bertopic_kwargs
        self.bertopic_model = None

    def fit(self, X, y=None):
        print("\nStarting Phase 3: Topic Modeling (BERTopic)...")

        # Initialize UMAP and HDBSCAN models
        umap_model = UMAP(**self.umap_args)
        hdbscan_model = hdbscan.HDBSCAN(
            min_cluster_size=10,  # Default, can be overridden by hdbscan_args
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True, # Required for transform to assign topics to new data
            **self.hdbscan_args
        )

        # Initialize SentenceTransformer
        embedding_model = SentenceTransformer(self.embedding_model_name)

        default_min_df_for_bertopic_vectorizer = 1 # Changed from 10 to 1 for higher permissiveness

        # Combine default vectorizer args with user-provided args
        combined_vectorizer_args = {
            'min_df': default_min_df_for_bertopic_vectorizer,
            'ngram_range': (1, 3), # Common default for BERTopic's internal vectorizer
            **self.vectorizer_args # User-provided vectorizer_args will override these defaults
        }

        vectorizer_model = TfidfVectorizer(**combined_vectorizer_args)


        self.bertopic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            nr_topics=self.nr_topics,
            calculate_probabilities=self.calculate_probabilities,
            **self.bertopic_kwargs
        )

        # X is expected to be a pandas Series of preprocessed text
        self.topics, self.probs = self.bertopic_model.fit_transform(X.tolist())
        print("BERTopic model fitting complete.")
        return self

    def transform(self, X):
        if self.bertopic_model is None:
            raise RuntimeError("BERTopic model not fitted. Call fit() first.")
        print("Transforming data with fitted BERTopic model...")
        topics, probs = self.bertopic_model.transform(X.tolist())
        print("Transformation complete.")
        return topics # Return topic assignments

    def get_model(self):
        return self.bertopic_model

# --- Utility function to display topics (adapted for both LDA and BERTopic) ---
def display_topics(model, vectorizer=None, no_top_words=10, file=None):
    """
    Prints or writes the top words for each topic.
    Args:
        model: The fitted topic model (LDA or BERTopic).
        vectorizer (TfidfVectorizer, optional): The fitted TF-IDF vectorizer (for LDA).
        no_top_words (int): The number of top words to display for each topic.
        file (file object, optional): If provided, topics will be written to this file.
        model_type (str): 'lda' or 'bertopic' to specify model type for appropriate display.
    """
    topic_info = model.get_topic_info()
    output_str = "\nBERTopic - Top Words per Topic:\n"
    if file:
        file.write(output_str)
    else:
        print(output_str)

    # Iterate through all topics, excluding the noise topic (-1)
    for topic_id in topic_info.Topic.unique():
        if topic_id == -1: # Skip noise topic
            continue
        # Get the top words for the current topic
        words = model.get_topic(topic_id)
        if words:
            top_words = ", ".join([word for word, _ in words[:no_top_words]])
            topic_name = topic_info[topic_info['Topic'] == topic_id]['Name'].iloc[0]
            output_str = f"Topic {topic_id} ({topic_name}): {top_words}\n"
            if file:
                file.write(output_str)
            else:
                print(output_str)
        else:
            output_str = f"Topic {topic_id}: No words found.\n"
            if file:
                file.write(output_str)
            else:
                print(output_str)


# --- Main Topic Modeling Pipeline Class ---
class TopicModelingPipeline:
    def __init__(self, model_type='lda', **kwargs):
        """
        Initializes the topic modeling pipeline.

        Args:
            model_type (str): The type of topic model to use ('lda' or 'bertopic').
            **kwargs: Arguments specific to the chosen model or pipeline steps.
                      For LDA: max_df, min_df, ngram_range (for TF-IDF), n_components, max_iter, etc.
                      For BERTopic: embedding_model, umap_args, hdbscan_args, vectorizer_args, nr_topics, etc.
        """
        self.model_type = model_type
        self.pipeline = self._build_pipeline(**kwargs)

    def _build_pipeline(self, **kwargs):
        """Builds the scikit-learn pipeline based on the specified model_type."""
        preprocessor_kwargs = {
            'stop_words': kwargs.pop('stop_words', []),
            'abbreviations': kwargs.pop('abbreviations', {})
        }

        pipeline_steps = [
            ('preprocessor', TextPreprocessor(**preprocessor_kwargs))
        ]

        bertopic_kwargs = {
            'embedding_model': kwargs.pop('embedding_model', 'all-MiniLM-L6-v2'),
            'umap_args': kwargs.pop('umap_args', {}),
            'hdbscan_args': kwargs.pop('hdbscan_args', {}),
            'vectorizer_args': kwargs.pop('vectorizer_args', {}), # Pass custom vectorizer_args here
            'nr_topics': kwargs.pop('nr_topics', "auto"),
            'calculate_probabilities': kwargs.pop('calculate_probabilities', True),
            **kwargs # Pass any remaining kwargs directly to BERTopicWrapper
        }
        pipeline_steps.append(('topic_modeler', BERTopicWrapper(**bertopic_kwargs)))

        # Any remaining kwargs are ignored if not consumed by model-specific initializations
        if kwargs:
            print(f"Warning: Unused keyword arguments passed to pipeline: {kwargs}")

        return Pipeline(pipeline_steps)

    def fit(self, X, y=None):
        """Fits the entire pipeline to the input data."""
        print(f"\n--- Fitting {self.model_type.upper()} Topic Modeling Pipeline ---")
        self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        """Transforms the input data and returns topic assignments/distributions."""
        return self.pipeline.transform(X)

    def get_topic_model(self):
        """Returns the underlying fitted topic model (LDA or BERTopic)."""
        return self.pipeline.named_steps['topic_modeler'].get_model()

    def get_vectorizer(self):
        """Returns the fitted vectorizer (TF-IDF for LDA, None for BERTopic)."""
        if self.model_type == 'lda':
            return self.pipeline.named_steps['tfidf_vectorizer']
        return None


In [63]:
from hdbscan import HDBSCAN

output_dir = "data/temp/leslie_topic_modelling_fine_tuning"
os.makedirs(output_dir, exist_ok=True)
no_top_words = 10
bertopic_pipeline_instance = TopicModelingPipeline(
    embedding_model='all-MiniLM-L6-v2',
    model_type='bertopic',
    nr_topics=8,
    calculate_probabilities=True,
    # umap_args={'n_neighbors': 15, 'n_components': 5},
    vectorizer_args={'min_df': 1},
    stop_words=gs_stopwords,
    abbreviations=abbreviations
)
bertopic_pipeline_instance.fit(grouped_gs_qna_df['content'])
bertopic_model = bertopic_pipeline_instance.get_topic_model()

output_filename_bertopic = f"{output_dir}/bertopic_topics.txt"
with open(output_filename_bertopic, 'w', encoding='utf-8') as f:
    f.write("--- BERTopic Model ---\n\n")
    f.write("Interpreting Topics:\n")
    display_topics(bertopic_model, no_top_words=no_top_words, file=f)
print(f"BERTopic Topics saved to {output_filename_bertopic}")

# Assign dominant topics for BERTopic
bertopic_topic_assignments = bertopic_pipeline_instance.transform(grouped_gs_qna_df['content'])
grouped_gs_qna_df['dominant_topic_bertopic'] = bertopic_topic_assignments



--- Fitting BERTOPIC Topic Modeling Pipeline ---
Starting Phase 1: Preprocessing...
Preprocessing complete.

Starting Phase 3: Topic Modeling (BERTopic)...
BERTopic model fitting complete.
BERTopic Topics saved to data/temp/leslie_topic_modelling_fine_tuning/bertopic_topics.txt
Starting Phase 1: Preprocessing...
Preprocessing complete.
Transforming data with fitted BERTopic model...
Transformation complete.


In [64]:
bertopic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,64,-1_client_thing_term_market,"[client, thing, term, market, growth, ﬁrm, pla...","[get question consumer pivot , break . let add..."
1,0,11,0_term_fee_alternative_growth,"[term, fee, alternative, growth, alt, fundrais...","[, great . , david dennis . couple question aw..."
2,1,74,1_client_market_give_growth,"[client, market, give, growth, thing, business...","[. . , betsy . . make platform solution . ..."


Conclusion and Recommendation
The addition of stopwords has certainly helped in some areas, making certain topics clearer. However, some conversational noise still persists, especially words related to the Q&A format or general conversational patterns. The term "Apple" continues to be grouped with "deposit" in some k values, which is still a bit puzzling without specific context.

Based on this comprehensive analysis, the most sensible k value is a trade-off between granularity, coherence, and minimizing "junk" topics.

k=8: Offers good clarity for key themes (Credit Card, Headcount/Severance, Asset/Fundraising), but is still quite broad and has some remaining conversational noise.
k=9: Introduces very strong "GSIB" and "Wealth Management" topics.
k=10: Shows strong "Investment/Platform" and "Fundraising" themes.
k=11: This k value demonstrates the best balance in this new set of runs.
It produces several very distinct and interpretable financial/business topics: "Wealth Management/European Footprint" , "Severance/Headcount/Capital" , "Credit Card/Consumer" , "GSIB/Allocation" , "Bank/Acquisition/Advisory" , "FICC/Equity/Commodity" , and "Deposit/Capital/Market/Exposure".

Crucially, the "Apple" anomaly is not present in the top words of any topic for k=11, suggesting a cleaner separation of terms.
While some conversational noise is still present (Topics 2, 4, 9 in k=11), the quality of the interpretable topics is high.
k=12, k=13, k=14: Beyond k=11, the topics generally become more fragmented, or reintroduce the "Apple" anomaly, and the number of less coherent/conversational topics increases, making overall interpretation more challenging. For example, k=12 recombines "funding/deposits" with "severance/headcount", which is less ideal.
Therefore, my strongest recommendation is k=11. It provides a good level of detail for key financial aspects of Goldman Sachs' earnings calls while offering significantly improved topic coherence and distinctiveness, and effectively mitigating some of the persistent noise terms seen in other k values.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
topic_labels_map = {
    0: "Strategic Positioning & Platform",
    1: "Wealth Management & European Markets",
    2: "Alternative Investments & Fee Income",
    3: "Headcount & Workforce Management",
    4: "Consumer Credit & Card Performance",
    5: "FICC & Equity Trading Performance",
    6: "Regulatory Capital & Institutional Allocation",
    7: "Client-Centric Growth & Solutions",
    8: "M&A, Valuations & Advisory",
    9: "FICC & Market Environment",
    10: "Deposits, Capital & Funding"
}

# Assign labels to the topics_data
for topic_info in topics_data:
    topic_info['label'] = topic_labels_map.get(topic_info['topic_idx'], f"Unlabeled Topic {topic_info['topic_idx']}")

print("\n--- Topics with assigned labels and top words ---")
for topic_info in topics_data:
    print(f"Topic {topic_info['topic_idx'] + 1}: {topic_info['label']}")
    print(f"  Top Words: {' '.join(topic_info['top_words'])}")
    print("-" * 30)

# --- Visual Display of Topics ---
print("\n--- Generating visual display of topics ---")

n_cols = 3
n_rows = (num_topics + n_cols - 1) // n_cols
plt.figure(figsize=(n_cols * 6, n_rows * 4), dpi=100)

for i, topic_info in enumerate(topics_data):
    ax = plt.subplot(n_rows, n_cols, i + 1)
    df_plot = pd.DataFrame({
        'word': topic_info['top_words'],
        'weight': topic_info['word_weights']
    })
    df_plot = df_plot.sort_values(by='weight', ascending=True)
    sns.barplot(x='weight', y='word', data=df_plot, palette='magma', ax=ax)
    ax.set_title(f"{topic_info['label']}", fontsize=11, fontweight='bold', pad=10)
    ax.set_xlabel("Word Importance (Weight)", fontsize=9)
    ax.set_ylabel("")
    ax.tick_params(axis='both', which='major', labelsize=8)
    sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
    ax.tick_params(axis='y', length=0)

plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.suptitle(
    f"Top Words for {num_topics} Topics in Goldman Sachs Earnings Calls (Q&A Section)",
    y=1.00, fontsize=16, fontweight='bold'
)
plt.show()

print("\nVisual display generated. Please review the plots and verify the topic labels.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.countplot(
    data=grouped_gs_qna_df,
    x='year',
    hue='quarter',
    palette='tab10'
)
plt.xlabel('Year')
plt.ylabel('Number of Documents')
plt.title('Distribution of Documents by Year and Quarter\nGoldman Sachs Q&A')
plt.legend(title='Quarter')
plt.tight_layout()
plt.show()

# Now, for dominant topics over both year and quarter:
plt.figure(figsize=(16, 7))
sns.countplot(
    data=grouped_gs_qna_df,
    x='dominant_topic_k11',
    hue='year',
    palette='tab10'
)
topic_labels = [topic_labels_map.get(i, f"Topic {i}") for i in sorted(grouped_gs_qna_df['dominant_topic_k11'].unique())]
plt.xticks(ticks=range(len(topic_labels)), labels=topic_labels, rotation=45, ha='right')
plt.xlabel('Dominant Topic (k=11)')
plt.ylabel('Number of Documents')
plt.title('Dominant Topics (k=11) by Year\nGoldman Sachs Q&A')
plt.legend(title='Year')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

years = sorted(grouped_gs_qna_df['year'].unique())
n_years = len(years)

fig, axes = plt.subplots(n_years, 1, figsize=(10, 4 * n_years), sharex=True)

if n_years == 1:
    axes = [axes]

for idx, year in enumerate(years):
    ax = axes[idx]
    data = grouped_gs_qna_df[grouped_gs_qna_df['year'] == year]
    sns.countplot(
        data=data,
        x='quarter',
        hue='dominant_topic_k11',
        palette='tab10',
        ax=ax
    )
    ax.set_title(f'Distribution of Dominant Topics (k=11) by Quarter - {year}')
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Number of Documents')
    ax.legend(
        title='Dominant Topic',
        loc='upper right',
        labels=[topic_labels_map.get(i, f"Topic {i}") for i in sorted(data['dominant_topic_k11'].unique())]
    )

# Fix color mapping so each topic always has the same color across years
unique_topics = sorted(grouped_gs_qna_df['dominant_topic_k11'].unique())
topic_palette = sns.color_palette('tab10', n_colors=len(unique_topics))
topic_color_dict = {topic: topic_palette[i % len(topic_palette)] for i, topic in enumerate(unique_topics)}

for idx, year in enumerate(years):
    ax = axes[idx]
    data = grouped_gs_qna_df[grouped_gs_qna_df['year'] == year]
    # Use the same color mapping for all years
    sns.countplot(
        data=data,
        x='quarter',
        hue='dominant_topic_k11',
        palette=topic_color_dict,
        ax=ax
    )
    ax.set_title(f'Distribution of Dominant Topics (k=11) by Quarter - {year}')
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Number of Documents')
    handles, labels = ax.get_legend_handles_labels()
    # Always use the same order and labels for legend
    ordered_labels = [topic_labels_map.get(t, f"Topic {t}") for t in unique_topics]
    ax.legend(handles, ordered_labels, title='Dominant Topic', loc='upper right')

plt.suptitle(
    "Quarterly Distribution of Dominant Topics (k=11) by Year\nGoldman Sachs Earnings Call Transcript (Q&A Section)",
    fontsize=16, fontweight='bold', y=1.02
)
plt.tight_layout()
plt.show()

# Save Data Example

In [None]:
import pandas as pd

target_dir = 'data/temp/'
file_name = 'dummy_test_output_new.csv'
dummy_pf = pd.DataFrame({'from_colab': [IS_COLAB, True, 'hello']})


if OUTPUT_PROCESSED_FILES:
    if IS_COLAB:
        AUTHENTICATED_REPO_URL = REPO_URL.replace("https://", f"https://{GITHUB_USERNAME}:{GITHUB_TOKEN}@")
        dummy_pf.to_csv(f"{target_dir}{file_name}", index=False)

        # Configure Git user (important for committing)
        !git config user.email "{GITHUB_EMAIL}"
        !git config user.name "{GITHUB_USERNAME}"
        !git remote set-url origin {AUTHENTICATED_REPO_URL}

        # Add the file to staging
        !git add {target_dir}{file_name}
        print(f"Added '{target_dir}{file_name}' to staging.")

        # Commit the changes
        commit_message = f"Add new data file: {target_dir}{file_name}"
        !git commit -m "{commit_message}"
        print(f"Committed changes with message: '{commit_message}'")
        print(f"Attempted commit with message: '{commit_message}'")

        # Add this line to debug:
        print(f"Value of REPO_BRANCH before push: {REPO_BRANCH}")

        print("Pushing changes to GitHub. Please enter your GitHub username and Personal Access Token when prompted.")
        !git push --set-upstream origin {REPO_BRANCH} --force
        print("Push command executed. Check output for success or prompt.")
    else:
        dummy_pf.to_csv(f"{target_dir}{file_name}", index=False)
        print("Processed files saved successfully.")