# NLTK - WordCloud

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import seaborn as sns
import re

from wordcloud import WordCloud
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Fetch only the NLTK resources this notebook uses instead of the complete
# "all" collection:
#   punkt / punkt_tab -> word_tokenize
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
  nltk.download(nltk_resource, quiet=True)

In [None]:
# English stop words taken from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

# Extra words to drop that the stop-word list does not cover: number words,
# conversational filler and, presumably, names/terms specific to these
# transcripts (e.g. "credit", "suisse", "chf").
# NOTE(review): "you", "just" and "is" look like they may already be in the
# NLTK stop-word list - confirm before pruning.
excludes = ["one", "two", "three", "four", "five", "first", "second", "third", "fourth", "year", "twenty", "quarter", "thousand", "think",
            "question", "see", "also", "would", "thank", "you", "u", "chf", "yes", "sure", "just", "is", "morning", "begin", "actually", "clearly",
            "said", "look", "say", "obviously", "really", "credit", "suisse", "thomas", "group", "david", "c"]

# Shared WordNet lemmatizer instance used by the text-cleaning functions in
# the later cells.
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to clean the document and display word cloud.
def word_cloud_display(df, key):
  """Show a top-10 word-frequency bar chart and a word cloud for one column.

  The text in ``df[key]`` is lower-cased, concatenated, stripped of every
  non-letter character, tokenised, filtered against ``stop_words`` and
  ``excludes`` and lemmatised. Two matplotlib figures are then created: a
  bar chart of the ten most frequent lemmas and a word cloud built from all
  remaining lemmas.

  Args:
    df: DataFrame holding the transcripts.
    key: Name of the string column in ``df`` to analyse.

  Returns:
    None. If no word survives the filtering, a message is printed and no
    figure is created.
  """
  top_number_of_results = 50
  all_transcripts = df[key].str.lower().str.cat(sep=' ')
  # Only ASCII letters and spaces survive this substitution, so no separate
  # numeric-token check is needed afterwards.
  all_text = re.sub('[^A-Za-z]+', ' ', all_transcripts)

  # One set gives constant-time lookups instead of scanning two lists per token.
  excluded = set(stop_words).union(excludes)

  # Check the lemma as well as the surface form: inflected forms such as
  # "years" would otherwise pass the filter and re-appear as the excluded
  # word "year" after lemmatisation.
  filtered_all_transcripts = []
  for word in word_tokenize(all_text):
    if word in excluded:
      continue
    lemma = lemmatizer.lemmatize(word)
    if lemma not in excluded:
      filtered_all_transcripts.append(lemma)

  # WordCloud.generate raises on empty text, so stop early with a clear note.
  if not filtered_all_transcripts:
    print(f'No words left to plot for column "{key}" after filtering.')
    return

  word_distribution = nltk.FreqDist(filtered_all_transcripts)
  transcript_word_frequency_distribution_df = pd.DataFrame(
      word_distribution.most_common(top_number_of_results),
      columns=['Word', 'Frequency'],
  )

  # Figure 1: bar chart of the ten most frequent lemmas.
  plt.figure(figsize=(8,8))
  sns.set_style("whitegrid")
  sns.barplot(x = "Word", y = "Frequency", data = transcript_word_frequency_distribution_df.head(10))

  # Figure 2: word cloud over every remaining lemma.
  plt.figure(figsize = (60,60))
  wc = WordCloud(background_color = 'black', max_words = 1000,  max_font_size = 50)
  wc.generate(' '.join(filtered_all_transcripts))
  plt.imshow(wc)
  plt.axis('off')

In [None]:
bank = ''

# load the transcript file
def load_transcript(csv_path='/final_qa_df.csv'):
  """Load the transcript CSV into a DataFrame.

  Args:
    csv_path: Location of the CSV file. Defaults to the path this notebook
      originally hard-coded, so existing zero-argument calls are unaffected.

  Returns:
    The DataFrame produced by ``pd.read_csv(csv_path)``.
  """
  return pd.read_csv(csv_path)

transcript_df = load_transcript()

## WordCloud - Quarterly - Original Transcript

In [None]:
# word cloud for each quarter - Original transcript
transcript_grouped_df = transcript_df.groupby(by=["Year", "Quarter"])

for (year, quarter), groups in transcript_grouped_df:
  # Label the output so each pair of figures can be tied to its quarter.
  print(f"Year: {year}, Quarter: {quarter}")
  word_cloud_display(groups, "Dialogue")
  # Render this quarter's figures before moving on, so they appear directly
  # under their label.
  plt.show()

## WordCloud - Quarterly - Summarised Transcript

In [None]:
# word cloud for each quarter - Summarised transcript

for (year, quarter), groups in transcript_grouped_df:
  # Label the output so each pair of figures can be tied to its quarter.
  print(f"Year: {year}, Quarter: {quarter}")
  word_cloud_display(groups, "Summarised_dialogue")
  # Render this quarter's figures before moving on, so they appear directly
  # under their label.
  plt.show()

# BERTopic

In [None]:
# install BERTopic
!pip install bertopic

In [None]:
# import pipeline and BERTopic
from transformers import pipeline
from bertopic import BERTopic

In [None]:
#define function to clean transcript text
def clean_transcript_text(text):
    """Normalise one transcript into a space-separated string of lemmas.

    The text is lower-cased, every run of non-letter characters becomes a
    single space, and the result is tokenised. Tokens found in ``stop_words``
    or ``excludes`` are dropped; the remaining tokens are lemmatised and
    dropped as well if their lemma is excluded.

    The output is rebuilt from the kept tokens rather than edited in place
    with ``str.replace``: substring replacement would also rewrite matches
    inside longer words, and a ``" word "`` pattern misses words at the start
    or end of the text and directly repeated words.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text, with lemmas joined by single spaces. May be an
        empty string when nothing survives the filtering.
    """
    text = text.lower()
    text = re.sub('[^A-Za-z]+', ' ', text)

    # One set gives constant-time lookups instead of scanning two lists.
    excluded = set(stop_words).union(excludes)

    kept_lemmas = []
    for word in word_tokenize(text):
        if word in excluded:
            continue
        lemma = lemmatizer.lemmatize(word)
        # An inflected form can lemmatise to an excluded word ("years" -> "year").
        if lemma not in excluded:
            kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

# define function to clean every transcript in a list
def clean_transcripts(transcripts_list):
  """Clean every usable transcript in a list.

  An entry is kept only if it is a ``str`` other than the literal "nan";
  everything else (``None``, floats, ...) is skipped, so the result can be
  shorter than the input.

  Args:
    transcripts_list: Iterable of raw transcript values.

  Returns:
    A list with ``clean_transcript_text`` applied to each kept entry, in the
    original order.
  """
  return [
      clean_transcript_text(entry)
      for entry in transcripts_list
      if entry is not None and entry != "nan" and isinstance(entry, str)
  ]

In [None]:
# instantiate BERTopic model and fit with transcript
def get_bertopic_model(section, dialogue_col="Dialogue"):
  """Fit a BERTopic model on the cleaned "Answer" dialogues.

  Rows of the module-level ``transcript_df`` whose 'Text Type' is "Answer"
  are selected and, when ``section`` is truthy, narrowed to rows whose
  'Section' equals it. The chosen column is cleaned with
  ``clean_transcripts`` and used to fit a new model.

  Args:
    section: 'Section' value to keep; a falsy value (such as "") keeps all.
    dialogue_col: Column holding the text to model.

  Returns:
    The fitted ``BERTopic`` instance.
  """
  answer_rows = transcript_df.loc[transcript_df['Text Type'] == "Answer"]
  if section:
    answer_rows = answer_rows.loc[answer_rows['Section'] == section]

  cleaned_dialogues = clean_transcripts(answer_rows[dialogue_col].tolist())

  topic_model = BERTopic(verbose=False)
  # The value returned by fit_transform is not needed; only the fitted model is.
  topic_model.fit_transform(cleaned_dialogues)
  return topic_model

## Original Transcript

### BERTopic - Combined Sections

In [None]:
# "" (empty) for combined dialogues
model = get_bertopic_model("")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.get_topic(0)

In [None]:
model.get_topic(1)

In [None]:
model.visualize_barchart()

### BERTopic - Presentation

In [None]:
model = get_bertopic_model("Presentation")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.visualize_barchart()

### BERTopic - Question-and-Answer

In [None]:
model = get_bertopic_model("Question-and-Answer")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.visualize_barchart()

## Summarised Transcript

### BERTopic - Combined Sections

In [None]:
# "" (empty) for combined dialogues
model = get_bertopic_model("", "Summarised_dialogue")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.visualize_barchart()

### BERTopic - Presentation

In [None]:
model = get_bertopic_model("Presentation", "Summarised_dialogue")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.visualize_barchart()

### BERTopic - Question-and-Answer

In [None]:
model = get_bertopic_model("Question-and-Answer", "Summarised_dialogue")

In [None]:
# Display the most frequent topics
model.get_topic_freq().head(10)

In [None]:
model.visualize_barchart()

# Gensim - LDA

In [None]:
!pip install gensim nltk datasets pyLDAvis ipykernel

In [None]:
import string

import gensim
import gensim.corpora as corpora
import nltk
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# preprocess/clean the text
def preprocess(text):
    """Tokenise one document for LDA.

    The text is lower-cased, every run of characters other than ``a-z`` is
    replaced by a single space, and the result is tokenised. Tokens found in
    ``stop_words`` or ``excludes`` are removed.

    Args:
        text: Raw document text.

    Returns:
        The list of remaining tokens, in document order.
    """
    text = text.lower()
    # Replace (rather than delete) non-letters so that "year-end" or
    # "costs/income" split into separate words instead of fusing into
    # "yearend" / "costsincome". This also collapses whitespace runs.
    text = re.sub(r"[^a-z]+", " ", text).strip()

    # One set gives constant-time lookups instead of scanning two lists. No
    # punctuation check is needed: only letters and spaces remain.
    excluded = set(stop_words).union(excludes)
    return [word for word in word_tokenize(text) if word not in excluded]

In [None]:
# load transcript from the csv file
transcript_df = load_transcript()

# get the appropriate section of the transcript
def get_transcripts(section):
  """Return the "Answer" rows of the transcript, optionally for one section.

  Reads the module-level ``transcript_df``.

  Args:
    section: Value to match against the 'Section' column; a falsy value
      (such as "") keeps every section.

  Returns:
    The rows of ``transcript_df`` whose 'Text Type' is "Answer" and, when
    ``section`` is given, whose 'Section' equals it.
  """
  keep = transcript_df['Text Type'] == "Answer"
  if section:
    keep = keep & (transcript_df['Section'] == section)
  return transcript_df.loc[keep]

# Analyse each quarter
def get_topics_per_quarter(transcript_answers_df, dialogue_col="Dialogue", num_topics=5, random_state=None):
  """Fit one LDA model per (Year, Quarter) group and print its topics.

  For every group, the documents in ``dialogue_col`` are preprocessed, turned
  into a dictionary and bag-of-words corpus, and used to train an LDA model.
  The c_v coherence score, the value of ``log_perplexity`` and every topic
  are printed.

  Args:
    transcript_answers_df: DataFrame with 'Year', 'Quarter' and
      ``dialogue_col`` columns.
    dialogue_col: Column holding the text to model.
    num_topics: Number of LDA topics to fit per quarter.
    random_state: Optional seed forwarded to ``LdaModel`` for reproducible
      runs.

  Returns:
    None. Results are printed.
  """
  for (year, quarter), groups in transcript_answers_df.groupby(by=["Year", "Quarter"]):
    # Read the requested column (this used to be hard-coded to "Dialogue") and
    # skip non-string entries such as missing values, which preprocess cannot
    # lower-case.
    documents = [doc for doc in groups[dialogue_col].tolist() if isinstance(doc, str)]

    # Apply preprocessing
    processed_docs = [preprocess(doc) for doc in documents]

    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(processed_docs)
    if len(dictionary) == 0:
      # LdaModel cannot be trained on an empty vocabulary.
      print(f'Year: {year}, Quarter: {quarter}, no usable text in "{dialogue_col}" - skipped')
      continue

    # Create a bag-of-words corpus
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=2000,
        alpha='auto',
        eta='auto',
        iterations=400,
        num_topics=num_topics,
        passes=30,
        eval_every=None,
        random_state=random_state,
    )

    # Calculate the coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # NOTE: log_perplexity returns gensim's per-word likelihood bound (log
    # scale), not the perplexity itself - see the gensim LdaModel docs.
    print(f'Year: {year}, Quarter: {quarter}, Coherence Score: {coherence_score}, Perplexity: {lda_model.log_perplexity(corpus)}')

    for idx, topic in lda_model.print_topics(-1):
      print(f"Topic #{idx + 1}: {topic}")

## Original Transcript

### Gensim - LDA - Combined Sections

In [None]:
# "" (empty) for combined transcripts
transcript_answers_df = get_transcripts("")

# Raw dialogue text of every selected row.
documents = list(transcript_answers_df["Dialogue"])

# Tokenise and filter each document.
processed_docs = list(map(preprocess, documents))

In [None]:
# Detect bigrams in the corpus: learn candidate phrases from the tokenised
# documents, then wrap the result in a Phraser for applying them.
bigram = Phrases(processed_docs, min_count=2, threshold=10)  # Adjust min_count and threshold as needed
bigram_mod = Phraser(bigram)

# Apply the bigram model to every tokenised document; the results feed the
# dictionary and corpus built in the next cell.
bigram_docs = [bigram_mod[doc] for doc in processed_docs]

In [None]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(bigram_docs)

# Trim the vocabulary: drop tokens that occur in fewer than 2 documents or in
# more than half of them.
dictionary.filter_extremes(no_below=2, no_above=0.5)

# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, bigram_docs))


In [None]:
# Grid search best hyperparameters for LDA model using coherence_score
def compute_coherence_values(dictionary, corpus, processed_docs, limit=10, start=5, step=1, random_state=None):
    """Train LDA models over a range of topic counts and keep the most coherent.

    One ``LdaModel`` is trained for every ``num_topics`` in
    ``range(start, limit, step)`` and scored with c_v coherence.

    Args:
        dictionary: Gensim dictionary mapping token ids to tokens.
        corpus: Bag-of-words corpus built with ``dictionary``.
        processed_docs: Tokenised documents used for the coherence estimate.
        limit: Exclusive upper bound on the number of topics.
        start: First number of topics to try.
        step: Increment between topic counts.
        random_state: Optional seed forwarded to ``LdaModel`` for
            reproducible runs.

    Returns:
        A tuple ``(coherence_values, best_lda_model, best_coherence_score)``.
        ``coherence_values`` is a list of
        ``{"num_topics": ..., "Coherence Score": ...}`` dicts, one per trained
        model. When the range is empty, the model is ``None`` and the score
        is 0.
    """
    import math

    coherence_values = []

    best_coherence_score = 0
    best_lda_model = None

    for num_topics in range(start, limit, step):
        # Train the LDA model
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            chunksize=2000,
            alpha='auto',
            eta='auto',
            iterations=400,
            num_topics=num_topics,
            passes=30,
            eval_every=None,
            random_state=random_state,
        )

        # Calculate the coherence score
        coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model_lda.get_coherence()

        coherence_values.append({"num_topics": num_topics,  "Coherence Score": coherence_score})

        # Always accept the first trained model so callers never receive None
        # after a non-empty search, and let any real score displace a NaN one
        # (NaN never compares greater or smaller than anything).
        if (
            best_lda_model is None
            or math.isnan(best_coherence_score)
            or coherence_score > best_coherence_score
        ):
            best_coherence_score = coherence_score
            best_lda_model = lda_model

    return coherence_values, best_lda_model, best_coherence_score

In [None]:
# Run the grid search over topic counts; keep the best model and its score.
coherence_values, lda_model, coherence_score = compute_coherence_values(dictionary, corpus, bigram_docs)

# One row per candidate topic count with its coherence score.
coherence_values_df = pd.DataFrame(coherence_values)

# NOTE: log_perplexity returns gensim's per-word likelihood bound (log scale),
# not the perplexity itself; per the gensim docs the perplexity estimate is
# 2 ** (-bound), so a bound closer to zero means a better fit.
print('Perplexity: ', lda_model.log_perplexity(corpus))  # per-word likelihood bound: higher (closer to zero) is better.
print('Coherence Score: ', coherence_score)


In [None]:
# Print the topics and their words
for topic_index, topic_terms in lda_model.print_topics(-1):
    # Topic ids are zero-based; show them one-based.
    print(f"Topic #{topic_index + 1}: {topic_terms}")

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the visualisation of the best LDA model from the grid search, using
# the same corpus and dictionary it was trained on, and display it inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

### Gensim - LDA - Combined - per quarter

In [None]:
# "" (empty) for combined transcripts
transcript_answers_df = get_transcripts("")
get_topics_per_quarter(transcript_answers_df)

### Gensim - LDA - Presentation - per quarter

In [None]:
transcript_answers_df = get_transcripts("Presentation")
get_topics_per_quarter(transcript_answers_df)

### Gensim - LDA - Question-and-Answer - per quarter

In [None]:
transcript_answers_df = get_transcripts("Question-and-Answer")
get_topics_per_quarter(transcript_answers_df)

## Summarised Transcript

### Gensim - LDA - Combined - per quarter

In [None]:
transcript_answers_df = get_transcripts("")
get_topics_per_quarter(transcript_answers_df, "Summarised_dialogue")

### Gensim - LDA - Presentation - per quarter

In [None]:
transcript_answers_df = get_transcripts("Presentation")
get_topics_per_quarter(transcript_answers_df, "Summarised_dialogue")

### Gensim - LDA - Question-and-Answer - per quarter

In [None]:
transcript_answers_df = get_transcripts("Question-and-Answer")
get_topics_per_quarter(transcript_answers_df, "Summarised_dialogue")