# BERTopic Walkthrough notebook

### Set-up

In [None]:
# !pip install --upgrade --quiet bertopic
# !pip install --upgrade --quiet google-cloud-aiplatform==1.41.0
# !pip install --upgrade --quiet langchain==0.1.6 langchain-google-vertexai==0.0.5
# !pip install --upgrade --quiet PyPDF==4.0.1
# !pip install --upgrade --quiet chromadb==0.4.22
# !pip install --upgrade --quiet ragas==0.1.3
# !pip install --upgrade --quiet tensorflow==2.15

In [None]:
# Restart kernel after installs so that your environment can access the new packages.
# NOTE(review): this shuts down the running kernel, so it presumably interrupts a
# "Run All" pass at this cell — confirm and re-run from the next cell afterwards.
import IPython
import time  # NOTE(review): not used in the visible cells — confirm before removing

# Fetch the running IPython application and ask its kernel to shut down;
# the True argument is the restart flag.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### Configurations

In [None]:
import os
PROJECT_ID = ""
# Get Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

In [None]:
# Derive the bucket name from the project ID, then its gs:// URL from the name.
BUCKET_NAME = f"playpen-basic-gcp_dv_npd-{PROJECT_ID}-bucket"
BUCKET_URL = f"gs://{BUCKET_NAME}"

print("Bucket NAME: ", BUCKET_NAME)
print("Bucket URL: ", BUCKET_URL)

In [None]:
# PDF blob referenced as Ref.[1].
# NOTE(review): not used by any other visible cell — confirm it is still needed.
FILE_BLOB = "rag/fg21-1.pdf"

print("FILE BLOB: ", FILE_BLOB)

In [None]:
# London
REGION = "europe-west2"

In [None]:
# To be updated per project and service account.
SERVICE_ACCOUNT = (
    "playpen-5b5a22-consumer-sa@playpen-5b5a22.iam.gserviceaccount.com"
)

### Initialise Vertex AI

In [None]:
import vertexai

# Initialise the Vertex AI SDK with the project and region configured above.
vertexai.init(
    location=REGION,
    project=PROJECT_ID,
)

## Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
from tqdm import tqdm

In [None]:
def get_all_pdf_links(entry_page_url):
    """Extract the pdf links listed on one search-results page.

    Parameters
    ----------
    entry_page_url : str
        URL of a single search-results page.

    Returns
    -------
    pandas.DataFrame
        One row per element with class ``search-result`` that has both an
        ``href`` attribute and an ``<h4>`` heading, with columns ``title``
        and ``url`` (absolute). Both columns are present even when the page
        lists no results.
    """
    from urllib.parse import urljoin

    base_url = "https://www.financial-ombudsman.org.uk/"

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=entry_page_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    records = []
    for pdf_link in soup.find_all(class_="search-result"):
        href = pdf_link.get("href")
        heading = pdf_link.find("h4")
        # Skip malformed entries instead of failing on None.
        if not href or heading is None:
            continue
        # urljoin handles relative, root-relative ("/x") and absolute hrefs;
        # plain concatenation produced "//" or a doubled host for the latter two.
        records.append({"title": heading.string, "url": urljoin(base_url, href)})

    return pd.DataFrame(records, columns=["title", "url"])

In [None]:
def get_fos_url(date_from: str = "2024-01-01", date_to: str = "2024-01-01", industry_sector_ID: str = "IndustrySectorID%5B1%5D=1"):
    """
    Collect the links to the (pdf) decision reports from the FOS decisions search.

    The search page is requested once to read the total result count, then
    every results page (10 results each, addressed with ``&Start=<offset>``)
    is passed to ``get_all_pdf_links`` and the rows are concatenated.

    Args:
        date_from: Value sent as the ``DateFrom`` query parameter.
        date_to: Value sent as the ``DateTo`` query parameter.
        industry_sector_ID: Pre-encoded query fragment inserted right after ``search?``.

    Returns:
        A DataFrame with the rows of every results page (empty, with columns
        ``title`` and ``url``, when the search reports 0 results), or ``None``
        when the result-count sentence is not found on the page.
    """
    results_per_page = 10

    entry_page_url = f"https://www.financial-ombudsman.org.uk/decisions-case-studies/ombudsman-decisions/search?{industry_sector_ID}&DateFrom={date_from}&DateTo={date_to}"
    # Regular expression pattern to match the desired sentence
    pattern = r"Your search returned (\d+) results"

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=entry_page_url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the matching sentence
    matching_sentence = soup.find(string=re.compile(pattern))
    if not matching_sentence:
        print("No matching sentence found.")
        return None

    # Extract the numeric value
    match = re.search(pattern, matching_sentence)
    result_count = int(match.group(1))
    print(f"Found {result_count} files.")

    # Ceiling division: int(n / 10) + 1 asked for one page too many whenever
    # the count was an exact multiple of the page size (and one page for 0).
    total_results_pages = (result_count + results_per_page - 1) // results_per_page

    # Gather the per-page frames and concatenate once rather than growing a
    # DataFrame inside the loop.
    page_frames = [
        get_all_pdf_links(f"{entry_page_url}&Start={page * results_per_page}")
        for page in tqdm(range(total_results_pages))
    ]
    if not page_frames:
        return pd.DataFrame(columns=["title", "url"])

    return pd.concat(page_frames, axis=0, ignore_index=True)

In [None]:
# Collect the decision links for 2023-12-25 to 2024-01-01.
# get_fos_url returns None when the result-count sentence is not found,
# so the cells below fail on None in that case.
pdf_url_df = get_fos_url(date_from="2023-12-25", date_to="2024-01-01")


In [None]:
# Preview the last rows of the collected links.
pdf_url_df.tail()


### Loading PDF Documents

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def check_url_exist(url):
    """Check that the url endpoint exists.

    Parameters
    ----------
    url : str
        Address to probe with a GET request.

    Returns
    -------
    bool
        True when the response status is 200 (``requests.codes.ok``);
        False for any other status or when the request itself fails.
    """
    try:
        # stream=True defers downloading the body, so only the status line and
        # headers are fetched; the context manager releases the connection.
        with requests.get(url=url, stream=True, timeout=30) as response:
            return response.status_code == requests.codes.ok
    except requests.RequestException:
        # Unreachable host, timeout, invalid URL, ...: treat as "does not exist"
        # so one bad link does not abort the whole loading loop.
        return False

In [None]:
docs = []

# Build the splitter once; its configuration is the same for every PDF.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1001,
    chunk_overlap=250,
    # Separators are matched literally unless is_separator_regex=True is set,
    # so the sentence separator has to be ". ". The regex-style "\. " looked
    # for a backslash before the dot and therefore never matched.
    separators=["\n\n", "\n", ". ", " ", ""],
)

for index, row in tqdm(pdf_url_df.iterrows(), total=pdf_url_df.shape[0]):
    # Skip links whose endpoint does not answer with 200.
    if not check_url_exist(row.url):
        continue

    # -- Loading a pdf file --
    pdf_url = row.url

    loader = PyPDFLoader(pdf_url)
    doc = loader.load()

    # Split the loaded document into overlapping chunks and collect them.
    splits = splitter.split_documents(doc)

    docs.extend(splits)

In [None]:
# Mirror each chunk's 'source' metadata entry under a 'file_name' key.
for doc in docs:
    source_value = doc.metadata['source']
    doc.metadata.update({'file_name': source_value})

In [None]:
# Turn documents into strings, ignoring the metadata.
# Read page_content directly rather than serialising each Document with
# .dict() only to pick a single field out of the result.
docs_str = [doc.page_content for doc in docs]

## Running initial BERTopic (no tuning)

In [None]:
from bertopic import BERTopic

# Define a BERTopic model with default settings and fit it on the chunk strings.
# fit_transform returns two values, bound here as `topics` and `probs`.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs_str)

In [None]:
# Show topic information for the untuned baseline model
topic_model.get_topic_info()

# BERTopic Full Process

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

In [None]:
# --- Components handed to BERTopic in the next cell ---

# Sentence embedding model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction (random_state fixed to 42)
umap_model = UMAP(
    n_components=5,
    n_neighbors=5,
    metric="cosine",
    min_dist=0.05,
    random_state=42,
)

# Clustering
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=5,
    prediction_data=True,
    cluster_selection_method="eom",
)

# Tokenizer with English stop words; ngram_range=(1,2) is left disabled
vectorizer_model = CountVectorizer(stop_words="english")

# Topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fine-tuned representation, registered under the "keybert" key
keybert_representation = dict(keybert=KeyBERTInspired())

We can experiment with the different options available here, notably the HDBSCAN parameters and the different topic representations.

We could also experiment with different sentence embeddings at the start.

In [None]:
# Assemble the BERTopic model from the components defined above,
# using the KeyBERT-inspired representation.
# NOTE(review): hdbscan_model is supplied explicitly, so min_topic_size
# presumably has no effect here — confirm against the BERTopic docs.
bert_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=keybert_representation,
    nr_topics="auto",
    min_topic_size=1,
    verbose=True,
)

In [None]:
# Fit the configured model on the chunk strings; the second return value is discarded.
topics, _ = bert_model.fit_transform(docs_str)

In [None]:
# Generate labels for the fitted topics and display them.
topic_labels = bert_model.generate_topic_labels(
    separator="_",
    topic_prefix=True,
    nr_words=5,
    word_length=24,
)
topic_labels

In [None]:
# Keep the result of get_topics() and display it.
topics_info = bert_model.get_topics()
topics_info

In [None]:
# Show topic information for the tuned model.
bert_model.get_topic_info()

Can we use built-in LLM capability to generate the labels instead?

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain_google_vertexai import VertexAI
from bertopic.representation import LangChain

In [None]:
# Question-answering chain built on the Vertex AI 'gemini-pro' model at temperature 0.2.
chain = load_qa_chain(VertexAI(model_name='gemini-pro', temperature=0.2))

In [None]:
# Wrap the chain as a BERTopic representation model, passing the prompt defined here.
prompt = "In three words, describe what these documents are about."
representation_model = LangChain(chain, prompt=prompt)

In [None]:
# --- Fresh component instances for the LLM-labelled model ---
# (same settings as the component cell used for the KeyBERT run)

# Sentence embedding model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction (random_state fixed to 42)
umap_model = UMAP(
    n_components=5,
    n_neighbors=5,
    metric="cosine",
    min_dist=0.05,
    random_state=42,
)

# Clustering
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=5,
    prediction_data=True,
    cluster_selection_method="eom",
)

# Tokenizer with English stop words; ngram_range=(1,2) is left disabled
vectorizer_model = CountVectorizer(stop_words="english")

# Topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Assemble a second BERTopic model, this time with the LangChain-based
# representation model.
# NOTE(review): hdbscan_model is supplied explicitly, so min_topic_size
# presumably has no effect here — confirm against the BERTopic docs.
bert_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics="auto",
    min_topic_size=1,
    verbose=True,
)

In [None]:
# Fit the LLM-labelled model on the chunk strings; the second return value is discarded.
topics, _ = bert_model.fit_transform(docs_str)

In [None]:
# Generate labels for the fitted topics and display them.
# The separator argument is left at its default here (separator="_" disabled).
topic_labels = bert_model.generate_topic_labels(
    topic_prefix=True,
    nr_words=5,
    word_length=24,
)
topic_labels

The current set-up is not functioning properly. Can we find a better prompt to help us?

Further extensions:


*   By default, each document is assigned only one topic. We can output the probabilities of each document belonging to a cluster. Can we generalise and improve the outputs?
*   BERTopic has built-in dynamic topic modelling. Could this be useful?

