# BERTopic Experiment Notebook - Reduced Dataset for Quick Processing

## Import Data

In [None]:
# Imports
import pandas as pd 

# Dataset path
# Data downloaded from https://www.consumerfinance.gov/data-research/consumer-complaints/#download-the-data
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data_path = r"C:\TEMP\REPO\Topic_Modelling\Topic_Modelling\BERTopic\datasets\complaints.csv"

# Columns to load from the CSV
cols_to_keep = ['Date received','Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative', 'Company public response', 'Company response to consumer', 'Tags', 'Company']

# Keep narratives of length <= 5000 characters (as per max form character input)
MAX_NARRATIVE_CHARS = 5000

# Ingest the data in chunks and filter each chunk before concatenating.
# The narrative column is read as str so the .str accessor also works on a chunk
# in which every narrative is missing.
chunks = pd.read_csv(data_path,
                     usecols=cols_to_keep,
                     dtype={'Consumer complaint narrative': str},
                     chunksize=5000)

# Rows with a missing narrative compare as False and are therefore dropped here too.
df = pd.concat(
    chunk[chunk['Consumer complaint narrative'].str.len() <= MAX_NARRATIVE_CHARS]
    for chunk in chunks
)

df.head()

In [None]:
# Filter by one company's complaints only.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `df` (which raises SettingWithCopyWarning).
df_date = df[df['Company'] == 'WELLS FARGO & COMPANY'].copy()

# Drop rows where customer complaints were null
df_date = df_date.dropna(subset=['Consumer complaint narrative'])

# Create year-month column to filter by
df_date['Date received'] = pd.to_datetime(df_date['Date received'])
df_date['year_month'] = df_date['Date received'].dt.to_period('M')

# Display row counts per month
grouped_df = df_date.groupby('year_month').size().reset_index(name='counts')
# NOTE(review): monthly periods are compared against the string '2023';
# confirm whether January 2023 is meant to be included by this strict comparison.
filtered_df = grouped_df[grouped_df['year_month'] > '2023']
filtered_df

In [None]:
# Filter to the latest complaint data to bring volumes down to something more realistic.
# .copy() makes `rdf` an independent frame: later cells add columns to it, which
# would otherwise be writes into a slice of `df_date`.
rdf = df_date[(df_date['year_month'] > '2024-06')].copy()
rdf.shape

In [None]:
# Preview the first rows of the reduced frame
rdf.head()

In [5]:
# DataFrame display settings so long narratives can be read in full
pd.options.display.max_colwidth = None  # None -> cell text is never truncated
pd.options.display.width = 1000  # large display width so rows are not wrapped early

In [None]:
# EDA and an initial model output showed that redacted personal information (runs of 'X')
# was prevalent enough to come through into the topic output. Therefore:
#   1. replace redacted dates (XX/XX/XXXX, or XX/XX/<4-digit year>) with [DATE]
#   2. remove the remaining redaction runs of 'X'
#   3. replace any leftover '//' with a space
narrative_preprocessed = (
    rdf['Consumer complaint narrative']
    .str.replace(r'XX/XX/(?:\d{4}|XXXX)', '[DATE]', regex=True)
    # Only runs of two or more X's are removed, so ordinary upper-case words that
    # contain a single X (e.g. "TAX") are left intact.
    .str.replace(r'X{2,}', '', regex=True)
    .str.replace('//', ' ', regex=False)
)

# assign() returns a new frame, so this never writes into a slice of another frame.
rdf = rdf.assign(**{'Consumer complaint narrative preprocessed': narrative_preprocessed})
rdf.head(1)

## Online resources for improving the performance of BERTopic 
1. https://github.com/MaartenGr/BERTopic/issues/423 - Reducing the number of outliers
2. https://maartengr.github.io/BERTopic/faq.html - FAQ's (including reducing the number of outliers)
3. https://github.com/UKPLab/sentence-transformers/issues/888 - Using SentenceTransformers offline
4. https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#finding-similar-topics-between-models - Tips and tricks (Further approaches for refining the outputs and improving performance)





## Experiment 1
### Train BERTopic - Initial Test
Data volumes ~ 1300 documents/complaints

In [95]:
# Documents for Experiment 1: the preprocessed narratives as a plain Python list
docs = rdf['Consumer complaint narrative preprocessed'].to_list()

In [None]:
from bertopic import BERTopic
# from keybert import KeyBERT

# Initialize BERTopic with all default settings (baseline run)
topic_model = BERTopic()

# Fit the model on the text data; returns one topic id per document plus probabilities
topics, probs = topic_model.fit_transform(docs)

# Get topic information (summary table of the fitted topics)
topic_info = topic_model.get_topic_info()

# Display the topics
print(topic_info)

### Experiment 1 Results
1. Only 1 topic was generated from a count of 29 complaints
2. Volumes may be too low to get any meaningful embeddings
3. Topic includes pretty much all stop words and no meaningful insights 
4. The visual shows sparsity in the output topics meaning either the data requires preprocessing or there is an issue with the clustering algorithm on this data.

Poor preprocessing can lead to poor topic modelling results with BERTopic. Preprocessing techniques like removing stopwords, stemming and lemmatization could be tried.
Preprocessing can help reduce noise and improve embedding quality ensuring the embeddings capture the essential semantics of the text. 

In [9]:
# Change the formatting of the Topic Names: build labels from the top 4 words of each
# topic, joined with " - ", and register them on the model as its custom labels
topic_labels = topic_model.generate_topic_labels(nr_words=4, separator=" - ")
topic_model.set_topic_labels(topic_labels)

In [None]:
# Visualise Topics
# custom_labels is a flag: True uses the labels registered earlier via set_topic_labels
# (the label list itself is not a valid value for this parameter).
topic_model.visualize_barchart(width=350, height=430, top_n_topics=8, n_words=10, custom_labels=True)

In [None]:
# Documents and Topics
# NOTE(review): topic ids 0-16 are hard-coded here; confirm they match the topics
# this particular model actually produced.
topic_model.visualize_documents(docs, topics=list(range(17)), custom_labels=True, height=600)

### Experiment 1.1 - Reducing min_topic_size to reduce the number of documents placed into each cluster 
1. This has significantly improved the interpretability of the model outputs, the topics and the topic representations
2. min_topic_size should be reduced, and it makes sense to specify it explicitly for smaller volumes of documents (~1300)

In [None]:
from bertopic import BERTopic
# from keybert import KeyBERT

# Initialize BERTopic with an explicit min_topic_size of 10 for this small corpus
# (this rebinds `topic_model`, replacing the Experiment 1 model)
topic_model = BERTopic(min_topic_size=10)

# Fit the model on the text data
topics, probs = topic_model.fit_transform(docs)

# Get topic information
topic_info = topic_model.get_topic_info()

# Display the topics
print(topic_info)

## Experiment 2
Removing Stopwords due to poor topic interpretability in Experiment 1

In [None]:
import nltk
from nltk.corpus import stopwords

# Work on an independent copy: a plain `srdf = rdf` is only an alias, so removing
# stopwords from srdf would also overwrite the preprocessed column of rdf.
srdf = rdf.copy()

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define stopwords (a set, for fast membership tests)
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text, stop_word_set=None):
    """Return `text` with stopwords removed.

    The text is split on whitespace. A token is dropped when its lower-cased form,
    with leading/trailing punctuation stripped, is a stopword, so "the," and "And."
    are removed as well as "the" and "and". Kept tokens are returned unchanged
    (original case and punctuation), joined by single spaces.

    Parameters
    ----------
    text : str
        The document to filter.
    stop_word_set : collection of str, optional
        Lower-case stopwords to remove. Defaults to the notebook-level `stop_words`.

    Returns
    -------
    str
        The filtered document.
    """
    from string import punctuation  # local import keeps the function self-contained

    if stop_word_set is None:
        stop_word_set = stop_words

    kept_tokens = [
        token for token in text.split()
        if token.lower().strip(punctuation) not in stop_word_set
    ]
    return ' '.join(kept_tokens)

# Apply the function to the DataFrame
srdf['Consumer complaint narrative preprocessed'] = srdf['Consumer complaint narrative preprocessed'].apply(remove_stopwords)

# Preview the result (head must be called; a bare `srdf.head` only shows the bound method)
srdf.head()


In [None]:
# Documents for Experiment 2: narratives with stopwords removed
docs = srdf['Consumer complaint narrative preprocessed'].to_list()

from bertopic import BERTopic

# Initialize BERTopic (default settings)
topic_model_2 = BERTopic()

# Fit the model on the text data
topics, probs = topic_model_2.fit_transform(docs)

# Get topic information
topic_info = topic_model_2.get_topic_info()

# Change the formatting of the Topic Names
topic_labels = topic_model_2.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_2.set_topic_labels(topic_labels)

# Display the topics.
# custom_labels is a flag: True uses the labels registered just above via set_topic_labels.
topic_model_2.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)

In [None]:
# Print the topic summary table captured when the Experiment 2 model was fitted
print(topic_info)

### Experiment 2 Results
1. Removing stopwords seems to have improved performance significantly, based on visual analysis of the output topics
2. Topics and topic representations (words) seem to be distinct, interpretable themes 
3. 16, non-outlier topics
4. Visualisation below shows distinct clusters of documents relating to their representations
5. A few representations include duplicate words with similar stems, e.g. Fraud - Fraudulent, Check - Checks. This could potentially be solved by pre-processing using stemming and lemmatisation

In [None]:
# Documents and Topics
# NOTE(review): topic ids 0-16 are hard-coded; confirm they match the topics this model produced.
topic_model_2.visualize_documents(docs, topics=list(range(17)), custom_labels=True, height=600)

## Experiment 3
n-gram range = (1,3-5)

In [None]:
# Change n-grams to (1,3) without re-fitting entire model - The default n-gram range for BERTopic is (1, 2).
topic_model_2.update_topics(docs, n_gram_range=(1,3))
# Regenerate the labels from the updated representations; otherwise the chart titles
# would still show the labels built before update_topics was called.
topic_labels = topic_model_2.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_2.set_topic_labels(topic_labels)
# Display the topics (custom_labels=True uses the labels registered just above)
topic_model_2.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)

### Experiment 3 outputs - visual analysis
1. Using an n-gram range of (1,3) over the default of (1,2) allows for a few combinations of the words we have already seen in the representations, which in my opinion gives a bit more interpretability. E.g. for topic 1 - 'debit card' (as opposed to 'debit' by itself - meaning debit into an account) and topic 6 - 'closed account' & 'credit card' (as opposed to 'credit' by itself - meaning credit to an account or credit reporting)
2. 'wells', 'fargo' and 'wells fargo' appear potentially too often in each of the topics so it's worth removing them as stopwords.
3. There is very little difference between the topics and their representations for (1,3) and (1,4), but for (1,5) the topics, their representations and the latent meaning behind each do seem to shift, which is interesting


In [None]:
# Change n-grams to (1,4) without re-fitting entire model - The default n-gram range for BERTopic is (1, 2).
topic_model_2.update_topics(docs, n_gram_range=(1,4))
# Regenerate the labels from the updated representations; otherwise the chart titles
# would still show the labels built before update_topics was called.
topic_labels = topic_model_2.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_2.set_topic_labels(topic_labels)
# Display the topics (custom_labels=True uses the labels registered just above)
topic_model_2.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)

In [None]:
# Change n-grams to (1,5) without re-fitting entire model - The default n-gram range for BERTopic is (1, 2).
topic_model_2.update_topics(docs, n_gram_range=(1,5))
# Regenerate the labels from the updated representations; otherwise the chart titles
# would still show the labels built before update_topics was called.
topic_labels = topic_model_2.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_2.set_topic_labels(topic_labels)
# Display the topics (custom_labels=True uses the labels registered just above)
topic_model_2.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)

## Experiment 4
Stemming and Lemmatisation for n-gram range (1,3) 

1. Lemmatization First: Converts words to their base forms considering the context.
2. Stemming Second: Further reduces words to their root forms if needed.

Note: Lemmatization should be applied first to ensure that words are converted to their base forms considering the context, and then stemming can be applied if further reduction is needed

In [97]:
# Remove Wells Fargo as a stopword
import re

def remove_custom_stopwords(text, stopwords):
    """Remove whole-word, case-insensitive occurrences of `stopwords` from `text`.

    Terms are tried longest first, so a multi-word phrase such as 'wells fargo' is
    matched as a unit rather than being pre-empted by its shorter part 'wells'.
    Whitespace is normalised afterwards so removed terms do not leave double spaces.

    Parameters
    ----------
    text : str
        The document to clean.
    stopwords : iterable of str
        Words or phrases to remove. (The name shadows the `nltk.corpus.stopwords`
        import inside this function only; it is kept for caller compatibility.)

    Returns
    -------
    str
        The cleaned document with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Deduplicate, ignore empty entries, and order longest-first (ties alphabetical)
    terms = sorted({term for term in stopwords if term}, key=lambda term: (-len(term), term))
    if terms:
        # \b anchors keep partial matches (e.g. 'well' inside 'wellness') untouched
        pattern = re.compile(r'\b(?:' + '|'.join(re.escape(term) for term in terms) + r')\b', re.IGNORECASE)
        text = pattern.sub('', text)
    # Collapse the gaps left behind by removed terms
    return ' '.join(text.split())

# List of custom stopwords (the company name, which appears across the topics)
custom_stopwords = ['wells', 'fargo', 'wells fargo', 'well']

# Start from the narratives that already had the NLTK stopwords removed
docs = srdf['Consumer complaint narrative preprocessed'].to_list()

# One cleaned string per document, in the same order as `docs`
cleaned_complaints = [remove_custom_stopwords(text, custom_stopwords) for text in docs]


##### Lemmatization Only - Contextual reduction of words to their base/dictionary form
The aim is to reduce repeated, near-identical words in the topic representations.

1. Stopwords were already previously removed (Including Wells Fargo)
2. Applied lemmatization only (Not Stemming)
3. Applied min_topic_size=10 & n_gram_range=(1,3)

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer:
    """Callable tokenizer: splits a document with `word_tokenize` and lemmatizes each token."""

    def __init__(self):
        # One WordNet lemmatizer instance is reused for every document
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens for `doc`."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

import nltk

# LemmaTokenizer needs these NLTK resources; downloading them here lets this cell
# run on a fresh kernel (they were otherwise only fetched in a later cell).
nltk.download('punkt_tab')
nltk.download('wordnet')

# BERTopic does not apply its own n_gram_range when a custom vectorizer is passed,
# so the range must be set on the CountVectorizer itself for (1,3) to take effect.
N_GRAM_RANGE = (1, 3)
vectorizer_model = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=N_GRAM_RANGE)
topic_model_lemstem = BERTopic(vectorizer_model=vectorizer_model, min_topic_size=10, n_gram_range=N_GRAM_RANGE)

# Fit the model on the text data
topics, probs = topic_model_lemstem.fit_transform(cleaned_complaints)

# Get topic information
topic_info_lemstem = topic_model_lemstem.get_topic_info()

# Change the formatting of the Topic Names
topic_labels_lemstem = topic_model_lemstem.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_lemstem.set_topic_labels(topic_labels_lemstem)

# Display the topics.
# custom_labels is a flag: True uses the labels registered just above via set_topic_labels.
topic_model_lemstem.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)


In [None]:
# Print the topic summary table of the lemmatisation-only model
print(topic_model_lemstem.get_topic_info())

In [None]:
# Documents and Topics
# Pass the documents this model was fitted on (cleaned_complaints), not the earlier `docs` list.
# NOTE(review): topic ids 0-16 are hard-coded; confirm they match the topics this model produced.
topic_model_lemstem.visualize_documents(cleaned_complaints, topics=list(range(17)), custom_labels=True, height=600)

#### Results of Lemmatising only experiment
1. Highest performance on interpretability seen out of all the experiments so far
2. Topic 0 can be further refined, as its topic representations are vague and volumes are high - the visualisation shows a dispersed cluster of complaints/documents, which further reinforces this
3. -1, outlier, topic can be further refined too.  

#### Further Outputs and visualisations for this experiment

In [None]:
# Word Cloud for Topic 3
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    """Draw a word cloud for one topic of a fitted topic model.

    The (word, value) pairs returned by ``model.get_topic(topic)`` are used as the
    word frequencies; the image is shown with matplotlib, axes hidden.
    """
    frequencies = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Show wordcloud for topic 3 of the lemmatisation-only model
create_wordcloud(topic_model_lemstem, topic=3)


In [None]:
# View 10 complaints assigned to topic 3
TOPIC_ID = 3
N_EXAMPLES = 10

document_info_df = topic_model_lemstem.get_document_info(docs=cleaned_complaints)
# head(N_EXAMPLES) returns the 10 rows the comment promises (a [0:9] slice returns only 9);
# .copy() makes the result independent so a column can be added without a
# SettingWithCopyWarning.
document_info_df = (
    document_info_df.loc[document_info_df['Topic'] == TOPIC_ID, ['Document', 'Topic', 'Name', 'Probability']]
    .head(N_EXAMPLES)
    .copy()
)

pre_docs = rdf['Consumer complaint narrative'].to_list()

# Link each document back to the original docs list.
# presumably the index of get_document_info's result is the document's position in
# cleaned_complaints (and therefore in pre_docs) - TODO confirm
document_info_df['Original Document'] = [pre_docs[idx] for idx in document_info_df.index]

# Display the updated DataFrame
document_info_df[['Document', 'Original Document', 'Topic', 'Name', 'Probability']].head(10)

In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (tokenizer tables and WordNet resources)
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# NOTE(review): `lancaster` is not referenced anywhere else in the visible notebook
lancaster = LancasterStemmer()

# Function to apply stemming and lemmatization
def lemmatize_and_stem(text):
    """Lower-case and tokenize `text`, then lemmatize and stem every token.

    Each token is lemmatized first and the lemma is then stemmed; the results are
    returned as one space-joined string.
    """
    reduced_tokens = []
    for token in word_tokenize(text.lower()):
        lemma = lemmatizer.lemmatize(token)
        reduced_tokens.append(stemmer.stem(lemma))
    return ' '.join(reduced_tokens)

# Apply the function to the list of docs (the complaints with custom stopwords removed)
processed_docs = [lemmatize_and_stem(complaints) for complaints in cleaned_complaints]

# Compare the first document before and after processing.
# Note that `docs` is the list from before the custom stopwords were removed.
print(docs[0])
print(processed_docs[0])


In [None]:
from bertopic import BERTopic

# Initialize BERTopic with the settings this experiment describes: n-gram range (1,3),
# plus the same min_topic_size as the lemmatisation-only run so both results are comparable.
# NOTE: this rebinds `topic_model_lemstem`, replacing the lemmatisation-only model fitted earlier.
topic_model_lemstem = BERTopic(min_topic_size=10, n_gram_range=(1, 3))

# Fit the model on the lemmatized and stemmed documents
topics, probs = topic_model_lemstem.fit_transform(processed_docs)

# Get topic information
topic_info_lemstem = topic_model_lemstem.get_topic_info()

# Change the formatting of the Topic Names
topic_labels_lemstem = topic_model_lemstem.generate_topic_labels(nr_words=4, separator=" - ")
topic_model_lemstem.set_topic_labels(topic_labels_lemstem)

# Display the topics.
# custom_labels is a flag: True uses the labels registered just above via set_topic_labels.
topic_model_lemstem.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)

## Experiment 5
KeyBERT with n-gram (1,3)

In [None]:
from bertopic.representation import KeyBERTInspired

# Initialize BERTopic with a KeyBERT-inspired representation model, per-topic
# probabilities enabled, and an n-gram range of (1, 3)
representation_model = KeyBERTInspired()
Key_topic_model = BERTopic(representation_model=representation_model, calculate_probabilities=True, n_gram_range=(1, 3))

# Documents: narratives with the NLTK stopwords removed
docs = srdf['Consumer complaint narrative preprocessed'].to_list()

# Fit the model on your documents
topics, probs = Key_topic_model.fit_transform(docs)

# # Save the trained topic model for later so training is not required again
# Key_topic_model.save(r"C:\TEMP\REPO\Topic_Modelling\Topic_Modelling\BERTopic\models\test_cf_complaints_KEYbertopic_model_small")

# Get topic information
key_topic_info = Key_topic_model.get_topic_info()

# Change the formatting of the Topic Names
key_topic_labels = Key_topic_model.generate_topic_labels(nr_words=4, separator=" - ")
Key_topic_model.set_topic_labels(key_topic_labels)

# Display the topics.
# custom_labels is a flag: True uses the labels registered just above via set_topic_labels.
Key_topic_model.visualize_barchart(width=400, height=430, top_n_topics=8, n_words=10, custom_labels=True)


In [None]:
# Print the topic summary table of the KeyBERT-inspired model
print(key_topic_info)