In [None]:
#### Packages
import os
import random
import time
import tracemalloc

import nltk
import numpy as np
import openai
import pandas as pd
import psutil
import tiktoken
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import OpenAI
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from umap import UMAP

# Fetch the NLTK stop-word corpus; stopwords.words('dutch') is read from it in the next cell.
nltk.download('stopwords')


In [None]:
# Dutch stop words; passed to CountVectorizer as `stop_words` further down.
dutch_stopwords = stopwords.words('dutch')

# OpenAI client. The key is read from the OPENAI_API_KEY environment variable
# so that no credential is stored in (or leaked through) the notebook. A
# missing variable raises KeyError right here instead of failing later in the
# middle of model fitting.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Tokenizer for the chat model; handed to the representation model below.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# GPT-based representation model (used as "aspect1" of the topic representation).
GPT = OpenAI(
    client,
    model="gpt-3.5-turbo",
    delay_in_seconds=2,
    chat=True,
    nr_docs=4,
    doc_length=100,
    tokenizer=tokenizer
)

# Chained representation (used as "aspect2"): KeyBERT-inspired keywords
# followed by Maximal Marginal Relevance.
aspect_model2 = [KeyBERTInspired(top_n_words=20), MaximalMarginalRelevance(diversity=.5)]


In [None]:
# Set display: pandas rendering options, applied one by one from a single table.
DISPLAY_OPTIONS = {
    'display.max_rows': 100,
    'display.max_columns': 20,
    'display.max_colwidth': 100,
    'display.width': 1000,
    'display.precision': 3,
    'display.expand_frame_repr': False,
}

for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
##########################
# Load the review dataset into its own name so that `r4` only ever holds the
# list of documents (it previously held a DataFrame, then a list).
reviews_df = pd.read_csv('R4FINALPROCESS.csv')

# Build the corpus from the 'Review' column as plain strings.
# Rows without a review are dropped first: pandas reads empty cells as NaN and
# str(NaN) would otherwise inject the literal document "nan" into the corpus.
r4 = [str(doc) for doc in reviews_df['Review'].dropna()]


In [None]:
#
# --- Baseline: process handle plus CPU / resident-memory counters -----------
process = psutil.Process()
cpu_times_before = process.cpu_times()
memory_usage_before = process.memory_info().rss

tracemalloc.start()
start_time = time.perf_counter()

# --- Measured section: load the sentence encoder and embed every document ---
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(r4, show_progress_bar=True)

# --- Closing snapshot (wall clock first, then memory, then CPU) -------------
end_time = time.perf_counter()
memory_usage_after = process.memory_info().rss
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
cpu_times_after = process.cpu_times()

# --- Derived metrics --------------------------------------------------------
bytes_per_mb = 1024 ** 2
execution_time = end_time - start_time
user_time = cpu_times_after.user - cpu_times_before.user
system_time = cpu_times_after.system - cpu_times_before.system
cpu_seconds = user_time + system_time
# CPU time as a percentage of wall time, divided by the number of CPUs.
cpu_usage = cpu_seconds / execution_time * 100 / psutil.cpu_count()
memory_usage_difference = (memory_usage_after - memory_usage_before) / bytes_per_mb
memory_usage_peak = peak / bytes_per_mb

for report_line in (
    f"Execution time: {execution_time} seconds",
    f"CPU usage during execution: {cpu_usage}%",
    f"Peak memory usage during execution: {memory_usage_peak} MB",
    f"Memory usage difference during execution: {memory_usage_difference} MB",
):
    print(report_line)


In [None]:
def rescale(x, inplace=False):
    """Scale an embedding down so optimization will not have convergence issues.

    Every value is divided by ``10000 * std(first column)``.

    Parameters
    ----------
    x : array-like
        2-D embedding; it is indexed as ``x[:, 0]``.
    inplace : bool, default False
        When True, ``x`` itself is modified and returned; otherwise a copy
        is scaled and ``x`` is left untouched.

    Returns
    -------
    The rescaled array (``x`` itself when ``inplace`` is True).
    """
    target = x if inplace else np.array(x, copy=True)
    target /= np.std(target[:, 0]) * 10000
    return target

# Step 2: resource monitoring -- baseline CPU / memory counters
process = psutil.Process()
cpu_times_before = process.cpu_times()
memory_usage_before = process.memory_info().rss

tracemalloc.start()
start_time = time.perf_counter()

# Seed NumPy's global RNG before PCA is fitted
# (presumably so the projection is reproducible -- PCA gets no random_state here).
np.random.seed(28)

# 5-component PCA projection of the embeddings, rescaled with `rescale`.
pca_model = PCA(n_components=5)
pca_embeddings = rescale(pca_model.fit_transform(embeddings))

# Closing snapshot, taken in the order: wall clock, memory, CPU.
end_time = time.perf_counter()
memory_usage_after = process.memory_info().rss
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
cpu_times_after = process.cpu_times()

# Derived metrics.
bytes_per_mb = 1024 ** 2
execution_time = end_time - start_time
user_time = cpu_times_after.user - cpu_times_before.user
system_time = cpu_times_after.system - cpu_times_before.system
cpu_seconds = user_time + system_time
cpu_usage = cpu_seconds / execution_time * 100 / psutil.cpu_count()
memory_usage_difference = (memory_usage_after - memory_usage_before) / bytes_per_mb
memory_usage_peak = peak / bytes_per_mb

print(
    f"Execution time: {execution_time} seconds",
    f"CPU usage during execution: {cpu_usage}%",
    f"Peak memory usage during execution: {memory_usage_peak} MB",
    f"Memory usage difference during execution: {memory_usage_difference} MB",
    sep="\n",
)

In [None]:
#
# Baseline counters. Note that the measured section below only constructs the
# model objects; nothing is fitted inside this cell.
process = psutil.Process()
cpu_times_before = process.cpu_times()
memory_usage_before = process.memory_info().rss

tracemalloc.start()
start_time = time.perf_counter()

# Dimensionality reduction, initialised from the rescaled PCA projection.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    init=pca_embeddings,
    random_state=42,
)

# Vectorization: 1- to 3-grams with the Dutch stop-word list.
vectorizer_model = CountVectorizer(stop_words=dutch_stopwords, ngram_range=(1, 3))

# Topic representation: class-based TF-IDF.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)

# Closing snapshot, taken in the order: wall clock, memory, CPU.
end_time = time.perf_counter()
memory_usage_after = process.memory_info().rss
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
cpu_times_after = process.cpu_times()

# Derived metrics.
bytes_per_mb = 1024 ** 2
execution_time = end_time - start_time
user_time = cpu_times_after.user - cpu_times_before.user
system_time = cpu_times_after.system - cpu_times_before.system
cpu_seconds = user_time + system_time
cpu_usage = cpu_seconds / execution_time * 100 / psutil.cpu_count()
memory_usage_difference = (memory_usage_after - memory_usage_before) / bytes_per_mb
memory_usage_peak = peak / bytes_per_mb

for report_line in (
    f"Execution time: {execution_time} seconds",
    f"CPU usage during execution: {cpu_usage}%",
    f"Peak memory usage during execution: {memory_usage_peak} MB",
    f"Memory usage difference during execution: {memory_usage_difference} MB",
):
    print(report_line)

In [None]:
# 
# Baseline counters for the whole parameter sweep.
process = psutil.Process()
cpu_times_before = process.cpu_times()
memory_usage_before = process.memory_info().rss

tracemalloc.start()
start_time = time.perf_counter()

# HDBSCAN hyper-parameter grid.
min_cluster_sizes = np.arange(165, 326, 20)
min_samples_values = [3, 4, 5, 15, 30, 50, 100, 150]

# Every (min_cluster_size, min_samples) pair, cluster size varying slowest.
parameter_grid = (
    (cluster_size, samples)
    for cluster_size in min_cluster_sizes
    for samples in min_samples_values
)

for min_cluster_size, min_samples in parameter_grid:
    # Fresh clustering model for this parameter pair.
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        prediction_data=True,
    )

    # BERTopic pipeline; only the clustering step differs between iterations.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        language="multilingual",
        nr_topics="auto",
    )

    # Fit on the documents, reusing the precomputed embeddings.
    topics, probs = topic_model.fit_transform(r4, embeddings)

    # Topic -1 is counted as the outlier bucket and excluded from the topic count.
    unique_topics = set(topics)
    num_outliers = topics.count(-1)
    num_topics = len(unique_topics) - (1 if -1 in unique_topics else 0)

    print(
        f"Iteration with min_cluster_size={min_cluster_size}, min_samples={min_samples}",
        f"Number of outliers: {num_outliers}",
        f"Number of topics: {num_topics}",
        "-------------------------------------------------",
        sep="\n",
    )

# Closing snapshot, taken in the order: wall clock, memory, CPU.
end_time = time.perf_counter()
memory_usage_after = process.memory_info().rss
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
cpu_times_after = process.cpu_times()

# Derived metrics.
bytes_per_mb = 1024 ** 2
execution_time = end_time - start_time
user_time = cpu_times_after.user - cpu_times_before.user
system_time = cpu_times_after.system - cpu_times_before.system
cpu_seconds = user_time + system_time
cpu_usage = cpu_seconds / execution_time * 100 / psutil.cpu_count()
memory_usage_difference = (memory_usage_after - memory_usage_before) / bytes_per_mb
memory_usage_peak = peak / bytes_per_mb

print(
    f"Execution time: {execution_time} seconds",
    f"CPU usage during execution: {cpu_usage}%",
    f"Peak memory usage during execution: {memory_usage_peak} MB",
    f"Memory usage difference during execution: {memory_usage_difference} MB",
    sep="\n",
)


In [None]:
# HDBSCAN settings; both values are points of the grid explored in the sweep above.
MIN_CLUSTER_SIZE, MIN_SAMPLES = 205, 50

In [None]:
# KeyBERT-inspired representation with default settings; used as the "main" representation below.
keybert_model = KeyBERTInspired()


In [None]:
# Reducing dimensional space (initialised from the rescaled PCA projection)
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', init=pca_embeddings, random_state=42)

# Clustering -- apply the MIN_CLUSTER_SIZE / MIN_SAMPLES constants defined
# above. They were previously never passed, so the final model silently ran
# with HDBSCAN's library defaults instead of the chosen values.
hdbscan_model = HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    metric='euclidean',
    prediction_data=True,
)

# Vectorization: 1- to 3-grams with the Dutch stop-word list
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=dutch_stopwords)

# Topic Representation: class-based TF-IDF
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Representation Model: one main representation plus two additional aspects
representation_model = {
    "main": keybert_model,
    "aspect1": GPT,
    "aspect2": aspect_model2,
}

In [None]:
# MODEL FITTING
# Baseline CPU / memory counters for the final fit.
process = psutil.Process()
cpu_times_before = process.cpu_times()
memory_usage_before = process.memory_info().rss

tracemalloc.start()
start_time = time.perf_counter()

# Settings are collected in two dicts and unpacked into the constructor.
pipeline_models = dict(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)
hyperparameters = dict(
    language="multilingual",
    top_n_words=10,
    calculate_probabilities=True,
    nr_topics="auto",
    verbose=True,
)
topic_model = BERTopic(**pipeline_models, **hyperparameters)

# Seed NumPy's global RNG immediately before training.
np.random.seed(42)

# Train on the documents, reusing the precomputed embeddings.
topics, probs = topic_model.fit_transform(r4, embeddings)

# Closing snapshot, taken in the order: wall clock, memory, CPU.
end_time = time.perf_counter()
memory_usage_after = process.memory_info().rss
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
cpu_times_after = process.cpu_times()

# Derived metrics.
bytes_per_mb = 1024 ** 2
execution_time = end_time - start_time
user_time = cpu_times_after.user - cpu_times_before.user
system_time = cpu_times_after.system - cpu_times_before.system
cpu_seconds = user_time + system_time
cpu_usage = cpu_seconds / execution_time * 100 / psutil.cpu_count()
memory_usage_difference = (memory_usage_after - memory_usage_before) / bytes_per_mb
memory_usage_peak = peak / bytes_per_mb

for report_line in (
    f"Execution time: {execution_time} seconds",
    f"CPU usage during execution: {cpu_usage}%",
    f"Peak memory usage during execution: {memory_usage_peak} MB",
    f"Memory usage difference during execution: {memory_usage_difference} MB",
):
    print(report_line)

In [None]:
# Per-topic summary of the fitted model, wrapped in a DataFrame.
topic_info = pd.DataFrame(topic_model.get_topic_info())

In [None]:
# Preview the first 17 rows of the topic summary.
topic_info.head(17)

In [None]:
# Export the 'Representation' column to an HTML file for further analysis.
representation_table = topic_info[['Representation']]
representation_table.to_html('bert4.html')



In [None]:
# Hierarchical visualization, used to decide which topics to merge.
hierarchical_topics = topic_model.hierarchical_topics(r4)

# `visualize_hierarchy` is a method of the fitted model. The bare function
# name was never imported in this notebook, so the original call raised
# NameError.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Last expression of the cell, so the figure is actually rendered.
fig

In [None]:
# Groups of topic IDs to merge; each inner list becomes one merged topic.
topics_to_merge = [
    [2, 5],
    [6, 7],
    [4, 9],
]

# Function to merge topics in BERTopic
def merge_topics_bertopic(topic_model, documents, topics_to_merge):
    """Merge several groups of topics in a fitted BERTopic model.

    All groups are handed to ``topic_model.merge_topics`` in ONE call.
    Merging one group per call is incorrect: BERTopic re-sorts and renumbers
    the topics after every merge, so the IDs in the later groups would no
    longer refer to the topics that were picked from the original model.

    Parameters
    ----------
    topic_model
        Fitted model exposing ``merge_topics(docs, topics_to_merge)``.
    documents
        The documents the model was fitted on.
    topics_to_merge
        Iterable of topic-ID groups. Groups with fewer than two IDs are
        ignored, since there is nothing to merge.

    Returns
    -------
    The same ``topic_model`` object, updated in place by the merge.
    """
    groups = [list(group) for group in topics_to_merge if len(group) > 1]

    # Nothing to merge: leave the model untouched rather than calling
    # merge_topics with an empty list.
    if groups:
        topic_model.merge_topics(documents, groups)

    return topic_model

# Destination of the exported topic table.
csv_file_path = 'topic_infoR4.csv'

# Apply the merges; the helper hands back the model it was given.
topic_model = merge_topics_bertopic(topic_model, r4, topics_to_merge)

# Refresh the topic summary after merging and wrap it in a DataFrame.
topic_info = topic_model.get_topic_info()
df_topic_info = pd.DataFrame(topic_info)

# Write the table to CSV without the index column.
df_topic_info.to_csv(csv_file_path, index=False)