In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from utils.analysis_helpers import *
from bertopic import BERTopic

# Define the Documents

In [3]:
# Load the merged transcripts, then report the conditions and the number of
# interviews before and after dropping the rows that are not real interview material.
df_all = pd.read_csv("../Dataset/meditation_interviews/transcripts_merged.csv")

print(f"Unique conditions before filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews before filtering: {df_all['File Name'].nunique()}")

# Condition '0' marks material that is not a "real" interview (e.g., setup phase,
# small talk); only rows with any other condition are kept.
is_interview_row = df_all["Condition"].ne('0')
df_all = df_all.loc[is_interview_row]

print(f"Unique conditions after filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews (File Name) after filtering: {df_all['File Name'].nunique()}")

Unique conditions before filtering: ['1' 'C' 'I' '0']
Number of interviews before filtering: 82
Unique conditions after filtering: ['1' 'C' 'I']
Number of interviews (File Name) after filtering: 75


In [4]:
# Preprocessing
# Work on an explicit copy of the speaker-filtered rows: adding a column to a
# boolean-filtered frame otherwise raises pandas' SettingWithCopyWarning, and the
# copy guarantees that df_all is never modified by this cell.
# Focus only on the participant's or interviewer's speech or both
df = df_all[df_all["Speaker"] == "Participant"].copy() # Interviewer

# Focus only on some set of experiment or all
#df = df[df["Experiment"] != "Compassion"]

# Removal of stop-words & punctuation, lowercasing + lemmatization,
# plus customized stop-word removal (each word listed once).
extra_stopwords = [
    # Filler Words: Common conversational placeholders without thematic value
    "yeah", "okay", "yes", "mean", "oh", "ah", "like", "kind","kinda", "course", "way",
    # Vague/Ambiguous Words: Frequent but thematically irrelevant in conversations
    "think", "know", "really", "bit", "feel", "thing", "sort", "maybe", "little", "actually",
    "sure", "exactly", "tell", "ask", "people",
    # Broad terms or context-specific words overshadowing subtler themes
    "question", "sorry", "time", "first", "second", "later", "experience", "end", "meditation" #,"body"
]
# preprocess_text comes from utils.analysis_helpers; "yourself" and "myself" are
# passed as stop-words to retain.
df['preprocessed_content'] = df['Content'].apply(lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, retain_stopwords=["yourself", "myself"]))

# Remove rows whose preprocessed text is empty or whitespace-only
df = df[df['preprocessed_content'].str.strip().str.len() > 0]

In [5]:
# Collapse all rows that share the same interview (File Name) and turn_index into
# a single row, i.e. one document per turn.
turn_keys = ['File Name', 'turn_index']

# Text columns are concatenated with a single space between the pieces ...
turn_aggregations = {
    'Content': ' '.join,
    'preprocessed_content': ' '.join,
}
# ... while the metadata columns keep the first value found within the turn.
for metadata_column in ('Experiment', 'Condition', 'Id', 'Speaker'):
    turn_aggregations[metadata_column] = 'first'

df = df.groupby(turn_keys).agg(turn_aggregations).reset_index()
df.head(2)

Unnamed: 0,File Name,turn_index,Content,preprocessed_content,Experiment,Condition,Id,Speaker
0,ID 05,1,"So, that was very, let's say, unexpected and s...",let unexpected surprising moment realize disco...,OBE1,1,5,Participant
1,ID 05,3,"It was a little bit like, okay, well, so it's ...",watch outside special lot emotion explain moment,OBE1,1,5,Participant


In [6]:
# Keep the current row label of every turn as an explicit "Index" column.
df["Index"] = df.index

# The documents fed to the embedding model are the preprocessed turns; switch to
# the commented line below to embed the original (raw) content instead.
docs = df["preprocessed_content"].tolist()
#docs = list(df.Content)
print(len(docs))

668


# Define the model

In [7]:
from sentence_transformers import SentenceTransformer

# Encode all documents once up front; the resulting embeddings are handed to
# BERTopic when the model is fitted.
# Model choice: "all-mpnet-base-v2" is better but slower,
# "all-MiniLM-L6-v2" is the speed/quality trade-off.
embedding_model_name = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(embedding_model_name)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [8]:
# Controls how many topics are generated: this value is passed to HDBSCAN as
# `min_cluster_size` in the model-definition cell below.
# Increasing it reduces the number of topics.
# Adapt it with respect to the total number of documents (here: the number of turns).
min_cluster_size = 8

In [9]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Dimensionality reduction: project the embeddings to 8 components using cosine
# distance; the fixed random_state keeps runs repeatable.
umap_model = UMAP(
    n_neighbors=15,
    n_components=8,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering of the reduced embeddings; min_cluster_size is set in the cell above.
hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom',
)

# Optional representation model (currently disabled)
#representation_model = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=.5)]

# Optional vectorizer model, used when clustering is performed on the original content (currently disabled)
#stops_words = preprocess_text("sample",return_stopwords=True, extra_stopwords=extra_stopwords, retain_stopwords=["yourself", "myself"])
#vectorizer_model = CountVectorizer(stop_words=stops_words)

# Assemble the BERTopic pipeline from the components defined above.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    #vectorizer_model=vectorizer_model,
    #representation_model=representation_model,

    # Hyperparameters
    verbose=True,
)

In [10]:
# Fit the topic model on the documents, reusing the pre-computed embeddings.
topics, ini_probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Number of entries returned by get_topics().
# NOTE(review): the saved topic table lists an outlier topic -1; this count
# presumably includes that bucket - confirm before reporting it as "topics found".
topic_representations = topic_model.get_topics()
num_topics = len(topic_representations)
num_topics

2025-01-22 14:45:55,029 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-22 14:46:20,357 - BERTopic - Dimensionality - Completed ✓
2025-01-22 14:46:20,361 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-22 14:46:20,408 - BERTopic - Cluster - Completed ✓
2025-01-22 14:46:20,416 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-22 14:46:20,480 - BERTopic - Representation - Completed ✓


20

In [11]:
# Save the topic overview table to disk and display it.
# `os` is imported explicitly so this cell does not depend on whatever names the
# wildcard import at the top of the notebook happens to re-export.
import os

topic_output_dir = "outputs/topics"
os.makedirs(topic_output_dir, exist_ok=True)

# Build the table once and reuse it for both the CSV export and the display.
topic_info = topic_model.get_topic_info()
topic_info.to_csv(f"{topic_output_dir}/topic_names_info.csv", index=False)
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,206,-1_myself_look_body_real,"[myself, look, body, real, point, different, f...",[calm relax big natural comfortable environmen...
1,0,47,0_interesting_easy_nice_fine,"[interesting, easy, nice, fine, fun, absolutel...","[interesting, easy nice, nice interesting]"
2,1,46,1_focus_leg_eye_distract,"[focus, leg, eye, distract, try, easy, concent...",[focus myself focus instruction eye closed hea...
3,2,46,2_relax_fall_sleep_asleep,"[relax, fall, sleep, asleep, calm, relaxed, sl...","[calm want sleep, relaxed afraid fall sleep go..."
4,3,44,3_reality_body_different_yourself,"[reality, body, different, yourself, room, out...",[understand differently today want life illusi...
5,4,37,4_body_heavy_light_come,"[body, heavy, light, come, half, phrase, outsi...","[body, body, body]"
6,5,29,5_color_yellow_normal_difference,"[color, yellow, normal, difference, change, di...","[yellow color blue color, notice change color,..."
7,6,23,6_touch_delay_scene_image,"[touch, delay, scene, image, catch, body, touc...",[touch image touch touch difference touch imag...
8,7,23,7_virtual_body_vr_myself,"[virtual, body, vr, myself, hologram, actual, ...",[virtual body mind virtual body look myself ob...
9,8,23,8_forest_rock_tree_adventure,"[forest, rock, tree, adventure, indonesia, riv...",[nice forest love forest good surprise calm re...


In [12]:
# BERTopic bar-chart visualisation, limited to 16 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=16)

In [13]:
# BERTopic's built-in overview plot of the fitted topics.
topic_model.visualize_topics()

In [14]:
# Optional (disabled): topics per class, using df.Id as the class label.
# Uncomment both lines to compute and plot it.
#topics_per_class = topic_model.topics_per_class(docs, classes=df.Id)
#topic_model.visualize_topics_per_class(topics_per_class)

In [15]:
# Optional (disabled): hierarchical view of the topics.
# Uncomment both lines to compute and plot it.
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [16]:
# Attach the single topic assigned to each document by fit_transform.
df['one_topic'] = topics

# Lookup from topic id to topic name. Despite its name, `topic_name_to_id` is
# keyed by the topic id (Topic column) and holds the topic name (Name column).
topic_info = topic_model.get_topic_info()
topic_name_to_id = dict(zip(topic_info.Topic, topic_info.Name))

# Human-readable name of the assigned topic.
df['one_topic_name'] = df['one_topic'].map(topic_name_to_id)

# Topic Distribution (more than one topic per document)
- To reduce the number of outliers and avoid false positives/negatives (useful when looking at one specific topic)

In [17]:
# Approximate a topic distribution for every document using a window of 5;
# calculate_tokens=True additionally requests the token-level result
# (second return value). `topic_distr` is iterated per document in the cells
# below, where each value is compared against a probability threshold.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

100%|██████████| 1/1 [00:00<00:00,  9.14it/s]


In [18]:
import tqdm
import numpy as np
import plotly.express as px

# Threshold sweep: for each candidate probability threshold, count how many
# topics every document has with probability >= threshold, bucket the documents
# by that count ('0', '1', ..., '4', '5+') and plot the share of documents per
# bucket. The plot helps to choose the threshold used for multi-topic assignment.

MAX_TOPIC_BUCKET = 5  # counts of 5 or more topics are merged into the '5+' bucket
bucket_labels = [str(k) for k in range(MAX_TOPIC_BUCKET)] + [f'{MAX_TOPIC_BUCKET}+']
thresholds = np.arange(0, 0.35, 0.001)

# One row per document, one column per topic.
distr = np.asarray(topic_distr)

bucket_counts = []
# iterating through different threshold levels
for thr in tqdm.tqdm(thresholds):
    # number of topics with probability >= threshold for each document (vectorised)
    topics_per_doc = (distr >= thr).sum(axis=1)
    # number of documents falling into each bucket, with the last bucket open-ended
    capped = np.minimum(topics_per_doc, MAX_TOPIC_BUCKET)
    bucket_counts.append(np.bincount(capped, minlength=MAX_TOPIC_BUCKET + 1))

# Rows: thresholds, columns: topic-count buckets. The index/column names are the
# ones plotly uses for the axis and legend labels below.
num_topics_stats_df = pd.DataFrame(
    np.vstack(bucket_counts),
    index=pd.Index(thresholds, name='threshold'),
    columns=pd.Index(bucket_labels, name='num_topics_group'),
)

# Keep only the buckets that occur for at least one threshold.
num_topics_stats_df = num_topics_stats_df.loc[:, (num_topics_stats_df > 0).any(axis=0)]

# Convert the counts to percentages of documents per threshold.
num_topics_stats_df = num_topics_stats_df.div(num_topics_stats_df.sum(axis=1), axis=0) * 100.

# visualisation (the y-axis counts documents, i.e. interview turns)
colormap = px.colors.sequential.YlGnBu
px.area(num_topics_stats_df, 
       title = 'Distribution of number of topics',
       labels = {'num_topics_group': 'number of topics',
                'value': 'share of documents, %'},
       color_discrete_map = {
          '0': colormap[0],
          '1': colormap[3],
          '2': colormap[4],
          '3': colormap[5],
          '4': colormap[6],
          '5+': colormap[7]
      })

100%|██████████| 350/350 [00:02<00:00, 166.35it/s]


In [19]:
# Probability cut-off used to assign (possibly several) topics to a document.
threshold = 0.25

# For every document, collect the positions in its topic distribution whose
# probability is >= threshold; each position is used as the topic id.
df['multiple_topics'] = [
    [topic_id for topic_id, probability in enumerate(doc_topic_distr) if probability >= threshold]
    for doc_topic_distr in topic_distr
]

# Translate the topic ids into topic names; ids missing from the lookup become "No topic".
df["multiple_topics_name"] = df["multiple_topics"].map(
    lambda topic_ids: [topic_name_to_id.get(topic_id, "No topic") for topic_id in topic_ids]
)

In [20]:
# Write the per-turn dataframe, including the single-topic and multi-topic
# columns, to CSV without the row index, then preview the first rows.
# The outputs/topics directory is created in the cell that saves topic_names_info.csv.
df.to_csv("outputs/topics/df_topic.csv", index = False)
df.head()

Unnamed: 0,File Name,turn_index,Content,preprocessed_content,Experiment,Condition,Id,Speaker,Index,one_topic,one_topic_name,multiple_topics,multiple_topics_name
0,ID 05,1,"So, that was very, let's say, unexpected and s...",let unexpected surprising moment realize disco...,OBE1,1,5,Participant,0,3,3_reality_body_different_yourself,[3],[3_reality_body_different_yourself]
1,ID 05,3,"It was a little bit like, okay, well, so it's ...",watch outside special lot emotion explain moment,OBE1,1,5,Participant,1,-1,-1_myself_look_body_real,"[3, 17]","[3_reality_body_different_yourself, 17_room_ce..."
2,ID 05,5,So I'm not sure I've got all the perfect descr...,perfect description,OBE1,1,5,Participant,2,0,0_interesting_easy_nice_fine,[0],[0_interesting_easy_nice_fine]
3,ID 05,7,"The thing that I didn't really understand, but...",understand understand body touching basically ...,OBE1,1,5,Participant,3,6,6_touch_delay_scene_image,"[0, 6]","[0_interesting_easy_nice_fine, 6_touch_delay_s..."
4,ID 05,9,So I felt like a time lag in the last one of w...,lag body,OBE1,1,5,Participant,4,4,4_body_heavy_light_come,[4],[4_body_heavy_light_come]
