In [198]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [199]:
from utils.analysis_helpers import *
from bertopic import BERTopic

# Define the Docs

In [200]:
# Load the merged interview transcripts.
transcripts_path = "../Dataset/meditations_transcripts/transcripts_merged.csv"
df_all = pd.read_csv(transcripts_path)

print(f"Unique conditions before filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews before filtering: {df_all['File Name'].nunique()}")

# Condition '0' marks segments that are not a "real" interview (e.g., setup phase,
# small talk); they are dropped before any analysis.
is_real_interview = df_all["Condition"].ne('0')
df_all = df_all[is_real_interview]

print(f"Unique conditions after filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews (File Name) after filtering: {df_all['File Name'].nunique()}")

Unique conditions before filtering: ['1' 'C' 'I' '0']
Number of interviews before filtering: 82
Unique conditions after filtering: ['1' 'C' 'I']
Number of interviews (File Name) after filtering: 75


In [201]:
# Preprocessing
df = df_all.copy()
# Focus only on the participant's or the interviewer's speech (or both, if no speaker filter is applied)
df = df[df["Speaker"] == "Participant"]
#df = df[df["Speaker"] == "Interviewer"]

# Select experiment
#df = df[df["Experiment"] != "Compassion"]

# Text cleaning is delegated to the project helper `preprocess_text`. Per the original notes:
#   - stopwords and punctuation are removed using the spaCy library,
#   - all text is lowercased and lemmatized,
#   - `extra_stopwords` adds the most frequent words of the corpus that carry no thematic meaning.
extra_stopwords = [
    # Filler words: common conversational placeholders without thematic value
    "yeah", "okay", "yes", "mean", "oh", "ah", "like", "kind", "kinda", "course", "way",
    # Vague/ambiguous words: frequent but thematically irrelevant in conversations
    # ("think" used to be listed twice; the duplicate entry was removed)
    "think", "know", "really", "bit", "feel", "thing", "sort", "maybe", "little", "actually",
    "sure", "exactly", "tell", "ask", "people",
    # Broad terms or context-specific words overshadowing subtler themes
    "question", "sorry", "time", "first", "second", "later", "experience", "end", "meditation" #,"body"
]
# Words passed to `preprocess_text` as `retain_stopwords`
retain_stopwords = ["yourself", "myself"]

df = df.assign(
    preprocessed_content=df['Content'].apply(
        lambda text: preprocess_text(text, extra_stopwords=extra_stopwords, retain_stopwords=retain_stopwords)
    )
)

# Remove rows whose preprocessed content is empty (or whitespace only)
df = df[df['preprocessed_content'].str.strip().str.len() > 0]
# File S225 is removed because it is empty after preprocessing (only two utterances, no meaningful words for topics)
# -> It was the Intervention condition that was removed, so the participant only talks during the second condition interview (control)

df.to_csv("preprocessed_content.csv", index=False)

In [202]:
# Merge rows that share the same interview file and turn index into a single row.
turn_keys = ['File Name', 'turn_index']
turn_aggregations = {
    'Content': ' '.join,               # concatenate the raw text
    'preprocessed_content': ' '.join,  # concatenate the preprocessed text
}
# Metadata columns: keep the first value found within each group
turn_aggregations.update({col: 'first' for col in ['Experiment', 'Condition', 'Id', 'Speaker']})

df = df.groupby(turn_keys).agg(turn_aggregations).reset_index()
df.head(2)

Unnamed: 0,File Name,turn_index,Content,preprocessed_content,Experiment,Condition,Id,Speaker
0,ID 05,1,"So, that was very, let's say, unexpected and s...",let unexpected surprising moment realize disco...,OBE1,1,5,Participant
1,ID 05,3,"It was a little bit like, okay, well, so it's ...",watch outside special lot emotion explain moment,OBE1,1,5,Participant


In [203]:
# Store the current row index as an explicit column
df["Index"] = df.index
# Documents handed to the topic model: the preprocessed text of every row
docs = df["preprocessed_content"].tolist()
#docs = df["Content"].tolist()
print(len(docs))

668


# Define the model

In [204]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint.
# Author's note: "all-mpnet-base-v2" is better but slower; "all-MiniLM-L6-v2" is the trade-off option.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"

# Pre-calculate the embeddings once so they can be passed to the topic model directly
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [205]:
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Dimensionality reduction step (fixed random_state for repeatable runs)
umap_model = UMAP(
    n_neighbors=15,
    n_components=8,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering step applied to the reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=8,
    metric='euclidean',
    cluster_selection_method='eom',
)

# Optional representation model (currently disabled)
#representation_model = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=.5)]

# Optional vectorizer, only needed when clustering the original (non-preprocessed) Content
#stops_words = preprocess_text("sample",return_stopwords=True, extra_stopwords=extra_stopwords, retain_stopwords=["yourself", "myself"])
#vectorizer_model = CountVectorizer(stop_words=stops_words)

topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    #vectorizer_model=vectorizer_model,
    #representation_model=representation_model,

    # Hyperparameters
    verbose=True,
)

In [206]:
# Fit the topic model on the documents, passing the pre-computed embeddings
# so the texts are not encoded a second time. `topics` is later stored as one
# value per row of `df`.
topics, ini_probs = topic_model.fit_transform(docs, embeddings=embeddings)
# Number of entries returned by get_topics().
# NOTE(review): this count presumably includes the outlier topic -1 — confirm.
num_topics = len(topic_model.get_topics()) 
num_topics

2025-01-17 14:16:56,765 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-17 14:16:58,478 - BERTopic - Dimensionality - Completed ✓
2025-01-17 14:16:58,480 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-17 14:16:58,547 - BERTopic - Cluster - Completed ✓
2025-01-17 14:16:58,555 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-17 14:16:58,612 - BERTopic - Representation - Completed ✓


23

In [207]:
# Build the topic overview table once, save it to disk, then display it
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_model.csv")
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,193,-1_look_point_real_myself,"[look, point, real, myself, place, different, ...",[normal normal eye close look person real clos...
1,0,96,0_myself_body_reality_virtual,"[myself, body, reality, virtual, different, fe...",[need example forget room change forest image ...
2,1,47,1_focus_leg_eye_distract,"[focus, leg, eye, distract, easy, concentrate,...",[focus myself focus instruction eye closed hea...
3,2,35,2_body_heavy_light_come,"[body, heavy, light, come, half, phrase, outsi...","[body, body, body]"
4,3,26,3_calm_relax_relief_stress,"[calm, relax, relief, stress, relaxed, sit, ge...","[calm, calm calm, relax relax open release str..."
5,4,25,4_easy_absolutely_nice_good,"[easy, absolutely, nice, good, fine, pretty, d...","[easy, absolutely fine, easy nice]"
6,5,23,5_touch_delay_scene_image,"[touch, delay, scene, image, body, catch, touc...",[touch image touch touch difference touch imag...
7,6,23,6_forest_rock_tree_adventure,"[forest, rock, tree, adventure, indonesia, riv...",[nice forest love forest good surprise calm re...
8,7,22,7_eye_open_close_condition,"[eye, open, close, condition, 30, view, easy, ...","[eye close, close close normally close eye ope..."
9,8,17,8_voice_platform_rock_space,"[voice, platform, rock, space, cool, nice, hea...","[platform rock, voice rock rock touch, space r..."


In [208]:
# Bar-chart view of the fitted topics, limited to 16 topics via top_n_topics
topic_model.visualize_barchart(top_n_topics=16)

In [209]:
# BERTopic's built-in overview plot of the fitted topics
topic_model.visualize_topics()

In [190]:
#topics_per_class = topic_model.topics_per_class(docs, classes=df.Id)
#topic_model.visualize_topics_per_class(topics_per_class)

In [214]:
# Compute the topic hierarchy from the fitted model and the documents,
# then plot it.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 21/21 [00:00<00:00, 82.69it/s]


In [215]:
# Attach the single topic returned by fit_transform to every row
df['one_topic'] = topics

# NOTE: despite its name, `topic_name_to_id` maps topic id -> topic name
topic_info = topic_model.get_topic_info()
topic_name_to_id = dict(zip(topic_info.Topic, topic_info.Name))
df['one_topic_name'] = df['one_topic'].map(topic_name_to_id)

df.to_csv("df_topic_single.csv", index = False)

# Topic Distribution (more than one topic per document)
- To reduce the number of outliers and avoid false positives (interesting when looking at one specific topic)

### Look at similarities

In [216]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# Pairwise cosine similarity between topic embeddings, labelled with the topic names.
# NOTE: the variables/column are called "distance", but the values are cosine
# similarities (higher = more similar).
topic_names = topic_model.topic_labels_.values()
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, columns=topic_names, index=topic_names)

# Flatten the square matrix into one record per (topic1, topic2) pair, row by row
pair_records = [
    {'topic1': row_label, 'topic2': col_label, 'distance': value}
    for row_label, row_values in zip(dist_df.index, dist_df.to_numpy().tolist())
    for col_label, value in zip(dist_df.columns, row_values)
]
pair_dist_df = pd.DataFrame(pair_records)

# Drop every pair involving the outlier topic (-1) and keep each unordered pair
# only once (string comparison on the labels also removes the self-pairs)
involves_outlier = (
    pair_dist_df.topic1.str.startswith('-1')
    | pair_dist_df.topic2.str.startswith('-1')
)
is_ordered_pair = pair_dist_df.topic1 < pair_dist_df.topic2
pair_dist_df = pair_dist_df[~involves_outlier & is_ordered_pair]

# The 20 most similar topic pairs
pair_dist_df.sort_values('distance', ascending = False).head(20)

Unnamed: 0,topic1,topic2,distance
39,0_myself_body_reality_virtual,15_strange_scary_feeling_discomfort,0.734466
25,0_myself_body_reality_virtual,1_focus_leg_eye_distract,0.721013
349,14_fall_sleep_asleep_sleepy,3_calm_relax_relief_stress,0.688762
32,0_myself_body_reality_virtual,8_voice_platform_rock_space,0.61953
54,1_focus_leg_eye_distract,7_eye_open_close_condition,0.618645
493,20_quickly_happen_drop_dealaye,9_want_follow_fell_resistance,0.58668
371,15_strange_scary_feeling_discomfort,2_body_heavy_light_come,0.580514
271,10_strange_weird_funny_creepy,17_interesting_publish_probable_knock,0.576279
29,0_myself_body_reality_virtual,5_touch_delay_scene_image,0.560294
458,18_happy_long_day_mental,20_quickly_happen_drop_dealaye,0.550646


## Multiple topics

In [217]:
# Approximate, for every document, a distribution over the fitted topics.
# `topic_distr` is iterated per document further down and its values are compared
# against a probability threshold. calculate_tokens=True additionally returns
# `topic_token_distr`.
# NOTE(review): window=5 is presumably the size of the sliding token window — confirm in the BERTopic docs.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

100%|██████████| 1/1 [00:00<00:00,  7.11it/s]


In [218]:
import tqdm
import plotly.express as px

# For a range of probability thresholds, count how many topics each document would
# receive (topics whose probability is >= threshold) and plot, per threshold, the
# share of documents falling in each "number of topics" group. This helps to pick
# the threshold used for the multi-topic assignment.

# assumes topic_distr is a 2-D (documents x topics) numeric array as returned by
# approximate_distribution — TODO confirm
topic_distr_arr = np.asarray(topic_distr)

tmp_dfs = []

# iterating through different threshold levels
for thr in tqdm.tqdm(np.arange(0, 0.35, 0.001)):
    # number of topics with probability >= threshold for each document
    # (one vectorised comparison instead of a Python-level filter per value)
    num_topics_per_doc = (topic_distr_arr >= thr).sum(axis = 1)

    tmp_df = pd.DataFrame({'num_topics': num_topics_per_doc, 'num_docs': 1})

    # documents with five or more topics are pooled into a single '5+' group
    tmp_df['num_topics_group'] = tmp_df['num_topics']\
        .map(lambda x: str(x) if x < 5 else '5+')

    # aggregating stats: number of documents per group at this threshold
    tmp_df_aggr = tmp_df.groupby('num_topics_group', as_index = False).num_docs.sum()
    tmp_df_aggr['threshold'] = thr

    tmp_dfs.append(tmp_df_aggr)

# one row per threshold, one column per group; groups absent at a threshold count as 0
num_topics_stats_df = pd.concat(tmp_dfs).pivot(index = 'threshold', 
                              values = 'num_docs',
                              columns = 'num_topics_group').fillna(0)

# convert the document counts into percentages of all documents at each threshold
docs_per_threshold = num_topics_stats_df.sum(axis = 1)
num_topics_stats_df = (100. * num_topics_stats_df).div(docs_per_threshold, axis = 0)

# visualisation
colormap = px.colors.sequential.YlGnBu
px.area(num_topics_stats_df, 
       title = 'Distribution of number of topics',
       # the documents here are interview turns, not reviews, so the axis is labelled accordingly
       labels = {'num_topics_group': 'number of topics',
                'value': 'share of documents, %'},
       color_discrete_map = {
          '0': colormap[0],
          '1': colormap[3],
          '2': colormap[4],
          '3': colormap[5],
          '4': colormap[6],
          '5+': colormap[7]
      })

100%|██████████| 350/350 [00:05<00:00, 63.19it/s] 


In [219]:
threshold = 0.25

# For each document, keep the ids of all topics whose probability is >= threshold
# (the position in the distribution row is used as the topic id)
df['multiple_topics'] = [
    [topic_id for topic_id, probability in enumerate(doc_topic_distr) if probability >= threshold]
    for doc_topic_distr in topic_distr
]

# Long-format dataset with one (topic, document text) record per assignment;
# documents without any topic above the threshold get the outlier topic -1
tmp_data = [
    {
        'topic': topic_id,
        'id': content,
    }
    for content, doc_topics in zip(df['Content'], df['multiple_topics'])
    for topic_id in (doc_topics if len(doc_topics) != 0 else [-1])
]

mult_topics_df = pd.DataFrame(tmp_data)

# Translate the topic ids into topic names (`topic_name_to_id` maps id -> name)
df["multiple_topics_name"] = [
    [topic_name_to_id.get(topic_id, "No topic") for topic_id in doc_topics]
    for doc_topics in df["multiple_topics"]
]


In [220]:
# Save the table, which now carries both the single-topic and the multi-topic
# columns, then preview the first rows.
df.to_csv("df_topic_multiples.csv", index = False)
df.head()

Unnamed: 0,File Name,turn_index,Content,preprocessed_content,Experiment,Condition,Id,Speaker,Index,one_topic,one_topic_name,multiple_topics,multiple_topics_name
0,ID 05,1,"So, that was very, let's say, unexpected and s...",let unexpected surprising moment realize disco...,OBE1,1,5,Participant,0,0,0_myself_body_reality_virtual,[0],[0_myself_body_reality_virtual]
1,ID 05,3,"It was a little bit like, okay, well, so it's ...",watch outside special lot emotion explain moment,OBE1,1,5,Participant,1,-1,-1_look_point_real_myself,[0],[0_myself_body_reality_virtual]
2,ID 05,5,So I'm not sure I've got all the perfect descr...,perfect description,OBE1,1,5,Participant,2,-1,-1_look_point_real_myself,[],[]
3,ID 05,7,"The thing that I didn't really understand, but...",understand understand body touching basically ...,OBE1,1,5,Participant,3,5,5_touch_delay_scene_image,"[5, 17]","[5_touch_delay_scene_image, 17_interesting_pub..."
4,ID 05,9,So I felt like a time lag in the last one of w...,lag body,OBE1,1,5,Participant,4,2,2_body_heavy_light_come,[2],[2_body_heavy_light_come]
