In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from utils.analysis_helpers import *
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


# Define the Docs

In [43]:
directory = '../interviews_corrected/6_final/**/' 

df_all = load_and_combine_csv(directory)
df_all = standardize_speaker_labels(df_all)

print(f"Unique conditions before filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews before filtering: {df_all['File Name'].nunique()}")
# *0*: No "real" interview (e.g., setup phase, small talk). We filter these out.
df_all = df_all[df_all["Condition"] != 0]
print(f"Unique conditions after filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews (File Name) after filtering: {df_all['File Name'].nunique()}")

Found 82 CSV files.
Unique conditions before filtering: [1 'C' 'I' 0]
Number of interviews before filtering: 82
Unique conditions after filtering: [1 'C' 'I']
Number of interviews (File Name) after filtering: 75


In [45]:
# Preoprocessing
# For the moment only focus on the participants answers
#df = df_all[df_all["Speaker"] == "Participant"].copy()
df = df_all[df_all["Speaker"] == "Interviewer"].copy()

# Select experiment
df = df[df["Experiment"] == "Compassion"]

# Filter tokens (stopwords, punctuation, numeric values) were removed using the spacy library, 
# All text was lowercased + lemmatized,
# Plus extra_stopwords being the most frequents words in the corpus AND being meaningless 
extra_stopwords = [
    # Filler Words: Common conversational placeholders without thematic value
    "yeah", "okay", "yes", "mean", "oh", "ah", "like",
    # Vague/Ambiguous Words: Frequent but thematically irrelevant in conversations
    "think", "know", "really", "bit", "feel", "thing", "sort", "maybe", "little", "actually","exactly","see",
    # Broad terms or context-specific words overshadowing subtler themes
    "part", "one", "sorry", "first", "second", #"meditation",
]
df['preprocessed_content'] = df['Content'].apply(lambda x: preprocess_text(x, extra_stopwords=extra_stopwords))

# Remove rows with empty content or content that's only punctuation after preprocessing
df = df[df['preprocessed_content'].str.strip().str.len() > 0]
# File S225 is removed because it is empty after preprocessing (only two utterances, not meaningful words for topics)

df.to_csv("preprocessed_data.csv", index=False)

In [46]:
df = df.groupby(['File Name','utterance_index']).agg({ 
    'preprocessed_content': ' '.join,  # Combine preprocessed text
    'Content': ' '.join,  # Combine raw text
    'Experiment': 'first',   # Keep the first 
    'Condition': 'first',   # Keep the first
    'Id': 'first',   # Keep the first
}).reset_index()
df

Unnamed: 0,File Name,utterance_index,preprocessed_content,Content,Experiment,Condition,Id
0,S301final,0,tell experience section relax follow say help,And you can just a little bit tell about your ...,Compassion,1,301
1,S301final,1,wrong right say experience nice problem,There is no wrong or right to just saying your...,Compassion,1,301
2,S301final,2,anonym understand,To be anonym anyway. So just you to understand.,Compassion,1,301
3,S301final,4,nice great,"Very nice, yes. Great.",Compassion,1,301
4,S301final,6,want add,You want to add something?,Compassion,1,301
...,...,...,...,...,...,...,...
137,S313,31,interesting good technical visible disturb fac...,Okay interesting good to know. And yes you hav...,Compassion,1,313
138,S313,33,tell image image guess kind recognize mask emb...,You were told that the image in front of you w...,Compassion,1,313
139,S313,35,locate kinda,Yeah more yes I think so just like did you fee...,Compassion,1,313
140,S313,38,reformulate question form send love example ch...,I can reformulate this question in another for...,Compassion,1,313


In [61]:
docs = list(df.preprocessed_content)
print(len(docs))

142


In [62]:
# look rows with a specific word in the column preprocessed_content
#df[df['preprocessed_content'].str.contains(" see ")]

# Define the model

In [63]:
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Dimensionality reduction model
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Clustering model
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')

main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
aspect_representation_model2 = [KeyBERTInspired(top_n_words=30), 
                                MaximalMarginalRelevance(diversity=.5)]
representation_model = {
   "Main": main_representation_model,
   "Aspect1":  aspect_representation_model1,
   "Aspect2":  aspect_representation_model2 
}


topic_model = BERTopic(                       
# Pipeline models
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
#representation_model = representation_model,

# Hyperparameters
verbose=True,)

Batches: 100%|██████████| 5/5 [00:00<00:00,  8.40it/s]


In [64]:
topics, ini_probs = topic_model.fit_transform(docs)
num_topics = len(topic_model.get_topics()) - 1
num_topics

2025-01-04 01:15:52,632 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 5/5 [00:00<00:00, 14.09it/s]
2025-01-04 01:15:52,998 - BERTopic - Embedding - Completed ✓
2025-01-04 01:15:52,998 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-04 01:15:53,283 - BERTopic - Dimensionality - Completed ✓
2025-01-04 01:15:53,283 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-04 01:15:53,288 - BERTopic - Cluster - Completed ✓
2025-01-04 01:15:53,294 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-04 01:15:53,365 - BERTopic - Representation - Completed ✓


18

In [65]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24,-1_basically_body_focus_sensation,"[basically, body, focus, sensation, send, self...",[basically quality high manage eye close devel...
1,0,18,0_thank_nice_good_great,"[thank, nice, good, great, stop, excellent, tr...","[thank, great good, nice great]"
2,1,9,1_want_ask_great_add,"[want, ask, great, add, question, share, elena...","[nice great want add, great want ask, question..."
3,2,9,2_difficult_feeling_tiredness_task,"[difficult, feeling, tiredness, task, make, gl...","[difficult probably wondering tiredness, diffi..."
4,3,9,3_follow_guidance_guide_percentage,"[follow, guidance, guide, percentage, instruct...","[follow guidance, follow guidance guy talk fol..."
5,4,8,4_question_tall_speak_anonym,"[question, tall, speak, anonym, building, loud...","[anonym understand, speak louder, tall building]"
6,5,8,5_eye_close_open_correct,"[eye, close, open, correct, long, time, condit...","[eye close eye close, condition eye close cond..."
7,6,7,6_send_love_avatar_body,"[send, love, avatar, body, easy, help, example...","[avatar help send love send love, reformulate ..."
8,7,7,7_compassion_feeling_enable_useful,"[compassion, feeling, enable, useful, self, in...",[question type meditation compassion meditatio...
9,8,6,8_later_mm_round_time,"[later, mm, round, time, , , , , , ]","[mm-, later, later round]"


In [66]:
# # Reduce outliers with pre-calculate embeddings
# new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings", embeddings=embeddings)
# topic_model.update_topics(docs, topics=new_topics)
# topics = new_topics

# # Display new topics
# topic_model.get_topic_info()

In [67]:
topic_model.visualize_barchart(top_n_topics=16)

In [68]:
topic_model.visualize_topics()

In [69]:
#topic_model.visualize_heatmap(n_clusters = 20)

In [70]:
#topics_per_class = topic_model.topics_per_class(docs, classes=df.Experiment)
#topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)

In [71]:
#hierarchical_topics = topic_model.hierarchical_topics(docs)
#topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [72]:
df['one_topic'] = topics
topic_name_to_id = dict(zip(topic_model.get_topic_info().Topic, topic_model.get_topic_info().Name))
df['one_topic_name'] = df['one_topic'].map(topic_name_to_id)

df.to_csv("df_topic.csv", index = False)

In [73]:
topic_name_to_id

{-1: '-1_basically_body_focus_sensation',
 0: '0_thank_nice_good_great',
 1: '1_want_ask_great_add',
 2: '2_difficult_feeling_tiredness_task',
 3: '3_follow_guidance_guide_percentage',
 4: '4_question_tall_speak_anonym',
 5: '5_eye_close_open_correct',
 6: '6_send_love_avatar_body',
 7: '7_compassion_feeling_enable_useful',
 8: '8_later_mm_round_time',
 9: '9_experiment_impression_tell_interesting',
 10: '10_condition_experience_share_70',
 11: '11_blink_notice_intensity_change',
 12: '12_meditation_today_practice_relate',
 13: '13_annoy_reality_virtual_absolutely',
 14: '14_embody_wrong_mask_identify',
 15: '15_difference_meditation_love_work',
 16: '16_avatar_self_right_easy',
 17: '17_pixelation_pixelated_realize_color'}

## Manual topic Modeling

In [74]:
directory = '../interviews_corrected/6_final/**/' 

df_all = load_and_combine_csv(directory)
df_all = standardize_data(df_all)

print(f"Unique conditions before filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews before filtering: {df_all['File Name'].nunique()}")
# *0*: No "real" interview (e.g., setup phase, small talk). We filter these out.
df_all = df_all[df_all["Condition"] != 0]
print(f"Unique conditions after filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews (File Name) after filtering: {df_all['File Name'].nunique()}")

Found 82 CSV files.


NameError: name 'standardize_data' is not defined

In [99]:
# Preoprocessing
# For the moment only focus on the participants answers
df = df_all[df_all["Speaker"] == "Participant"].copy()

# Select experiment
#df = df[df["Experiment"] == "OBE2"]

# Stop words were removed using the NLTK library of stop words. 
# All text was lowercased + lemmatized
# Plus extra_stopwords being the most frequents words in the corpus AND being meaningless 
extra_stopwords = [
    # Filler Words: Common conversational placeholders without thematic value
    "yeah", "mm", "ah", "oh", "okay", "also", "yes", "dont", "I've", "mean", "has",
    # Vague/Ambiguous Words: Frequent but thematically irrelevant in conversations
    "like", "think", "know", "really", "bit", "maybe", "could", "would", "felt", "feel", "thought", 
    "thinking", "see", "that's", "exactly", "said", "say", "was",
    # Broad terms or context-specific words overshadowing subtler themes
    "thing", "part", "one", "sorry", "first", "second", #"meditation",
]
df['preprocessed_content'] = df['Content'].apply(lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, ngrams=1))

# Remove rows with empty content or content that's only punctuation after preprocessing
df = df[df['preprocessed_content'].str.strip().str.len() > 0]
# File S225 is removed because it is empty after preprocessing (only two utterances, not meaningful words for topics)

In [100]:
df = df.groupby(['File Name']).agg({ 
    'preprocessed_content': ' '.join,  # Combine preprocessed text
    'Content': ' '.join,  # Combine raw text
    'Experiment': 'first',   # Keep the first 
    'Condition': 'first',   # Keep the first
    'Id': 'first',   # Keep the first
}).reset_index()
df

Unnamed: 0,File Name,preprocessed_content,Content,Experiment,Condition,Id
0,ID 05,let's unexpected surprising moment going reali...,"so, that was very, let's say, unexpected and s...",OBE1,1,5
1,Id 08,prefer version feeling third eye opened prefer...,yeah i prefer this version of like. i had the ...,OBE1,1,8
2,Id 13,there's instability uncertainty question inter...,"there's also a bit of instability, uncertainty...",OBE1,I,13
3,Id 13b,characterize nutshell last time incertitude ce...,yeah that's how that's how i characterize it i...,OBE1,C,13
4,Id 14,different body relaxing relaxing time open med...,"yes, different parts of the body, yeah. yeah, ...",OBE1,1,14
...,...,...,...,...,...,...
69,S304,condition kind relaxed real experience conditi...,i really like the first condition. i felt kind...,Compassion,1,304
70,S305con,meditation kind new never meditation stuff i'd...,the meditation? it's kind of new for me. i've ...,Compassion,C,305
71,S306,body front although already strange beginning ...,"the first part, i feel like when i have my own...",Compassion,1,306
72,S307,time virtual reality beginning weird anxious t...,it was my first time with virtual reality so a...,Compassion,1,307


In [101]:
docs = list(df.preprocessed_content)
print(len(docs))

74


In [102]:
# Map experiments to unique integers
experiment_mapping = {experiment: idx for idx, experiment in enumerate(df["Experiment"].unique())}
df["Experiment_Int"] = df["Experiment"].map(experiment_mapping)
print(experiment_mapping)
# Convert to numpy array
y = df["Experiment_Int"].to_numpy()
# Print length
print("Length of y:", len(y))

{'OBE1': 0, 'OBE2': 1, 'Compassion': 2}
Length of y: 74


In [103]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

# Prepare our empty sub-models and reduce frequent words while we are at it.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model
)

topics, probs = topic_model.fit_transform(docs, y=y)

In [104]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,46,0_body_actually_different_kind,"[body, actually, different, kind, back, time, ...",[found easier get state meditation funny float...
1,1,18,1_experience_time_body_quite,"[experience, time, body, quite, little, someth...",[let's unexpected surprising moment going real...
2,2,10,2_leg_eye_focus_sort,"[leg, eye, focus, sort, closed, feeling, easie...",[condition kind relaxed real experience condit...
