In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from utils.analysis_helpers import *
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Directory pattern handed to load_and_combine_csv.
directory = '../interviews_corrected/6_final/**/'

# Load all CSV files into one frame and standardize it in a single step.
df_all = standardize_data(load_and_combine_csv(directory))

# Print the condition codes before and after removing the rows coded 0.
print(df_all["Condition"].unique())
is_zero_condition = df_all["Condition"] == 0
df_all = df_all[~is_zero_condition]
print(df_all["Condition"].unique())

Found 82 CSV files.
Standardized speaker labels.
Normalized text in 'Content' column.
[1 'C' 'I' 0]
[1 'C' 'I']


In [5]:
# Keep only participant rows; .copy() ensures the column added below is
# written to this frame and not to df_all.
participant_rows = df_all["Speaker"] == "Participant"
df = df_all[participant_rows].copy()
# Optional restriction to a single experiment:
#df = df[df["Experiment"] == "Compassion"]

# Filler words passed to preprocess_text as additional stopwords.
extra_stopwords = {'yeah', 'yes', 'like', 'you', 'know', 'um', 'uh', 'really', 'okay','mm'}

# Preprocess the text of every row.
df['preprocessed_content'] = df['Content'].apply(
    preprocess_text, extra_stopwords=extra_stopwords, ngrams=1
)

# Merge rows sharing the same file and utterance index into one record:
# text columns are joined with a space, Experiment keeps its first value.
grouped_df = (
    df.groupby(['File Name', 'utterance_index'])
      .agg(
          preprocessed_content=('preprocessed_content', ' '.join),
          Content=('Content', ' '.join),
          Experiment=('Experiment', 'first'),
      )
      .reset_index()
)

grouped_df

Unnamed: 0,File Name,utterance_index,preprocessed_content,Content,Experiment
0,ID 05,1,let say unexpected surprising moment feel your...,"so, that was very, let's say, unexpected and s...",OBE1
1,ID 05,3,little bit little bit well im watching outside...,"a little bit. it was a little bit like, okay, ...",OBE1
2,ID 05,5,feel im back im sure ive got perfect descripti...,yeah yeah i feel like i'm back it's okay but y...,OBE1
3,ID 05,7,thing didnt understand perhaps understood end ...,"the thing that i didn't really understand, but...",OBE1
4,ID 05,9,felt time lag last one experiencing seeing nev...,so i felt like a time lag in the last one of w...,OBE1
...,...,...,...,...,...
830,S313,30,think helped help focus see talk raise hand se...,i think it helped. it helps you focus because ...,Compassion
831,S313,32,didnt even notice,no i didn't even notice.,Compassion
832,S313,34,definitely felt didnt lot sensation floating b...,i definitely felt like i didn't have a lot of ...,Compassion
833,S313,36,place,no at my place.,Compassion


## Main topic

In [6]:
# One document per participant row of df (not per grouped utterance).
# NOTE(review): topic 0 of the model fitted on these docs shows empty
# representative documents, which suggests some rows are empty strings after
# preprocessing - consider filtering them (together with df) before fitting.
docs = df['preprocessed_content'].tolist()
print(len(docs))

2225


In [7]:
# Hyperparameters
# Count both single words and two-word phrases when building topic keywords.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))


# Fit BERTopic on the preprocessed documents; verbose=True logs each stage.
# NOTE(review): no seeded dimensionality-reduction model is passed, so the
# topics are presumably not reproducible between runs - confirm and seed if
# stable topic ids are needed.
topic_model = BERTopic(vectorizer_model=vectorizer_model, verbose=True)
topics, ini_probs = topic_model.fit_transform(docs)
# Presumably subtracts one to leave out the outlier topic (-1); the count is
# off by one if the model produced no outlier topic.
num_topics = len(topic_model.get_topics()) - 1
num_topics

2024-11-21 00:21:10,885 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 70/70 [00:05<00:00, 13.61it/s]
2024-11-21 00:21:18,286 - BERTopic - Embedding - Completed ✓
2024-11-21 00:21:18,291 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-21 00:21:35,206 - BERTopic - Dimensionality - Completed ✓
2024-11-21 00:21:35,206 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-21 00:21:35,327 - BERTopic - Cluster - Completed ✓
2024-11-21 00:21:35,338 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-21 00:21:35,446 - BERTopic - Representation - Completed ✓


59

In [12]:
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,552,-1_feeling_felt_feel_body,"[feeling, felt, feel, body, something, could, ...",[second part dont dont dont dont feel little b...
1,0,167,0_dint___,"[dint, , , , , , , , , ]","[, , ]"
2,1,71,1_meditation_meditation meditation_journey_med...,"[meditation, meditation meditation, journey, m...","[meditation, meditation, meditation]"
3,2,67,2_think_think think_thinking_thought,"[think, think think, thinking, thought, though...","[think, think, think]"
4,3,59,3_focus_focusing_distracted_concentrate,"[focus, focusing, distracted, concentrate, foc...",[try medidate always noticed something brain n...
5,4,56,4_eye_closed_eye closed_open,"[eye, closed, eye closed, open, eye open, clos...","[first part eye closed, mean felt real eye clo..."
6,5,54,5_one_second one_second_first one,"[one, second one, second, first one, first, on...","[first one, first one, second one second one]"
7,6,52,6_mirror_body_seeing_seeing body,"[mirror, body, seeing, seeing body, see, front...","[seeing body, seeing body, seeing body mirror ..."
8,7,51,7_outside_environment_go_room,"[outside, environment, go, room, mountain, im ...","[im outside, one day train travel lot see see ..."
9,8,44,8_hand_move_moved_platform,"[hand, move, moved, platform, move hand, movem...",[move hand dont move hand see form another vie...


In [13]:
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [14]:
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [15]:
topic_model.visualize_topics()

In [16]:
topic_model.visualize_documents(docs)

In [17]:
# Topic representation per experiment; `df` must be the frame that `docs`
# was built from so that documents and class labels line up row by row.
topics_per_class = topic_model.topics_per_class(docs, classes=df["Experiment"])
topic_model.visualize_topics_per_class(
    topics_per_class,
    top_n_topics=10,
    normalize_frequency=True,
)

3it [00:00, 18.01it/s]


## Play with Hyperparameters

In [76]:
docs = list(df_all.Content)
print(len(docs))

4031


In [None]:
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# Three alternative keyword representations for every topic.
main_representation_model = KeyBERTInspired()
# NOTE(review): presumably requires the spaCy model "en_core_web_sm" to be
# installed in the kernel environment - confirm.
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
# A list of representation models; presumably applied one after the other
# (KeyBERT candidates first, then MMR for diversity) - confirm.
aspect_representation_model2 = [KeyBERTInspired(top_n_words=30), 
                                MaximalMarginalRelevance(diversity=.5)]

# "Main" fills the regular Representation column; "Aspect1" and "Aspect2"
# show up as extra columns of get_topic_info().
representation_model = {
   "Main": main_representation_model,
   "Aspect1":  aspect_representation_model1,
   "Aspect2":  aspect_representation_model2 
}

# Unigrams and bigrams, English stopwords removed, terms kept only when they
# reach min_df=5.
vectorizer_model = CountVectorizer(min_df=5, stop_words = 'english', ngram_range=(1, 2))

# nr_topics='auto' asks BERTopic to reduce the number of topics automatically.
topic_model = BERTopic(nr_topics = 'auto', 
                      vectorizer_model = vectorizer_model,
                      representation_model = representation_model)

# Fit on the current `docs`; this rebinds topic_model, topics and ini_probs.
topics, ini_probs = topic_model.fit_transform(docs)

In [78]:
def get_topic_stats(topic_model, extra_cols=None):
    """Summarize topic sizes as absolute counts and percentage shares.

    Parameters
    ----------
    topic_model
        Fitted model exposing ``get_topic_info()``, which must return a
        DataFrame with at least the columns 'Topic', 'Count', 'Name' and
        'Representation'.
    extra_cols : iterable of str, optional
        Additional columns of the topic info table to append to the result
        (for example 'Aspect1'). Any iterable is accepted; ``None`` (the
        default) appends nothing.

    Returns
    -------
    pandas.DataFrame
        Topics sorted by descending 'Count', with 'Share' (percentage of all
        documents) and 'CumulativeShare' (running percentage) added.

    Raises
    ------
    KeyError
        If a requested column is missing from the topic info table.
    """
    # None instead of a mutable [] default; list() also admits tuples etc.
    extra_cols = [] if extra_cols is None else list(extra_cols)

    # sort_values returns a new frame, so the columns added below never
    # touch the model's own table.
    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending = False)
    total_count = topics_info_df['Count'].sum()
    topics_info_df['Share'] = 100.*topics_info_df['Count']/total_count
    topics_info_df['CumulativeShare'] = 100.*topics_info_df['Count'].cumsum()/total_count
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare',
                           'Name', 'Representation'] + extra_cols]

# Ten largest topics with both aspect representations, indexed by topic id.
(
    get_topic_stats(topic_model, ['Aspect1', 'Aspect2'])
    .head(10)
    .set_index('Topic')
)

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation,Aspect1,Aspect2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,1335,33.118333,33.118333,-1_felt like_felt_feel like_feel,"[felt like, felt, feel like, feel, feeling, se...","[body, eyes, feeling, kind, real, things, bit,...","[felt, feel like, sensation, feelings, experie..."
0,1042,25.849665,58.967998,0_felt like_feel like_felt_feeling,"[felt like, feel like, felt, feeling, did feel...","[body, experience, nice, bit, interesting, str...","[feel like, felt, sensation, experience, body,..."
1,382,9.476557,68.444555,1_okay okay_okay_okay yeah_okay just,"[okay okay, okay, okay yeah, okay just, yeah o...","[general, course, way, example, thing, , , , , ]","[okay okay, okay just, okay dont, great okay, ..."
2,189,4.688663,73.133218,2_meditation_relaxed_mind_felt,"[meditation, relaxed, mind, felt, relax, think...","[meditation, relaxed, experience, different, v...","[meditation, mind, practice, body like, scene,..."
3,87,2.158273,75.291491,3_thinking_like think_thoughts_mind,"[thinking, like think, thoughts, mind, think, ...","[focus, mind, brain, stuff, head, lot, able, i...","[thinking, like think, relax, yeah maybe, focu..."
4,68,1.686926,76.978417,4_questions_ask_asked_question,"[questions, ask, asked, question, things, answ...","[questions, question, great, thoughts, clear, ...","[ask, things, talk, great okay, let know, dont..."
5,63,1.562888,78.541305,5_moving_hand_felt_felt like,"[moving, hand, felt, felt like, feel like, try...","[hand, time, times, way, real, place, fact, st...","[moving, hand, felt, strange, like time, obser..."
6,49,1.215579,79.756884,6_second_second time_scene_difference,"[second, second time, scene, difference, diffe...","[second, difference, time, easier, parts, view...","[second time, scene, focused, time, easier, vi..."
7,44,1.091541,80.848425,7_reality_focus_thinking_brain,"[reality, focus, thinking, brain, meditation, ...","[reality, room, kind, people, things, focus, b...","[focus, brain, meditation, experience, like th..."
8,39,0.967502,81.815927,8_dont know_dont think_im sure_ask,"[dont know, dont think, im sure, ask, question...","[sure, small, question, , , , , , , ]","[dont know, dont think, im sure, ask, question..."


In [79]:
num_topics = len(topic_model.get_topics()) - 1
num_topics

45

In [81]:
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# Pairwise cosine similarity between the topic embeddings, labelled with the
# topic names. NOTE: despite the "distance" naming used throughout this cell,
# the values are similarities - larger means the two topics are closer.
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
topic_names = topic_model.topic_labels_.values()
dist_df = pd.DataFrame(distance_matrix, index=topic_names, columns=topic_names)

# Flatten the square matrix into one record per ordered (topic1, topic2) pair.
tmp = [
    {'topic1': rec['index'], 'topic2': t2, 'distance': value}
    for rec in dist_df.reset_index().to_dict('records')
    for t2, value in rec.items()
    if t2 != 'index'
]

pair_dist_df = pd.DataFrame(tmp)


def _is_regular_topic(name):
    """Return True unless the label belongs to the outlier topic (-1)."""
    return not name.startswith('-1')


# Drop every pair that involves the outlier topic, and keep each unordered
# pair only once (this also removes self-pairs) via the string comparison
# topic1 < topic2.
keep_pair = (
    pair_dist_df.topic1.map(_is_regular_topic)
    & pair_dist_df.topic2.map(_is_regular_topic)
    & (pair_dist_df.topic1 < pair_dist_df.topic2)
)
pair_dist_df = pair_dist_df[keep_pair]

# Most similar topic pairs first.
pair_dist_df.sort_values('distance', ascending = False).head(20)

Unnamed: 0,topic1,topic2,distance
169,2_meditation_relaxed_mind_felt,30_meditation_feelings_self_feeling,0.700752
49,0_felt like_feel like_felt_feeling,2_meditation_relaxed_mind_felt,0.667365
50,0_felt like_feel like_felt_feeling,3_thinking_like think_thoughts_mind,0.665126
52,0_felt like_feel like_felt_feeling,5_moving_hand_felt_felt like,0.653609
1617,34_second time_yeah second_second_times,6_second_second time_scene_difference,0.649071
142,2_meditation_relaxed_mind_felt,3_thinking_like think_thoughts_mind,0.646431
54,0_felt like_feel like_felt_feeling,7_reality_focus_thinking_brain,0.645557
1293,27_questions_ask_okay just_okay maybe,4_questions_ask_asked_question,0.642726
81,0_felt like_feel like_felt_feeling,34_second time_yeah second_second_times,0.63387
61,0_felt like_feel like_felt_feeling,14_felt like_did feel_relaxed_felt,0.619168


In [20]:
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 4, calculate_tokens=True)

In [25]:
topic_distr

array([[0.09886111, 0.09305857, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.27191272, 0.16671716, 0.        , ..., 0.        , 0.        ,
        0.29846385],
       [0.35234395, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.16145042, 0.09054749, 0.        , ..., 0.        , 0.        ,
        0.11348758],
       [0.51123825, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.28741406, 0.1986851 , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [27]:
topic_model.visualize_distribution(topic_distr[5], min_probability=0.05)

## Topic distributions (Split each doc into tokens)

In [66]:
df_all.head(1)

Unnamed: 0,Experiment,File Name,Id,Start Time,End Time,Speaker,Content,Condition,Order Condition,utterance_index,preprocessed_content
0,Compassion,S301final,301,"00:00:00,060","00:00:12,100",Interviewer,and you can just a little bit tell about your ...,1,Unknown,0,little bit tell experience twosection feel cou...


In [68]:
# This section uses the rows of every speaker (no participant filter).
# Take a copy rather than an alias of df_all: otherwise adding the
# 'preprocessed_content' column below would also modify df_all, and the
# result of earlier cells would depend on whether this cell has been run.
df = df_all.copy()
# Optional restrictions:
#df = df[df["Speaker"] == "Participant"]
#df = df[df["Experiment"] == "Compassion"]

# Filler words passed to preprocess_text as additional stopwords.
extra_stopwords = {'yeah', 'yes', 'like', 'you', 'know', 'um', 'uh', 'really', 'okay','mm'}
# Preprocess the text
df['preprocessed_content'] = df['Content'].apply(lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, ngrams=1))

# One record per (file, utterance index): text columns are joined with a
# space, Experiment keeps the first value of the group.
grouped_df = df.groupby(['File Name','utterance_index']).agg({
    'preprocessed_content': ' '.join,  # Combine preprocessed text
    'Content': ' '.join,  # Combine raw text
    'Experiment': 'first',   # Keep the first (consistent if File Name is unique)
}).reset_index()

In [69]:
docs = list(grouped_df.Content)
print(len(docs))

1776


In [70]:
# Fit BERTopic with all default settings on the current `docs`.
# NOTE(review): the resulting topic names are dominated by function words
# (e.g. "-1_it_like_the_was"); consider passing a CountVectorizer with
# stop_words='english' as vectorizer_model - confirm this is not intended.
topic_model = BERTopic().fit(docs)
# Presumably subtracts one to leave out the outlier topic (-1).
num_topics = len(topic_model.get_topics()) - 1
num_topics

41

In [71]:
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,729,-1_it_like_the_was,"[it, like, the, was, of, you, and, to, that, but]",[yeah yeah it was. it was like a real memory b...
1,0,59,0_where_yourself_you_did,"[where, yourself, you, did, outside, saw, body...",[i mean it's strange experience. so it's fine ...
2,1,56,1_okay_mm_ok_confused,"[okay, mm, ok, confused, perfect, all, thats, ...","[okay., okay., okay.]"
3,2,49,2_questions_okay_any_other,"[questions, okay, any, other, have, ask, want,...","[do you have any questions?, do you maybe have..."
4,3,45,3_yeah_subtle_definitely_ah,"[yeah, subtle, definitely, ah, right, follow, ...","[yeah., yeah., yeah.]"
5,4,42,4_meditation_how_the_did,"[meditation, how, the, did, during, was, or, i...",[so how was this meditation? this experience m...
6,5,42,5_blinking_particles_those_changing,"[blinking, particles, those, changing, pixel, ...","[yes, and those particles, like, did they get ..."
7,6,39,6_yes_no__,"[yes, no, , , , , , , , ]","[yes., yes., yes.]"
8,7,35,7_forest_the_in_very,"[forest, the, in, very, some, real, it, bit, w...","[okay, that's very good. what did you like abo..."
9,8,35,8_color_did_notice_changing,"[color, did, notice, changing, colors, darker,...","[it was nice. i mean, i kept focusing on the c..."


In [72]:
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

In [73]:
topic_model.visualize_distribution(topic_distr[1])

In [74]:
doc_id = 9

In [75]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions of the selected document.
# The result gets its own name: binding it to `df` would replace the data
# frame that the preprocessing cells rely on, so re-running any of them after
# this cell would operate on the styled table instead of the data.
token_distr_view = topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])
token_distr_view


Unnamed: 0,so,felt,like,time,lag,in,the,last,one,of,what,was,experiencing,and,what.1,was.1,seeing,but,never,felt.1,like.1,had,three,bodies
4_meditation_how_the_did,0.136,0.136,0.136,0.136,0.0,0.132,0.252,0.365,0.516,0.384,0.265,0.151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_blinking_particles_those_changing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114,0.114,0.114,0.114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7_forest_the_in_very,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.12,0.12,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8_color_did_notice_changing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9_eyes_open_closed_close,0.106,0.106,0.106,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11_easier_difficult_do_to,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153,0.153,0.153,0.153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13_how_experience_was_session,0.109,0.109,0.109,0.109,0.0,0.0,0.0,0.0,0.134,0.134,0.134,0.134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_my_focus_on_like,0.126,0.126,0.126,0.126,0.0,0.0,0.0,0.0,0.126,0.126,0.126,0.126,0.0,0.119,0.247,0.247,0.247,0.234,0.106,0.106,0.106,0.0,0.0,0.0
15_one_second_first_100,0.0,0.0,0.0,0.0,0.0,0.347,0.667,0.96,1.273,0.926,0.606,0.313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.328,0.328,0.328,0.147
19_delay_moving_was_there,0.153,0.257,0.375,0.489,0.336,0.231,0.114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
