In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local utils helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils.analysis_helpers import *
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Path pattern handed to the project loader (interview CSV location).
directory = '../interviews_corrected/6_final/**/'

# Load the CSVs and standardize them in one step (both are project helpers).
df_all = standardize_data(load_and_combine_csv(directory))

# Show the Condition values before and after removing the rows equal to 0.
print(df_all["Condition"].unique())
df_all = df_all.loc[df_all["Condition"].ne(0)]
print(df_all["Condition"].unique())

Found 82 CSV files.
Standardized speaker labels.
Normalized text in 'Content' column.
[1 'C' 'I' 0]
[1 'C' 'I']


In [4]:
# Restrict the analysis to participant rows; .copy() gives an independent frame
# so the new column below is not written into df_all.
df = df_all[df_all["Speaker"] == "Participant"].copy()
#df = df[df["Experiment"] == "Compassion"]

# Filler words removed in addition to whatever preprocess_text already drops.
extra_stopwords = ['yeah', 'yes', 'like', 'you', 'know','really', 'okay','mm', 'one','mean','first']

# Preprocess the text
df['preprocessed_content'] = df['Content'].apply(
    lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, ngrams=1)
)

# Disabled step: merge rows that share the same file and utterance index.
# Kept as real comments instead of a triple-quoted string, because a string
# literal as the last expression of a cell is echoed back as the cell output.
# grouped_df = df.groupby(['File Name','utterance_index']).agg({
#     'preprocessed_content': ' '.join,  # Combine preprocessed text
#     'Content': ' '.join,  # Combine raw text
#     'Experiment': 'first',   # Keep the first (consistent if File Name is unique)
# }).reset_index()
# grouped_df

"grouped_df = df.groupby(['File Name','utterance_index']).agg({\n    'preprocessed_content': ' '.join,  # Combine preprocessed text\n    'Content': ' '.join,  # Combine raw text\n    'Experiment': 'first',   # Keep the first (consistent if File Name is unique)         \n}).reset_index()\ngrouped_df"

## Main topic

In [5]:
# Documents for topic modelling: the raw 'Content' text of every row in df.
docs = df["Content"].tolist()
print(len(docs))

2225


In [6]:
# Hyperparameters: topic keywords are built from unigrams and bigrams.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))

# NOTE(review): no random seed is set anywhere in this cell, so the fitted
# topics may differ between runs -- confirm whether that is acceptable.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    verbose=True,
)
topics, ini_probs = topic_model.fit_transform(docs)

# Subtract one for the -1 topic that appears in the topic info table.
num_topics = len(topic_model.get_topics()) - 1
num_topics

2024-11-21 13:53:27,652 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 70/70 [00:14<00:00,  4.95it/s]
2024-11-21 13:53:44,214 - BERTopic - Embedding - Completed ✓
2024-11-21 13:53:44,215 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-21 13:54:04,623 - BERTopic - Dimensionality - Completed ✓
2024-11-21 13:54:04,623 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-21 13:54:04,753 - BERTopic - Cluster - Completed ✓
2024-11-21 13:54:04,762 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-21 13:54:04,946 - BERTopic - Representation - Completed ✓


48

In [7]:
# Write the full topic summary table to "all.csv" in the working directory,
# then display its first rows.
topic_model.get_topic_info().to_csv("all.csv")
topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,746,-1_like_to_the_it,"[like, to, the, it, and, was, you, of, that, but]","[and in the second task, in the second meditat..."
1,0,129,0_body_my body_my_felt,"[body, my body, my, felt, that, was, like, of,...",[i knew it was my body but i just imagined it ...
2,1,87,1_forest_the forest_the_in,"[forest, the forest, the, in, environment, in ...","[it was so nice to be in the forest., i felt l..."
3,2,63,2_yeah yeah_yeah_so yeah_yeah so,"[yeah yeah, yeah, so yeah, yeah so, yeah right...","[yeah, yeah., yeah, yeah., yeah, yeah.]"
4,3,62,3_focus_about_on_distracted,"[focus, about, on, distracted, focusing, focus...",[that at some point i forget about my body and...


In [8]:
# Bar charts of 10 words per topic for 16 topics of the fitted model.
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [64]:
# Topic heatmap rendered at 1000x1000 with n_clusters=9.
# NOTE(review): execution count is out of sequence with neighbouring cells --
# confirm this cell still runs in order after Restart & Run All.
topic_model.visualize_heatmap(n_clusters=9, width=1000, height=1000)

In [65]:
# Overview plot of the fitted topics (library defaults).
topic_model.visualize_topics()

In [None]:
# Plot the individual documents that the model was fitted on.
topic_model.visualize_documents(docs)

In [12]:
# Topic frequencies per experiment; docs was built from df.Content, so the
# class labels taken from the same frame line up with the documents.
topics_per_class = topic_model.topics_per_class(docs, classes=df["Experiment"])
topic_model.visualize_topics_per_class(
    topics_per_class,
    top_n_topics=10,
    normalize_frequency=True,
)

3it [00:00, 10.24it/s]


## Play with Hyperparameters

In [13]:
# Rebuild the document list from df_all (not the participant-only df).
docs = df_all["Content"].tolist()
print(len(docs))

4031


In [14]:
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# Three alternative keyword representations computed for every topic.
main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
# A list of representation models: KeyBERT-style candidates (30 words)
# followed by MMR with diversity 0.5.
aspect_representation_model2 = [
    KeyBERTInspired(top_n_words=30),
    MaximalMarginalRelevance(diversity=.5),
]

# The keys become the column names later requested from get_topic_info().
representation_model = {
    "Main": main_representation_model,
    "Aspect1": aspect_representation_model1,
    "Aspect2": aspect_representation_model2,
}

# Unigrams/bigrams, English stop words removed, terms kept only when min_df=5 is met.
vectorizer_model = CountVectorizer(
    min_df=5,
    stop_words='english',
    ngram_range=(1, 2),
)

topic_model = BERTopic(
    nr_topics='auto',
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

topics, ini_probs = topic_model.fit_transform(docs)

In [15]:
def get_topic_stats(topic_model, extra_cols=None):
    """Summarize topic sizes with percentage and cumulative-percentage shares.

    Args:
        topic_model: Object exposing ``get_topic_info()`` that returns a
            DataFrame with at least the 'Topic', 'Count', 'Name' and
            'Representation' columns.
        extra_cols: Optional column name, or iterable of column names, from
            the topic info table to append to the result. ``None`` (the
            default) appends nothing.

    Returns:
        DataFrame sorted by 'Count' in descending order with the columns
        'Topic', 'Count', 'Share', 'CumulativeShare', 'Name',
        'Representation', followed by the requested extra columns.
        'Share' and 'CumulativeShare' are percentages of the total count.
    """
    # Normalize extra_cols: avoid a shared mutable default and accept tuples,
    # other iterables, or a single column name instead of only lists.
    if extra_cols is None:
        extra_cols = []
    elif isinstance(extra_cols, str):
        extra_cols = [extra_cols]
    else:
        extra_cols = list(extra_cols)

    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending=False)
    total_count = topics_info_df['Count'].sum()
    topics_info_df['Share'] = 100. * topics_info_df['Count'] / total_count
    topics_info_df['CumulativeShare'] = 100. * topics_info_df['Count'].cumsum() / total_count
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare',
                           'Name', 'Representation'] + extra_cols]

# Ten largest topics, indexed by topic id, including both aspect columns.
(
    get_topic_stats(topic_model, ['Aspect1', 'Aspect2'])
    .head(10)
    .set_index('Topic')
)

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation,Aspect1,Aspect2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,1271,31.530638,31.530638,-1_felt like_felt_feel_feel like,"[felt like, felt, feel, feel like, feeling, se...","[body, bit, things, kind, feeling, thing, litt...","[felt, sensation, feeling like, experience, th..."
0,471,11.684446,43.215083,0_experience_felt like_felt really_did feel,"[experience, felt like, felt really, did feel,...","[experience, body, nice, feeling, cool, real, ...","[experience, felt like, felt, feel body, like ..."
1,351,8.707517,51.9226,1_okay okay_okay_okay yeah_yeah okay,"[okay okay, okay, okay yeah, yeah okay, okay j...","[okay, fine, course, sorry, good, sure, new, g...","[okay im, yes okay, fine, like okay, sure, oka..."
2,114,2.828082,54.750682,2_meditation_felt_like feeling_sensation,"[meditation, felt, like feeling, sensation, wa...","[meditation, feeling, normal, experience, task...","[meditation, sensation, like feel, body, effec..."
3,102,2.530389,57.281072,3_questions_ask_asked_okay just,"[questions, ask, asked, okay just, okay maybe,...","[questions, question, small, thoughts, great, ...","[ask, okay maybe, great okay, feedback, talk, ..."
4,83,2.059042,59.340114,4_eyes closed_close eyes_eyes_closed,"[eyes closed, close eyes, eyes, closed, focus,...","[eyes, closed, open, close, lot, condition, ti...","[eyes closed, focus, relax, meditation, though..."
5,82,2.034235,61.374349,5_thinking_like think_thoughts_mind,"[thinking, like think, thoughts, mind, think, ...","[focus, mind, stuff, lot, image, beginning, mo...","[thinking, like think, relax, focus, concentra..."
6,78,1.935004,63.309353,6_thats nice_nice okay_nice_great okay,"[thats nice, nice okay, nice, great okay, yeah...","[nice, great, good, interesting, cool, positiv...","[thats nice, nice okay, yeah great, okay yeah,..."
7,72,1.786157,65.09551,7_felt_like body_body body_scene,"[felt, like body, body body, scene, body like,...","[scene, image, real, body, end, normal, sessio...","[felt, scene, body like, normal, feeling like,..."
8,71,1.76135,66.856859,8_different_bit different_difference_changes,"[different, bit different, difference, changes...","[different, difference, bit, sort, life, posit...","[bit different, changes, like quite, kind like..."


In [16]:
# Number of topics in the refitted model, minus one for the -1 topic that
# appears in the topic statistics table.
num_topics = len(topic_model.get_topics()) - 1
num_topics

63

In [17]:
# Bar charts of 10 words per topic for 16 topics of the refitted model.
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# NOTE: despite the "distance" naming used throughout this cell, the values are
# cosine *similarities* of the topic embeddings, so larger means more alike.
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(
    distance_matrix,
    index=topic_model.topic_labels_.values(),
    columns=topic_model.topic_labels_.values(),
)

# Unpivot the square matrix into one record per (row topic, column topic) pair,
# walking the matrix row by row.
pair_records = [
    {'topic1': row_label, 'topic2': col_label, 'distance': score}
    for row_label, row_scores in dist_df.iterrows()
    for col_label, score in row_scores.items()
]
pair_dist_df = pd.DataFrame(pair_records)

# Drop every pair that involves the -1 topic.
involves_outlier = (
    pair_dist_df.topic1.str.startswith('-1')
    | pair_dist_df.topic2.str.startswith('-1')
)
pair_dist_df = pair_dist_df[~involves_outlier]

# Keep each unordered pair once (and drop self-pairs) via a strict ordering of
# the label strings.
pair_dist_df = pair_dist_df[pair_dist_df.topic1 < pair_dist_df.topic2]
pair_dist_df.sort_values('distance', ascending = False).head(20)

Unnamed: 0,topic1,topic2,distance
790,11_second_second time_difference_scene,21_second time_second session_second_yeah second,0.71431
858,12_just general_just like_general_like yeah,25_explain_understand_understood_tell,0.710718
2540,38_color_colors_different_bit different,43_light_colors_color_did notice,0.677473
1073,15_relaxed_relax_calm_feel like,48_calm_relax_okay_felt,0.673938
109,0_experience_felt like_felt really_did feel,44_yeah felt_feel like_felt like_felt,0.672791
74,0_experience_felt like_felt really_did feel,9_strange_bit strange_weird_normal,0.67028
72,0_experience_felt like_felt really_did feel,7_felt_like body_body body_scene,0.663319
135,1_okay okay_okay_okay yeah_yeah okay,6_thats nice_nice okay_nice_great okay,0.659395
586,8_different_bit different_difference_changes,9_strange_bit strange_weird_normal,0.654075
176,1_okay okay_okay_okay yeah_yeah okay,47____,0.651914


In [19]:
# Approximate per-document topic distributions with window = 4; with
# calculate_tokens=True the call also returns token-level distributions.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 4, calculate_tokens=True)

In [20]:
# Display the approximated topic distribution array computed above.
topic_distr

array([[0.10487924, 0.        , 0.        , ..., 0.07352498, 0.06612508,
        0.        ],
       [0.18705952, 0.        , 0.        , ..., 0.03906357, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.17757246, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
# Plot entry 5 of topic_distr, passing min_probability=0.05
# (presumably hides topics below that probability -- TODO confirm).
topic_model.visualize_distribution(topic_distr[5], min_probability=0.05)

## Topic distributions (Split each doc into tokens)

In [22]:
# Peek at the first row of df_all to recall the available columns.
df_all.head(1)

Unnamed: 0,Experiment,File Name,Id,Start Time,End Time,Speaker,Content,Condition,Order Condition,utterance_index
0,Compassion,S301final,301,"00:00:00,060","00:00:12,100",Interviewer,and you can just a little bit tell about your ...,1,Unknown,0


In [23]:
# Work on an independent copy of df_all (all speakers). Without .copy(), `df`
# is just another name for df_all, so adding the column below would silently
# mutate df_all and may raise SettingWithCopyWarning, since df_all is itself a
# filtered frame.
df = df_all.copy()  # participant-only variant: df_all[df_all["Speaker"] == "Participant"].copy()
#df = df[df["Experiment"] == "Compassion"]

# Filler words removed in addition to whatever preprocess_text already drops.
extra_stopwords = {'yeah', 'yes', 'like', 'you', 'know', 'um', 'uh', 'really', 'okay','mm'}
# Preprocess the text
df['preprocessed_content'] = df['Content'].apply(
    lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, ngrams=1)
)

# Collapse rows that share the same file and utterance index into one document.
grouped_df = df.groupby(['File Name','utterance_index']).agg({
    'preprocessed_content': ' '.join,  # Combine preprocessed text
    'Content': ' '.join,  # Combine raw text
    'Experiment': 'first',   # Keep the first (consistent if File Name is unique)
}).reset_index()

In [24]:
# One document per grouped utterance (raw text, not the preprocessed column).
docs = grouped_df["Content"].tolist()
print(len(docs))

1776


In [25]:
# Fit a BERTopic model with default settings on the grouped utterances.
# NOTE(review): this rebinds topic_model, replacing the multi-aspect model
# fitted earlier in the notebook.
topic_model = BERTopic().fit(docs)
# Subtract one for the -1 topic listed in the topic info table.
num_topics = len(topic_model.get_topics()) - 1
num_topics

36

In [26]:
# First ten rows of the topic summary table for the default-settings model.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,716,-1_it_the_was_like,"[it, the, was, like, and, you, to, of, that, but]","[the first time i was suprise, because i din't..."
1,0,77,0_yeah_exactly_course_yes,"[yeah, exactly, course, yes, absolutely, missi...","[yeah., yeah., yeah.]"
2,1,76,1_yourself_you_where_body,"[yourself, you, where, body, did, saw, outside...",[i mean it's strange experience. so it's fine ...
3,2,56,2_okay_mm_ok_confused,"[okay, mm, ok, confused, perfect, all, thats, ...","[okay., okay., okay.]"
4,3,49,3_you_feel_back_when,"[you, feel, back, when, go, out, body, is, sep...",[think it's. i don't know how to feel about it...
5,4,49,4_questions_okay_any_have,"[questions, okay, any, have, other, want, thou...",[do you have any questions? do you have any qu...
6,5,42,5_okay_nice_thats_yeah,"[okay, nice, thats, yeah, ah, oh, yes, great, ...","[okay. okay. okay., okay, okay., okay yeah okay.]"
7,6,42,6_blinking_particles_those_changing,"[blinking, particles, those, changing, changes...","[yes, and those particles, like, did they get ..."
8,7,40,7_delay_time_without_was,"[delay, time, without, was, moving, it, there,...",[there was a bit of a delay like there was a h...
9,8,39,8_focus_my_like_on,"[focus, my, like, on, was, legs, to, but, and,...","[yeah, i think it's because i was really, real..."


In [27]:
# Recompute the approximate topic distributions for the newly fitted model,
# this time with window = 5; token-level distributions are returned as well.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

In [28]:
# Plot entry 1 of topic_distr using the default probability threshold.
topic_model.visualize_distribution(topic_distr[1])

In [29]:
# Index into docs of the document inspected at token level in the next cell.
doc_id = 9

In [30]:
# Calculate the topic distributions on a token-level.
# NOTE: no window is passed here, so the library default is used rather than
# the window = 5 of the earlier call; topic_distr is overwritten accordingly.
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions for the selected document.
# The result gets its own name instead of `df`, so the utterance DataFrame
# bound to `df` is not replaced by a different kind of object (which would
# break any cell that still expects `df` to be that DataFrame).
token_distr_view = topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])
token_distr_view


Unnamed: 0,so,felt,like,time,lag,in,the,last,one,of,what,was,experiencing,and,what.1,was.1,seeing,but,never,felt.1,like.1,had,three,bodies
1_yourself_you_where_body,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3_you_feel_back_when,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126,0.126,0.126,0.126,0.0,0.114,0.114,0.114,0.114,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6_blinking_particles_those_changing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116,0.116,0.116,0.116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7_delay_time_without_was,0.21,0.335,0.467,0.597,0.387,0.262,0.13,0.0,0.119,0.119,0.119,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8_focus_my_like_on,0.135,0.135,0.135,0.135,0.0,0.0,0.0,0.0,0.152,0.152,0.152,0.152,0.0,0.146,0.303,0.414,0.517,0.482,0.325,0.214,0.111,0.0,0.0,0.0
10_meditation_how_did_the,0.128,0.128,0.128,0.128,0.0,0.116,0.226,0.334,0.466,0.351,0.241,0.132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12_forest_the_in_very,0.0,0.0,0.0,0.0,0.0,0.102,0.102,0.102,0.225,0.123,0.123,0.123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13_eyes_open_closed_close,0.117,0.117,0.117,0.117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_how_experience_was_session,0.118,0.118,0.118,0.118,0.0,0.0,0.0,0.0,0.166,0.166,0.166,0.166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16_easier_difficult_do_to,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147,0.147,0.147,0.147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
