In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils.analysis_helpers import *
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


# Define the Docs

In [3]:
# Location of the final corrected interview CSVs
# (presumably interpreted as a glob pattern by load_and_combine_csv -- confirm in utils.analysis_helpers)
directory = '../interviews_corrected/6_final/**/'

# Load every CSV into one frame, then apply the project's standardisation step
df_all = standardize_data(load_and_combine_csv(directory))

# Condition values before filtering
print(df_all["Condition"].unique())
# Keep only the rows whose Condition is not 0
df_all = df_all.loc[df_all["Condition"].ne(0)]
# Condition values after filtering
print(df_all["Condition"].unique())

Found 82 CSV files.
Standardized speaker labels.
Normalized text in 'Content' column.
[1 'C' 'I' 0]
[1 'C' 'I']


In [4]:
# Preprocessing
# For now, only the participants' answers are analysed
df = df_all.loc[df_all["Speaker"] == "Participant"].copy()

# Author's notes on preprocess_text (defined in utils.analysis_helpers, not visible here):
#   - stop words are removed using the NLTK stop-word list
#   - all text is lowercased and lemmatized
# extra_stopwords adds the most frequent corpus words that carry no meaning
# (meaningful frequent words such as "body" are deliberately kept).
extra_stopwords = ["yeah", "like", "think", "know", "dont","yes", "one","okay","mm", "really","bit","could","thats","see","feel","felt"]
df['preprocessed_content'] = df['Content'].apply(
    lambda text: preprocess_text(text, extra_stopwords=extra_stopwords, ngrams=1)
)

# Drop rows whose preprocessed text is empty or whitespace-only
df = df[df['preprocessed_content'].str.strip().str.len().gt(0)]

In [5]:
# Documents fed to the topic model: one preprocessed participant utterance each
docs = df['preprocessed_content'].tolist()
print(len(docs))

1971


# Define the model

In [6]:
# The spaCy model used below must be installed first:
# ! python -m spacy download en_core_web_sm
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# One main representation plus two alternative "aspects" of each topic
main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
# A list of representation models: KeyBERTInspired followed by MMR
aspect_representation_model2 = [
    KeyBERTInspired(top_n_words=30),
    MaximalMarginalRelevance(diversity=0.5),
]

representation_model = dict(
    Main=main_representation_model,
    Aspect1=aspect_representation_model1,
    Aspect2=aspect_representation_model2,
)

# Options the author left disabled:
# embedding_model = "paraphrase-MiniLM-L6-v2" Modify this to use a different model
# vectorizer_model = CountVectorizer(ngram_range=(1, 2))
# BERTopic(..., vectorizer_model=vectorizer_model, min_topic_size=5)
# NOTE(review): no random seed is fixed anywhere in this cell; if the default
# dimensionality-reduction step is stochastic, topics may differ between runs -- confirm.
topic_model = BERTopic(representation_model=representation_model, verbose=True)

In [7]:
# Fit the model and get one topic id (and probability) per document
topics, ini_probs = topic_model.fit_transform(docs)
# Count the real topics. The outlier topic is keyed -1 and is only present when
# some documents were left unassigned, so filter it out explicitly instead of
# subtracting 1 unconditionally.
num_topics = sum(1 for topic_id in topic_model.get_topics() if topic_id != -1)
num_topics

2024-11-25 15:51:42,457 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 62/62 [00:03<00:00, 15.79it/s]
2024-11-25 15:51:48,719 - BERTopic - Embedding - Completed ✓
2024-11-25 15:51:48,719 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-25 15:52:02,668 - BERTopic - Dimensionality - Completed ✓
2024-11-25 15:52:02,668 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-25 15:52:02,738 - BERTopic - Cluster - Completed ✓
2024-11-25 15:52:02,748 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-25 15:52:14,163 - BERTopic - Representation - Completed ✓


58

In [8]:
# Save the full topic overview to disk and preview the first ten rows
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info.csv")
topic_info.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Aspect1,Aspect2,Representative_Docs
0,-1,385,-1_focus_feeling_watching_thought,"[focus, feeling, watching, thought, emotion, s...","[first, front, control, time, body, technical,...","[focus, sitting, scene, selflove, dream, mood,...",[right actually make realise im actually way b...
1,0,96,0_thinking_mind_feeling_thought,"[thinking, mind, feeling, thought, telepathy, ...","[sort, thinking, thought, sensation, relate, m...","[thinking, telepathy, mindbody, introspective,...",[sort situation sort become introspective im i...
2,1,76,1_body_seeing_consciousness_soul,"[body, seeing, consciousness, soul, vision, mi...","[body, mirror, representation, empty, middle, ...","[body, consciousness, vision, mirror, staring,...","[seeing body, seeing body, seeing body mirror ..."
3,2,73,2_meditation_meditate_meditating_mindfulness,"[meditation, meditate, meditating, mindfulness...","[meditation, meditating, journey, travel, comp...","[meditation, yoga, practice, compassionate, gu...","[meditation, meditation, meditation]"
4,3,73,3_distracted_focus_focused_focusing,"[distracted, focus, focused, focusing, distrac...","[focus, distracted, concentrate, leg, focused,...","[distracted, focusing, concentrate, blurry, me...","[got distracted, eye closed got distracted wou..."
5,4,62,4_outside_surrounding_inside_scenery,"[outside, surrounding, inside, scenery, beach,...","[environment, room, place, mountain, nature, h...","[outside, scenery, beach, surroundings, nature...","[im outside im outside body, outside, outside]"
6,5,54,5_eye_closedcircuit_closing_staring,"[eye, closedcircuit, closing, staring, closed,...","[eye, closed, open, close, closing, harder, ea...","[closedcircuit, stare, focused, screen, easier...","[first part eye closed, first eye closed time,..."
7,6,51,6_sound_audio_hearing_voice,"[sound, audio, hearing, voice, noise, listenin...","[voice, sound, follow, bird, instruction, guid...","[hearing, voice, whispering, bird, instruction...",[bird sound forest reminding bird sound home w...
8,7,48,7_touch_touched_touching_tapping,"[touch, touched, touching, tapping, back, body...","[touch, touching, body, image, delay, real, sc...","[touch, jump, reacted, experienced, delay, sme...","[touch, touched back also touching back, touch..."
9,8,46,8_reality_vr_illusion_virtual,"[reality, vr, illusion, virtual, imagination, ...","[reality, virtual, illusion, kind, dress, phys...","[reality, vr, illusion, immersive, perspective...",[sophisticated virtual reality stuff people 10...


In [9]:
# Bar charts of 10 words each for the top 16 topics
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [None]:
#topic_model.visualize_heatmap(n_clusters=9, width=1000, height=1000)

In [None]:
# Overview plot of the fitted topics
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_documents(docs)

In [16]:
# Hierarchy plot of the fitted topics
topic_model.visualize_hierarchy()


In [10]:
# Store the single topic id that fit_transform assigned to each document
df['basic_topic'] = topics

# Topic Distribution (more than one topic per document)
Used to reduce the number of outliers

In [11]:
def get_topic_stats(topic_model, extra_cols=None):
    """Return per-topic counts with their share and cumulative share of documents.

    Parameters
    ----------
    topic_model : object
        Anything exposing ``get_topic_info()`` that returns a DataFrame with at
        least the columns 'Topic', 'Count', 'Name' and 'Representation'.
    extra_cols : iterable of str, str, or None, optional
        Additional columns of the topic-info frame to append to the result.
        ``None`` (the default) adds nothing; a single string is treated as one
        column name.

    Returns
    -------
    pandas.DataFrame
        Topics sorted by 'Count' (descending) with columns 'Topic', 'Count',
        'Share', 'CumulativeShare', 'Name', 'Representation' followed by
        ``extra_cols``. 'Share' and 'CumulativeShare' are percentages of the
        total count over all rows returned by ``get_topic_info()``.
    """
    # Normalise extra_cols to a fresh list: avoids a shared mutable default and
    # lets callers pass a tuple, an Index or a single column name.
    if extra_cols is None:
        extra_cols = []
    elif isinstance(extra_cols, str):
        extra_cols = [extra_cols]
    else:
        extra_cols = list(extra_cols)

    # sort_values returns a new frame, so adding columns below does not touch
    # whatever get_topic_info() returned.
    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending = False)
    total_count = topics_info_df['Count'].sum()
    topics_info_df['Share'] = 100.*topics_info_df['Count']/total_count
    topics_info_df['CumulativeShare'] = 100.*topics_info_df['Count'].cumsum()/total_count
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare',
                           'Name', 'Representation'] + extra_cols]

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# Pairwise cosine similarity between topic embeddings, labelled by topic name.
# NOTE: the variables and the 'distance' column keep their original names, but
# the values are similarities -- higher means the two topics are closer.
topic_names = topic_model.topic_labels_.values()
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, columns=topic_names, index=topic_names)

# Flatten the square matrix into one record per (topic1, topic2) pair
tmp = [
    {'topic1': row['index'], 'topic2': other_topic, 'distance': score}
    for row in dist_df.reset_index().to_dict('records')
    for other_topic, score in row.items()
    if other_topic != 'index'
]
pair_dist_df = pd.DataFrame(tmp)

# Discard every pair that involves the outlier topic (labels starting with '-1')
involves_outlier = (
    pair_dist_df.topic1.map(lambda label: label.startswith('-1'))
    | pair_dist_df.topic2.map(lambda label: label.startswith('-1'))
)
pair_dist_df = pair_dist_df[~involves_outlier]

# Keep each unordered pair once (this also removes the self-pairs on the diagonal)
pair_dist_df = pair_dist_df[pair_dist_df.topic1 < pair_dist_df.topic2]

# Twenty most similar topic pairs
pair_dist_df.sort_values('distance', ascending = False).head(20)

Unnamed: 0,topic1,topic2,distance
79,0_thinking_mind_feeling_thought,19_moment_imagine_mean_dreaming,0.717763
1453,23_nice_great_good_cute,36_fine_ok_good_able,0.692685
95,0_thinking_mind_feeling_thought,35_guess_reaction_clue_case,0.687575
966,15_question_answer_understand_understood,21_say_told_said_tell,0.676203
117,0_thinking_mind_feeling_thought,57_surroundings_attention_mind_observe,0.67097
1466,23_nice_great_good_cute,49_cool_nice__,0.662225
61,0_thinking_mind_feeling_thought,1_body_seeing_consciousness_soul,0.660022
68,0_thinking_mind_feeling_thought,8_reality_vr_illusion_virtual,0.659438
2230,36_fine_ok_good_able,46_im_youre_na_gon,0.655459
702,10_second_third_first_previous,52_difference_compare_faster_first,0.654287


In [13]:
# Approximate a per-document distribution over topics, in addition to the single
# topic id from fit_transform; calculate_tokens=True also returns a token-level
# distribution. window = 5 is passed straight to BERTopic -- presumably the
# sliding-window size in tokens; confirm against the BERTopic docs.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

100%|██████████| 2/2 [00:00<00:00, 16.17it/s]


In [14]:
import tqdm
import plotly.express as px

# For a range of probability thresholds, count how many topics each document
# would keep (topics whose approximate probability is >= threshold), then plot
# the share of documents having 0, 1, 2, 3, 4 or 5+ topics at every threshold.
topic_distr_arr = np.asarray(topic_distr)

tmp_dfs = []

# iterating through different threshold levels
for thr in tqdm.tqdm(np.arange(0, 0.35, 0.001)):
    # number of topics with probability >= threshold for each document
    # (vectorised over the documents x topics matrix)
    tmp_df = pd.DataFrame({'num_topics': (topic_distr_arr >= thr).sum(axis=1)})
    tmp_df['num_docs'] = 1

    # bucket documents with five or more topics together
    tmp_df['num_topics_group'] = tmp_df['num_topics']\
        .map(lambda x: str(x) if x < 5 else '5+')

    # aggregating stats
    tmp_df_aggr = tmp_df.groupby('num_topics_group', as_index = False).num_docs.sum()
    tmp_df_aggr['threshold'] = thr

    tmp_dfs.append(tmp_df_aggr)

# one row per threshold, one column per topic-count group
num_topics_stats_df = pd.concat(tmp_dfs).pivot(index = 'threshold',
                              values = 'num_docs',
                              columns = 'num_topics_group').fillna(0)

# convert counts to percentages of documents per threshold (row totals computed once)
row_totals = num_topics_stats_df.sum(axis = 1)
num_topics_stats_df = (100.*num_topics_stats_df).div(row_totals, axis = 0)

# visualisation
colormap = px.colors.sequential.YlGnBu
px.area(num_topics_stats_df,
       title = 'Distribution of number of topics',
       labels = {'num_topics_group': 'number of topics',
                'value': 'share of documents, %'},
       color_discrete_map = {
          '0': colormap[0],
          '1': colormap[3],
          '2': colormap[4],
          '3': colormap[5],
          '4': colormap[6],
          '5+': colormap[7]
      })

100%|██████████| 350/350 [00:07<00:00, 49.46it/s]


In [28]:
threshold = 0.35

# For every document, keep the indices of the topics whose approximate
# probability is at least `threshold`
df['multiple_topics'] = [
    [topic_idx for topic_idx, prob in enumerate(doc_topic_distr) if prob >= threshold]
    for doc_topic_distr in topic_distr
]

# Long-format table with one row per (document, topic) pair. Documents with no
# topic above the threshold get the outlier topic -1. The 'id' column holds the
# document's original 'Content' text.
tmp_data = [
    {'topic': topic, 'id': rec['Content']}
    for rec in df.to_dict('records')
    for topic in (rec['multiple_topics'] if len(rec['multiple_topics']) != 0 else [-1])
]

mult_topics_df = pd.DataFrame(tmp_data)

In [None]:
# add topic names
mult_topics_df = mult_topics_df.merge(
    topic_model.get_topic_info().rename(columns = {'Topic': 'topic'}),
    on = 'topic', how = 'left'
)

mult_topics_df = mult_topics_df[['topic', 'id', 'Name']]
mult_topics_df

Unnamed: 0,topic,Count,id,Name
0,2,73,the points i think are like meditating.,2_meditation_meditate_meditating_mindfulness
1,38,15,i had some the gear on me and that was a littl...,38_little_tiny_smaller_less
2,8,46,and also while i had my eyes closed i could di...,8_reality_vr_illusion_virtual
3,-1,385,but on the other side while i was watching the...,-1_focus_feeling_watching_thought
4,4,62,but it was still staying in this room and it w...,4_outside_surrounding_inside_scenery
...,...,...,...,...
2251,2,73,to be honest when i was out of my body i was m...,2_meditation_meditate_meditating_mindfulness
2252,9,43,yeah so it was a better meditating experience.,9_experience_experienced_experientially_learned
2253,3,73,"probably yeah, i think i was less concentrate ...",3_distracted_focus_focused_focusing
2254,14,33,what was the question about like having three ...,14_three_threebody_third_two


In [30]:
# count the number of documents per topic
mult_topics_df['Count'] = 1
mult_topics_df.groupby(['topic', 'Name'], as_index = False).Count.sum()

Unnamed: 0,topic,Name,Count
0,-1,-1_focus_feeling_watching_thought,293
1,0,0_thinking_mind_feeling_thought,66
2,1,1_body_seeing_consciousness_soul,59
3,2,2_meditation_meditate_meditating_mindfulness,48
4,3,3_distracted_focus_focused_focusing,70
5,4,4_outside_surrounding_inside_scenery,53
6,5,5_eye_closedcircuit_closing_staring,51
7,6,6_sound_audio_hearing_voice,52
8,7,7_touch_touched_touching_tapping,44
9,8,8_reality_vr_illusion_virtual,43


In [34]:
# see the docs from one specific topic
topic_id = 1
mult_topics_df[mult_topics_df.topic == topic_id].to_csv(f"topic_{topic_id}.csv")

# Compute the proportion of topics per participant

In [None]:
# Assign topics back to the DataFrame
df['Topic'] = topics
df

In [None]:
# Number of segments per topic, as a two-column frame (Topic, Count)
topic_counts = (
    df['Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name='Count')
)

# Share of all segments that falls into each topic
total_segments = len(df)
topic_counts['Proportion'] = topic_counts['Count'].div(total_segments)

In [None]:
# Group by 'Id' and 'Topic' to calculate the counts
participant_topic_counts = df.groupby(['Id', 'Topic']).size().reset_index(name='Count')

# Calculate proportion of each topic for each participant using transform
participant_topic_counts['Proportion'] = (
    participant_topic_counts.groupby('Id')['Count'].transform(lambda x: x / x.sum())
)