In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [243]:
from utils.analysis_helpers import *
from bertopic import BERTopic

# ! python -m spacy download en_core_web_sm
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# Define the Docs

In [None]:
# Path pattern handed to load_and_combine_csv (the `**` suggests recursive
# globbing over sub-folders -- confirm in the helper).
directory = '../interviews_corrected/6_final/**/'

# Load the CSV files, then run the project's standardisation step on the result.
df_all = standardize_data(load_and_combine_csv(directory))

print(f"Unique conditions before filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews before filtering: {df_all['File Name'].nunique()}")

# Condition 0 marks segments that are not a "real" interview (e.g. setup phase,
# small talk), so those rows are dropped.
df_all = df_all[df_all["Condition"].ne(0)]

print(f"Unique conditions after filtering: {df_all['Condition'].unique()}")
print(f"Number of interviews (File Name) after filtering: {df_all['File Name'].nunique()}")

In [317]:
# Preprocessing
# For the moment, keep only the participants' answers from a single experiment.
# Both filters are applied before .copy() so that `df` is an independent frame
# and the column assignment below does not operate on a slice of `df_all`
# (which would raise SettingWithCopyWarning).
df = df_all[(df_all["Speaker"] == "Participant") & (df_all["Experiment"] == "OBE2")].copy()

# According to the original notes, preprocess_text removes NLTK stop words and
# lowercases + lemmatizes the text (behaviour lives in utils.analysis_helpers).
# extra_stopwords adds the most frequent corpus words that carry no meaning for
# the topics (meaningful frequent words such as "body" are deliberately kept).
# FIX: "mm" and "really" were previously written as "mm""really" (missing comma),
# which Python concatenates into the single bogus entry "mmreally".
# NOTE(review): entries such as "I've" / "that's" contain capitals/apostrophes;
# if preprocess_text lowercases and strips punctuation before matching they may
# never match -- confirm against the helper.
extra_stopwords = ["yeah", "like", "think", "know", "dont", "yes", "okay", "mm", "really", "bit", "could", "that's", "exactly",
                   "see", "feel", "felt", "ah", "oh", "also", "I've", "maybe", "was", "thinking", "thought", "thing", "part", "would", "said",
                   "one", "first", "second", "meditation"]
df['preprocessed_content'] = df['Content'].apply(lambda x: preprocess_text(x, extra_stopwords=extra_stopwords, ngrams=1))

# Remove rows whose content is empty (or whitespace only) after preprocessing.
# .copy() keeps `df` independent so later cells can add columns without warnings.
df = df[df['preprocessed_content'].str.strip().str.len() > 0].copy()
# File S225 is removed because it is empty after preprocessing (only two utterances, not meaningful words for topics)

In [None]:
# Sanity check: display the first row of the preprocessed frame.
df.head(1)

In [319]:
# Disabled alternative: merge rows sharing the same ('File Name', 'utterance_index')
# into a single document by joining their text, keeping the first value of the
# metadata columns. Left commented out, so documents stay at row granularity.
# df = df.groupby(['File Name',"utterance_index"]).agg({ #,
#     'preprocessed_content': ' '.join,  # Combine preprocessed text
#     'Content': ' '.join,  # Combine raw text
#     'Experiment': 'first',   # Keep the first 
#     'Condition': 'first',   # Keep the first
#     'Id': 'first',   # Keep the first
# }).reset_index()
# df

In [None]:
# Documents fed to the topic model: one preprocessed string per row of `df`,
# in row order (later cells rely on this alignment when writing topics back).
docs = df["preprocessed_content"].tolist()
print(len(docs))

In [None]:
# Look at rows whose preprocessed text still contains a specific word.
# The pattern matches the word as a whole whitespace-delimited token, including
# when it is the first or last token of the document (the previous " one "
# substring test required a space on both sides and missed those rows).
df[df['preprocessed_content'].str.contains(r"(?:^|\s)one(?:\s|$)", regex=True)]

In [None]:
# 20% of the number of documents in the corpus.
0.2*len(docs)

In [None]:
# Percentage of the corpus that 20 documents represent.
(20 * 100) / len(docs)

# Define the model

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans



# Pre-calculate the document embeddings once with the sentence-transformer model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Dimensionality reduction applied before clustering; seeded for reproducibility.
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Clustering step. The variable keeps the name `hdbscan_model` because that is the
# BERTopic constructor argument it is passed to, but it currently holds a KMeans
# model with a fixed number of clusters.
# FIX: random_state is set so that cluster (topic) assignments are reproducible
# across runs, matching the seeded UMAP step above.
hdbscan_model = KMeans(n_clusters=10, random_state=42)
#hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Topic representations: one main representation plus two alternative "aspects".
main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
# A list chains representation models: KeyBERTInspired candidates are passed on
# to MaximalMarginalRelevance.
aspect_representation_model2 = [KeyBERTInspired(top_n_words=30), 
                                MaximalMarginalRelevance(diversity=.5)]

representation_model = {
   "Main": main_representation_model,
   "Aspect1":  aspect_representation_model1,
   "Aspect2":  aspect_representation_model2 
}

In [324]:
# Assemble the BERTopic pipeline from the components configured in the previous cell.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    # Hyperparameters
    verbose=True,
    # Currently disabled options:
    # min_topic_size=5,
    # n_gram_range=(1, 2),
)

In [None]:
# Fit the topic model and get one topic id per document.
# FIX: the embeddings pre-computed with the same SentenceTransformer are passed
# in, so BERTopic does not encode every document a second time.
topics, ini_probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Number of real topics, excluding the outlier topic -1 only when it exists.
# FIX: the previous `len(get_topics()) - 1` always subtracted one, which
# under-counts when the clustering model produces no -1 topic (as with KMeans).
num_topics = sum(1 for topic_id in topic_model.get_topics() if topic_id != -1)
num_topics

In [None]:
# Fetch the per-topic summary table once, save it to CSV (without the index),
# and show it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info.csv", index=False)
topic_info

In [None]:
# Share of documents assigned to topic -1 (presumably BERTopic's outlier topic).
# NOTE(review): the clustering model configured earlier is KMeans; if it yields no
# -1 topic this expression is likely empty -- confirm before relying on it.
topic_model.get_topic_info(-1).Count / len(docs)

In [None]:
# Bar charts of the top 10 words for up to 16 topics.
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

In [None]:
# BERTopic's built-in overview plot of the fitted topics.
topic_model.visualize_topics()

In [259]:
# Store the single topic id that fit_transform assigned to each document
# (`topics` is aligned with `docs`, which was built from `df` row by row).
df['one_topic'] = topics

# Lookup built from the topic summary table.
# NOTE: despite its name, `topic_name_to_id` maps topic id -> topic name; the
# name is kept because later cells reference it.
topic_info = topic_model.get_topic_info()
topic_name_to_id = dict(zip(topic_info.Topic, topic_info.Name))

# Human-readable topic name for each document.
df['one_topic_name'] = df['one_topic'].map(topic_name_to_id)

# Topic distribution (more than one topic per document)
- Goal: reduce the number of outliers

### Look at topic similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# Pairwise cosine similarity between the topic embeddings, labelled with the
# model's topic labels.
# NOTE: the "distance" naming is kept for compatibility, but the values are
# cosine similarities (higher = more similar), hence the descending sort below.
topic_labels = list(topic_model.topic_labels_.values())
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, index=topic_labels, columns=topic_labels)

# Flatten the square matrix into one record per ordered (topic1, topic2) pair.
tmp = [
    {'topic1': row['index'], 'topic2': other_topic, 'distance': value}
    for row in dist_df.reset_index().to_dict('records')
    for other_topic, value in row.items()
    if other_topic != 'index'
]
pair_dist_df = pd.DataFrame(tmp)

# Keep pairs where neither label starts with '-1', and where topic1 sorts
# strictly before topic2 (drops self-pairs and mirrored duplicates).
topic1_is_regular = ~pair_dist_df.topic1.str.startswith('-1')
topic2_is_regular = ~pair_dist_df.topic2.str.startswith('-1')
topic1_before_topic2 = pair_dist_df.topic1 < pair_dist_df.topic2
pair_dist_df = pair_dist_df[topic1_is_regular & topic2_is_regular & topic1_before_topic2]

# Show the 20 most similar topic pairs.
pair_dist_df.sort_values('distance', ascending=False).head(20)

### Multiple topics

In [None]:
# Approximate a topic distribution for every document (window=5), also
# requesting the token-level output (calculate_tokens=True).
# `topic_distr` is iterated per document in the following cells.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 5, calculate_tokens=True)

In [None]:
import tqdm
import plotly.express as px

# For a range of probability thresholds, compute how many topics each document
# has with probability >= threshold, and plot how the share of documents with
# 0, 1, 2, 3, 4 or 5+ topics changes with the threshold.
tmp_dfs = []

# Per-document topic probabilities as a 2-D array (documents x topics) so the
# count per document can be computed with one vectorised comparison.
topic_distr_arr = np.asarray(topic_distr)

# iterating through different threshold levels
for thr in tqdm.tqdm(np.arange(0, 0.35, 0.001)):
    # number of topics with probability >= threshold for each document
    tmp_df = pd.DataFrame({
        'num_topics': (topic_distr_arr >= thr).sum(axis=1),
        'num_docs': 1,
    })

    # bucket documents with five or more topics together
    tmp_df['num_topics_group'] = tmp_df['num_topics']\
        .map(lambda x: str(x) if x < 5 else '5+')

    # aggregating stats: number of documents per bucket at this threshold
    tmp_df_aggr = tmp_df.groupby('num_topics_group', as_index = False).num_docs.sum()
    tmp_df_aggr['threshold'] = thr

    tmp_dfs.append(tmp_df_aggr)

# one row per threshold, one column per bucket; missing buckets count as 0
num_topics_stats_df = pd.concat(tmp_dfs).pivot(index = 'threshold', 
                              values = 'num_docs',
                              columns = 'num_topics_group').fillna(0)

# convert counts to row-wise percentages (each threshold row sums to 100)
num_topics_stats_df = num_topics_stats_df.div(num_topics_stats_df.sum(axis = 1), axis = 0) * 100.

# visualisation
# FIX: the value axis previously read "share of reviews, %"; the corpus consists
# of interview documents, so the label now says "documents".
colormap = px.colors.sequential.YlGnBu
px.area(num_topics_stats_df, 
       title = 'Distribution of number of topics',
       labels = {'num_topics_group': 'number of topics',
                'value': 'share of documents, %'},
       color_discrete_map = {
          '0': colormap[0],
          '1': colormap[3],
          '2': colormap[4],
          '3': colormap[5],
          '4': colormap[6],
          '5+': colormap[7]
      })

In [263]:
threshold = 0.13

# For each document, keep the column indices of `topic_distr` whose
# probability is at least `threshold` (>=, not strictly greater).
# NOTE(review): these indices are later looked up as topic ids -- confirm that
# the columns of `topic_distr` are ordered by topic id.
df['multiple_topics'] = [
    [topic_idx for topic_idx, prob in enumerate(doc_topic_distr) if prob >= threshold]
    for doc_topic_distr in topic_distr
]

# Long-format dataset with one row per (document, topic). The raw 'Content'
# text is used as the document id; documents without any topic above the
# threshold get the single placeholder topic -1.
tmp_data = [
    {
        'topic': topic,
        'id': content,
    }
    for content, doc_topics in zip(df['Content'], df['multiple_topics'])
    for topic in (doc_topics if len(doc_topics) != 0 else [-1])
]

mult_topics_df = pd.DataFrame(tmp_data)

# Translate every topic id into its name; ids missing from the lookup become "No topic".
df["multiple_topics_name"] = df["multiple_topics"].map(
    lambda doc_topics: [topic_name_to_id.get(topic_id, "No topic") for topic_id in doc_topics]
)


In [None]:
# Display the approximate per-document topic distribution computed above.
topic_distr

# Compute the proportion of topics per participant

In [None]:
# Preview the first five rows of the frame with the topic columns added so far.
df.head(5)

In [269]:
# Export the frame, including the topic columns, to CSV without the index column.
df.to_csv("df_topic.csv", index = False)

In [None]:
# Display the topic lookup (built from get_topic_info(): Topic id -> Name,
# despite the variable's name).
topic_name_to_id

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count documents per (Experiment, topic name), then spread the topic names
# across columns; combinations that never occur are filled with 0.
# NOTE(review): the section heading mentions participants, but the grouping key
# is "Experiment" -- confirm this is the intended level of aggregation.
counts_one_topic = df.groupby(["Experiment", "one_topic_name"]).size()
counts_one_topic_pivot = counts_one_topic.unstack(fill_value=0)

# Row-wise percentages: every experiment row sums to 100%.
row_totals = counts_one_topic_pivot.sum(axis=1)
perc_one_topic = counts_one_topic_pivot.div(row_totals, axis=0).mul(100)

# Stacked bar chart of the topic shares, with the legend anchored at the
# top-right corner of the axes.
fig, ax = plt.subplots(figsize=(10, 6))
perc_one_topic.plot.bar(stacked=True, ax=ax)
ax.set_ylabel("Percentage of Documents (%)")
ax.set_title("Percentage of one_topic_name by Experiment")
ax.legend(bbox_to_anchor=(1.0, 1.0))
plt.show()
