In [101]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from bertopic import BERTopic 
from transformers import *

sys.path.append(os.path.dirname(os.path.abspath('..')))
from utils.text_analysis_functions import data_cleaning
from utils.modeling_helpers import summarize_doc

In [2]:
# Project root: three directory levels above the notebook's working directory.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Build every location with os.path.join so the separators match the host OS
# (hard-coded "\\" only resolves on Windows).
modeling_dir = os.path.join(project_root, "notebooks", "data_processing", "modeling")
working_data_dir = os.path.join(project_root, "working_data")

bertopic_path = os.path.join(modeling_dir, "BERTopic_model")          # saved BERTopic model
exploded_path = os.path.join(modeling_dir, "exploded_chunks.pkl")     # pickled chunk-level frame
data_path = os.path.join(working_data_dir, "transformed_dataset.csv")  # input dataset
labeled_data_path = os.path.join(working_data_dir, "labeled_dataset.csv")  # output with topic labels
embeddings_path = os.path.join(working_data_dir, "my_data_embeddings.npy")  # saved embeddings array

In [None]:
# Load the tokenizer published under the "nlpaueb/bert-base-greek-uncased-v1"
# checkpoint name; use_fast=True asks for the fast tokenizer implementation.
# NOTE(review): AutoTokenizer is presumably provided by the star import from
# `transformers`; neither name defined here is referenced in the visible cells.
tokenizer = AutoTokenizer.from_pretrained(
    "nlpaueb/bert-base-greek-uncased-v1",
    use_fast=True)

# Instance of the project's data_cleaning helper (utils.text_analysis_functions).
cleaning_object = data_cleaning()

In [None]:
# Chunk-level frame plus the embedding array saved for it.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
data_exploded = pd.read_pickle(exploded_path)
embeddings = np.load(embeddings_path)

# Restore the saved topic model and assign topics to the chunks, handing over
# the stored embeddings alongside the chunk texts.
topic_model = BERTopic.load(bertopic_path)
topics, probs = topic_model.transform(data_exploded["chunks"], embeddings=embeddings)

# Keep, per chunk, the assigned topic and the largest probability reported for it.
data_exploded = data_exploded.assign(
    topic=topics,
    topic_prob=[chunk_probs.max() for chunk_probs in probs],
)

# Collapse the chunk-level rows to one summary per doc_id via summarize_doc.
def _summarize_group(doc_chunks):
    """Summarize the chunks of a single document with the loaded topic model."""
    return summarize_doc(doc_chunks, topic_model)


doc_topics = data_exploded.groupby("doc_id").apply(_summarize_group).reset_index()

# Attach the per-document topic summary to the raw rows. The row index of the
# raw CSV serves as doc_id; the left join keeps every raw row.
data = pd.read_csv(data_path).assign(doc_id=lambda frame: frame.index)
data_labeled = pd.merge(data, doc_topics, on="doc_id", how="left")





In [5]:
# Persist the labeled frame; index=False keeps the row index out of the CSV.
data_labeled.to_csv(labeled_data_path, index=False)

### Load the labeled dataset

In [6]:
# Read the labeled CSV back from disk; the cells below work on this copy.
my_dataset = pd.read_csv(labeled_data_path)

In [None]:
# Preview the first three rows of the labeled dataset.
my_dataset.head(3)

In [8]:
# Number of documents whose dominant topic is either -1 or missing.
no_topic_mask = my_dataset["dominant_topic"].isna() | my_dataset["dominant_topic"].eq(-1)
int(no_topic_mask.sum())

9293

### Topics

In [None]:
# Bar chart of documents per dominant topic, leaving out topic -1.
fig, ax = plt.subplots()
topic_sizes = my_dataset.loc[my_dataset["dominant_topic"] != -1, "dominant_topic"].value_counts()
topic_sizes.plot(kind='bar', ax=ax)

Share of documents with a confident topic prediction (topic probability of at least 0.82)

In [11]:
# Fraction of all rows whose topic_prob reaches the 0.82 threshold.
confident_mask = my_dataset["topic_prob"] >= 0.82
int(confident_mask.sum()) / len(my_dataset)

0.5166289687526179

### Thematic Analysis

In [56]:
# Human-readable topic names, assigned positionally to the rows of the grouped
# table below (which groupby sorts by dominant_topic, then topic_words).
# "-" marks topics that were left unnamed.
# NOTE(review): assumes the first row is topic -1 and each topic id has exactly
# one topic_words value — confirm against the fitted model.
topics_names = ["Outliers",
                "Social Roles and Functions",
                "Divinity as Collective Identity",
                "Religious conservativism",
                "Emotionally triggered",
                "Fear & Panic",
                "Knowledge Power",
                "-",
                "Egalitarian",
                "-",
                "-",
                "Emotionally triggered",
                "-",
                "Equality hides in the details"]

# One row per (dominant_topic, topic_words) pair with its document count.
# groupby drops rows whose dominant_topic is missing.
unique_pairs = (
    my_dataset[["dominant_topic", "doc_id", "topic_words"]]
    .groupby(["dominant_topic", "topic_words"])
    .size()
    .reset_index(name="Count")
)

# Fail with a clear message instead of pandas' generic length-mismatch error
# when the model's topics and the hand-written names drift apart.
if len(unique_pairs) != len(topics_names):
    raise ValueError(
        f"Expected {len(topics_names)} topic rows to match topics_names, "
        f"found {len(unique_pairs)}"
    )

# Share of ALL documents (including those without a dominant topic). The total
# is read from the data so it cannot go stale when the dataset is regenerated.
unique_pairs["Frequency (%)"] = (unique_pairs["Count"] / len(my_dataset)) * 100

unique_pairs["Topic name"] = topics_names
unique_pairs = unique_pairs.rename(columns={"dominant_topic": "Topic No", "topic_words": "Top 10 words"})
# Displayed topic number is the row position (0 for the first row).
unique_pairs["Topic No"] = unique_pairs.index

In [None]:
# Show the topic summary table (number, top words, count, frequency, name).
unique_pairs 

In [None]:
# Emit the same table as LaTeX source for copy-pasting.
print(unique_pairs.to_latex()) 

In [None]:
# Lookup of topic names keyed on "Topic No" minus one, which is then matched
# against dominant_topic.
# NOTE(review): this presumes Topic No (the row position in unique_pairs) is
# always dominant_topic + 1, i.e. ids start at -1 and are contiguous — confirm.
for_merge = unique_pairs[["Topic name", "Topic No"]].assign(
    **{"Topic No": lambda names: names["Topic No"] - 1}
)

# Left join keeps every document; the helper key column is dropped afterwards.
my_dataset_new = my_dataset.merge(
    for_merge, left_on="dominant_topic", right_on="Topic No", how="left"
).drop(columns="Topic No")
my_dataset_new.head(2)

In [None]:
# Documents per named topic; rows labeled "-" (unnamed topics) are left out.
topic_labels = my_dataset_new["Topic name"]
counts = topic_labels[topic_labels != "-"].value_counts()
total = counts.sum()

fig, ax = plt.subplots(figsize=(10, 6))
counts.plot(kind="barh", ax=ax)

# Write each bar's share of the plotted total just past the end of the bar.
for bar in ax.patches:
    bar_length = bar.get_width()
    if bar_length != 0:
        share = 100 * bar_length / total
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.annotate(
            f"{share:.1f}%",
            (bar_length, bar_middle),
            ha="left", va="center",
            xytext=(3, 0), textcoords="offset points",
        )

# Flip the axis so the first (largest) entry is drawn at the top.
ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Print number, name and top words for the rows labeled 0 through 15.
first_topics = unique_pairs.loc[0:15]
for topic_no, topic_name, top_words in zip(
    first_topics["Topic No"], first_topics["Topic name"], first_topics["Top 10 words"]
):
    print(topic_no, topic_name)
    print(top_words)

In [None]:
# For every real topic (not -1, not missing) print up to 20 comments whose
# topic probability exceeds 0.95, sampled with a fixed seed.
for topic in my_dataset["dominant_topic"].unique():
    if pd.isna(topic) or topic == -1:
        continue

    is_confident_member = (my_dataset["dominant_topic"] == topic) & (my_dataset["topic_prob"] > 0.95)
    topic1_rows = my_dataset.loc[is_confident_member, ["comment_id", "text"]]
    sampled = topic1_rows.sample(n=min(20, len(topic1_rows)), random_state=42)

    print(f"Topic {topic} samples:")
    # NOTE(review): `num` is the frame's index label, so the printed number is
    # "row label + 1", not a running 1..n counter — confirm that is intended.
    for num, comment_id, text in sampled.itertuples(index=True, name=None):
        print(f"{num+1}. (ID: {comment_id}) {text}")
    print("\n")

In [None]:
# Split the documents by topic status, then report the share without any topic.
dominant = my_dataset["dominant_topic"]
na_s = my_dataset[dominant.isna()]                           # no dominant topic at all
minus_ones = my_dataset[dominant == -1]                      # assigned to topic -1
valids = my_dataset[dominant.notna() & (dominant != -1)]     # any other topic
len(na_s) / len(my_dataset)

0.1533467370361062

### Over time

In [43]:
# Re-read the raw data and copy its date / like columns onto the chunk-level
# frame, matching on doc_id (the raw row index).
data = pd.read_csv(data_path)
data["doc_id"] = data.index

date_columns = ["doc_id", "date", "date_mini", "like_scaled_norm"]
exploded = pd.merge(data_exploded.copy(), data[date_columns], on="doc_id", how="left")

# The join must not add or drop chunks: one row per stored embedding.
assert len(exploded) == embeddings.shape[0]

In [44]:
# Keep only chunks that have a topic and pull out the three aligned columns
# used for the over-time analysis.
for_time = exploded.loc[exploded["topic"].notna()]
comments, timestamps, topics = (for_time[column] for column in ("text", "date", "topic"))

In [45]:
# Pass the topic assignments that belong to `comments` explicitly. Without
# `topics`, BERTopic falls back to the assignments stored on the model, which
# are not guaranteed to line up row-for-row with `comments` / `timestamps`.
# A plain list is used because BERTopic truth-tests this argument, which a
# pandas Series does not support; rows without a topic were filtered out
# beforehand, so the integer cast is safe.
topics_over_time = topic_model.topics_over_time(
    comments,
    timestamps,
    topics=topics.astype(int).tolist(),
    nr_bins=20,
)

In [None]:
# True: plot each topic's share per time bucket; False: plot the raw sums.
NORMALIZE_TO_SHARE = True

df = my_dataset_new.copy()

# Unparseable dates become NaT and those rows are dropped.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Rows without a topic name are labeled 'Unknown', then removed together with
# the outlier and unnamed ('-') topics.
df['Topic name'] = df['Topic name'].fillna('Unknown').astype(str)
df = df[~df['Topic name'].isin(['Outliers', '-', 'Unknown'])].copy()

# Non-numeric or missing weights count as zero.
df['like_scaled'] = pd.to_numeric(df['like_scaled'], errors='coerce').fillna(0.0)

# One fixed color per topic so every figure uses the same mapping.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
# Matplotlib 3.9; it returns the same colormap resampled to the given size.
topics_all = sorted(df['Topic name'].unique())
cmap = plt.get_cmap('tab20', len(topics_all))
TOPIC_COLORS = {t: cmap(i) for i, t in enumerate(topics_all)}

def weighted_topic_over_time(data: pd.DataFrame, freq: str, normalize: bool=False, topics=None) -> pd.DataFrame:
    """
    Sum `like_scaled` per time bucket and topic.

    Parameters
    ----------
    data : DataFrame with a datetime 'date' column, a 'Topic name' column and
        a numeric 'like_scaled' column.
    freq : pandas offset alias used to bucket 'date' (e.g. 'YS', 'MS', 'W-MON').
    normalize : if True, each row is divided by its row total (shares);
        rows whose total is zero become all zeros.
    topics : column names (and order) of the result. Topics absent from `data`
        are added as zero columns. Defaults to the module-level `topics_all`.

    Returns
    -------
    Wide DataFrame: index = time bucket, columns = topics, values = sum of
    `like_scaled` (or shares when `normalize` is True).
    """
    if topics is None:
        topics = topics_all

    weights = (
        data
        .groupby([pd.Grouper(key='date', freq=freq), 'Topic name'])['like_scaled']
        .sum()
        .unstack(fill_value=0.0)
        .sort_index()
        # Fix the column set and order; missing topics are filled with zeros.
        .reindex(columns=list(topics), fill_value=0.0)
    )

    if normalize:
        row_sums = weights.sum(axis=1)
        # Mask zero totals with NaN (keeps a float dtype) so the division
        # yields NaN instead of inf, then turn those rows into zeros.
        weights = weights.div(row_sums.where(row_sums != 0), axis=0).fillna(0.0)
    return weights

def plot_weighted(counts: pd.DataFrame, title: str, normalize: bool=False):
    """
    Draw one line per column of `counts` against its index and show the figure.

    `counts` is a wide frame (index = time bucket, columns = topic names).
    Line colors are looked up in the module-level TOPIC_COLORS mapping.
    `normalize` only selects the y-axis label.
    """
    y_label = "Share" if normalize else "Weighted frequency (sum of like_scaled)"

    fig, ax = plt.subplots(figsize=(12, 6))
    for topic_name, series in counts.items():
        ax.plot(counts.index, series, label=topic_name, linewidth=2, color=TOPIC_COLORS[topic_name])

    ax.set_title(title, fontsize=16, pad=12)
    ax.set_xlabel("Date", fontsize=13)
    ax.set_ylabel(y_label, fontsize=13)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
    ax.tick_params(axis='x', rotation=45)
    # Legend sits outside the axes, to the right of the plot area.
    ax.legend(title="Topic name", bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)

    plt.tight_layout()
    plt.show()

# Wording shared by all figure titles, depending on the normalization switch.
measure_label = 'shares' if NORMALIZE_TO_SHARE else 'weighted frequencies'

# One figure per time resolution over the whole dataset.
resolutions = {'YS': 'Yearly', 'MS': 'Monthly', 'W-MON': 'Weekly (Mon)'}
for freq, label in resolutions.items():
    counts = weighted_topic_over_time(df, freq=freq, normalize=NORMALIZE_TO_SHARE)
    plot_weighted(counts, f"Topic {measure_label} — {label}", normalize=NORMALIZE_TO_SHARE)

# One weekly figure per value of the 'period' column.
for period_value, g in df.groupby('period'):
    if not g.empty:
        counts = weighted_topic_over_time(g, freq='W-MON', normalize=NORMALIZE_TO_SHARE)
        plot_weighted(counts, f"Weekly topic {measure_label} — period: {period_value}", normalize=NORMALIZE_TO_SHARE)