In [39]:
pd.read_excel("samples_cleaned.xlsx")["topics"]

0            sanctions
1      unjustified_war
2                  NaN
3        people_killed
4            sanctions
            ...       
101      people_killed
102    unjustified_war
103    unjustified_war
104    unjustified_war
105      arms_delivery
Name: topics, Length: 106, dtype: object

In [143]:
from src.utility import save_csv_with_embeddings, load_samples_with_numpy
import pandas as pd
from src.explorativ_analysis_05 import split_strings_to_list
from src.stop_words import stop_words

df = pd.read_excel("samples_cleaned.xlsx")

df = df[~df["topics"].isna()]
df = df[
    df["topics"].isin(
        ["sanctions", "unjustified_war", "arms_delivery", "people_killed"]
    )
]

to_transform = [
    "lemmas",
    "adjs_verbs",
    "nouns",
    "translated_lemmas",
    "translated_adjs_verbs",
    "translated_nouns",
]

for col in to_transform:
    df["listed_" + col] = df[col].apply(split_strings_to_list)

df["listed_stop_word_removed_lemmas"] = df.apply(
    lambda x: [
        word for word in x["listed_lemmas"] if not word in stop_words[x["lang"]]
    ],
    axis=1,
)
df["listed_translated_stop_word_removed_lemmas"] = df.apply(
    lambda x: [
        word for word in x["listed_translated_lemmas"] if not word in stop_words["en"]
    ],
    axis=1,
)

df["stop_word_removed_lemmas"] = df["listed_stop_word_removed_lemmas"].str.join(" ")
df["translated_stop_word_removed_lemmas"] = df[
    "listed_translated_stop_word_removed_lemmas"
].str.join(" ")

for col in to_transform:
    df[col] = df[col].apply(split_strings_to_list).str.join(" ")

df = df[
    [
        "text",
        "translated",
        "cleaned_text_translated",
        "cleaned_text",
        "lemmas",
        "adjs_verbs",
        "nouns",
        "stop_word_removed_lemmas",
        "translated_lemmas",
        "translated_adjs_verbs",
        "translated_nouns",
        "translated_stop_word_removed_lemmas",
        "topics",
        "listed_stop_word_removed_lemmas",
        "listed_translated_stop_word_removed_lemmas",
        "listed_lemmas",
        "listed_adjs_verbs",
        "listed_nouns",
        "listed_translated_lemmas",
        "listed_translated_adjs_verbs",
        "listed_translated_nouns",
    ]
]

top_to_numbers = {
    "sanctions": 0,
    "unjustified_war": 1,
    "people_killed": 2,
    "arms_delivery": 3,
}
df["topics"] = df["topics"].apply(lambda x: top_to_numbers[x])
print(df.shape)
df.columns

(100, 21)


Index(['text', 'translated', 'cleaned_text_translated', 'cleaned_text',
       'lemmas', 'adjs_verbs', 'nouns', 'stop_word_removed_lemmas',
       'translated_lemmas', 'translated_adjs_verbs', 'translated_nouns',
       'translated_stop_word_removed_lemmas', 'topics',
       'listed_stop_word_removed_lemmas',
       'listed_translated_stop_word_removed_lemmas', 'listed_lemmas',
       'listed_adjs_verbs', 'listed_nouns', 'listed_translated_lemmas',
       'listed_translated_adjs_verbs', 'listed_translated_nouns'],
      dtype='object')

## Calculate ALL the embeddings

In [30]:
from sentence_transformers import SentenceTransformer

models = {}

models["model_labse"] = SentenceTransformer("sentence-transformers/LaBSE")
models["model_parahprase_min"] = SentenceTransformer(
    "paraphrase-multilingual-MiniLM-L12-v2"
)
models["model_parahprase_max"] = SentenceTransformer(
    "paraphrase-multilingual-mpnet-base-v2"
)
models["eng_model"] = SentenceTransformer("all-MiniLM-L6-v2")

In [140]:
# lda_spaces = [
#     "lemmas",
#     "adjs_verbs",
#     "nouns",
#     "translated_lemmas",
#     "translated_adjs_verbs",
#     "translated_nouns",
# ]

lda_spaces = [
    "translated_stop_word_removed_lemmas",
    "stop_word_removed_lemmas",
]

lda_spaces = ["listed_" + x for x in lda_spaces]

In [132]:
# preprocessing_steps = [
#     "text",
#     "translated",
#     "cleaned_text_translated",
#     "cleaned_text",
#     "lemmas",
#     "adjs_verbs",
#     "nouns",
#     "translated_lemmas",
#     "translated_adjs_verbs",
#     "translated_nouns",
# ]

preprocessing_steps = [
    "translated_stop_word_removed_lemmas",
    "stop_word_removed_lemmas",
]

In [133]:
preprocessing_steps

['translated_stop_word_removed_lemmas', 'stop_word_removed_lemmas']

In [144]:
from tqdm import tqdm


for preprocessing in tqdm(preprocessing_steps):
    for model_name, model in models.items():
        df[f"{model_name}_{preprocessing}_embeddings"] = df[preprocessing].apply(
            model.encode
        )

100%|██████████| 2/2 [00:28<00:00, 14.50s/it]


In [137]:
df.columns

Index(['text', 'translated', 'cleaned_text_translated', 'cleaned_text',
       'lemmas', 'adjs_verbs', 'nouns', 'stop_word_removed_lemmas',
       'translated_lemmas', 'translated_adjs_verbs', 'translated_nouns',
       'translated_stop_word_removed_lemmas', 'topics', 'listed_lemmas',
       'listed_adjs_verbs', 'listed_nouns', 'listed_translated_lemmas',
       'listed_translated_adjs_verbs', 'listed_translated_nouns',
       'model_labse_translated_stop_word_removed_lemmas_embeddings',
       'model_parahprase_min_translated_stop_word_removed_lemmas_embeddings',
       'model_parahprase_max_translated_stop_word_removed_lemmas_embeddings',
       'eng_model_translated_stop_word_removed_lemmas_embeddings',
       'model_labse_stop_word_removed_lemmas_embeddings',
       'model_parahprase_min_stop_word_removed_lemmas_embeddings',
       'model_parahprase_max_stop_word_removed_lemmas_embeddings',
       'eng_model_stop_word_removed_lemmas_embeddings'],
      dtype='object')

In [100]:
embeddings_columns = [
    "model_labse_text_embeddings",
    "model_parahprase_min_text_embeddings",
    "model_parahprase_max_text_embeddings",
    "eng_model_text_embeddings",
    "model_labse_translated_embeddings",
    "model_parahprase_min_translated_embeddings",
    "model_parahprase_max_translated_embeddings",
    "eng_model_translated_embeddings",
    "model_labse_cleaned_text_translated_embeddings",
    "model_parahprase_min_cleaned_text_translated_embeddings",
    "model_parahprase_max_cleaned_text_translated_embeddings",
    "eng_model_cleaned_text_translated_embeddings",
    "model_labse_cleaned_text_embeddings",
    "model_parahprase_min_cleaned_text_embeddings",
    "model_parahprase_max_cleaned_text_embeddings",
    "eng_model_cleaned_text_embeddings",
    "model_labse_lemmas_embeddings",
    "model_parahprase_min_lemmas_embeddings",
    "model_parahprase_max_lemmas_embeddings",
    "eng_model_lemmas_embeddings",
    "model_labse_adjs_verbs_embeddings",
    "model_parahprase_min_adjs_verbs_embeddings",
    "model_parahprase_max_adjs_verbs_embeddings",
    "eng_model_adjs_verbs_embeddings",
    "model_labse_nouns_embeddings",
    "model_parahprase_min_nouns_embeddings",
    "model_parahprase_max_nouns_embeddings",
    "eng_model_nouns_embeddings",
    "model_labse_translated_lemmas_embeddings",
    "model_parahprase_min_translated_lemmas_embeddings",
    "model_parahprase_max_translated_lemmas_embeddings",
    "eng_model_translated_lemmas_embeddings",
    "model_labse_translated_adjs_verbs_embeddings",
    "model_parahprase_min_translated_adjs_verbs_embeddings",
    "model_parahprase_max_translated_adjs_verbs_embeddings",
    "eng_model_translated_adjs_verbs_embeddings",
    "model_labse_translated_nouns_embeddings",
    "model_parahprase_min_translated_nouns_embeddings",
    "model_parahprase_max_translated_nouns_embeddings",
    "eng_model_translated_nouns_embeddings",
]

len(embeddings_columns)

40

In [138]:
embeddings_columns = [
    "model_labse_translated_stop_word_removed_lemmas_embeddings",
    "model_parahprase_min_translated_stop_word_removed_lemmas_embeddings",
    "model_parahprase_max_translated_stop_word_removed_lemmas_embeddings",
    "eng_model_translated_stop_word_removed_lemmas_embeddings",
    "model_labse_stop_word_removed_lemmas_embeddings",
    "model_parahprase_min_stop_word_removed_lemmas_embeddings",
    "model_parahprase_max_stop_word_removed_lemmas_embeddings",
    "eng_model_stop_word_removed_lemmas_embeddings",
]

In [139]:
import itertools
import numpy as np


def map_to_numbers(topic_a: list[int], topic_b: list[int]):
    permutations = [
        {-1: -1, **{key: val for key, val in zip((0, 1, 2, 3), perm)}}
        for perm in list(itertools.permutations([0, 1, 2, 3]))
    ]

    dist = [
        (np.array(topic_a) == np.array([perm[x] for x in topic_b])).sum()
        for perm in permutations
    ]
    permutation = permutations[dist.index(max(dist))]

    return (
        np.array(topic_a) == np.array([permutation[x] for x in topic_b])
    ).sum(), permutation


def get_best_topics(topics: list[list[int]], topic: list[int]) -> tuple[list[int], int]:

    distances = [map_to_numbers(top, topic)[0] for top in topics]
    best_topic = topics[distances.index(max(distances))]

    dist, permutation = map_to_numbers(topic, best_topic)

    return [permutation[x] for x in best_topic], dist

In [103]:
df_topics = df[["topics", "translated"]].copy(deep=True)

In [145]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import stop_words
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

count_vect = CountVectorizer(
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
    lowercase=False,
    stop_words=list(stop_words.STOP_WORDS),
)

best_lda_topics = []

for lda_space in lda_spaces:
    X_tf = count_vect.fit_transform(df[lda_space].str.join(" "))

    num_topics = 4

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10)

    lda_topic_list = []
    for _ in range(1000):
        lda.fit(X_tf)

        lda_topics = [lda.transform(x).argmax() for x in X_tf]
        lda_topic_list.append(lda_topics)
    best_topic, dist = get_best_topics(lda_topic_list, df["topics"].to_list())

    print(f"{lda_space} score: {dist}")

    best_lda_topics.append({lda_space: best_topic})

listed_translated_stop_word_removed_lemmas score: 48
listed_stop_word_removed_lemmas score: 47


In [125]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import stop_words
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

count_vect = CountVectorizer(
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
    lowercase=False,
    stop_words=list(stop_words.STOP_WORDS),
)

best_lda_topics = []

for lda_space in lda_spaces:
    X_tf = count_vect.fit_transform(df[lda_space].str.join(" "))

    num_topics = 4

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10)

    lda_topic_list = []
    for _ in range(100):
        lda.fit(X_tf)

        lda_topics = [lda.transform(x).argmax() for x in X_tf]
        lda_topic_list.append(lda_topics)
    best_topic, dist = get_best_topics(lda_topic_list, df["topics"].to_list())

    print(f"{lda_space} score: {dist}")

    best_lda_topics.append({lda_space: best_topic})

listed_lemmas score: 43
listed_adjs_verbs score: 43
listed_nouns score: 43
listed_translated_lemmas score: 45
listed_translated_adjs_verbs score: 46
listed_translated_nouns score: 46


In [146]:
for name, topics in {
    name: topics for x in best_lda_topics for name, topics in x.items()
}.items():
    df_topics[name + "_lda"] = topics

In [113]:
random_topics = [
    np.random.choice([0, 1, 2, 3], size=(100), replace=True, p=None).tolist()
    for _ in range(1000)
]
best_topic, dist = get_best_topics(random_topics, df["topics"].to_list())
df_topics["random_topics_best_of_1000"] = best_topic
print(dist)

41


In [114]:
random_topics = [
    np.random.choice([0, 1, 2, 3], size=(100), replace=True, p=None).tolist()
    for _ in range(100)
]
best_topic, dist = get_best_topics(random_topics, df["topics"].to_list())
df_topics["random_topics_best_of_100"] = best_topic
print(dist)

39


In [147]:
from tqdm import tqdm
from bertopic import BERTopic
import numpy as np
from sklearn.cluster import KMeans
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

best_kmeans_topics = {}
cluster_model = KMeans(n_clusters=4)
topic_model = BERTopic(hdbscan_model=cluster_model)


for embedding in embeddings_columns:

    kmeans_topics_list = []

    for _ in range(100):
        topics, probs = topic_model.fit_transform(
            df["text"].to_list(),
            embeddings=np.stack(df[embedding].to_list(), axis=0),
        )
        kmeans_topics_list.append(topics)

    best_topic, dist = get_best_topics(kmeans_topics_list, df["topics"].to_list())

    print(f"{embedding} score: {dist}")

    best_kmeans_topics[embedding] = best_topic

model_labse_translated_stop_word_removed_lemmas_embeddings score: 81
model_parahprase_min_translated_stop_word_removed_lemmas_embeddings score: 83
model_parahprase_max_translated_stop_word_removed_lemmas_embeddings score: 86
eng_model_translated_stop_word_removed_lemmas_embeddings score: 83
model_labse_stop_word_removed_lemmas_embeddings score: 79
model_parahprase_min_stop_word_removed_lemmas_embeddings score: 79
model_parahprase_max_stop_word_removed_lemmas_embeddings score: 83
eng_model_stop_word_removed_lemmas_embeddings score: 40


In [150]:
for name, topics in best_kmeans_topics.items():
    df_topics["kmeans_" + name] = topics

  df_topics["kmeans_" + name] = topics


In [148]:
from tqdm import tqdm
from bertopic import BERTopic
import numpy as np
from hdbscan import HDBSCAN


best_bert_topics = {}
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    max_cluster_size=40,
    min_samples=3,
)
topic_model = BERTopic(hdbscan_model=hdbscan_model)
for embedding in embeddings_columns:

    bert_topics_list = []

    while len(bert_topics_list) < 100:
        topics, probs = topic_model.fit_transform(
            df["text"].to_list(),
            embeddings=np.stack(df[embedding].to_list(), axis=0),
        )
        if len(set(topics)) == 5 and (np.array(topics) == -1).sum() < 15:
            bert_topics_list.append(topics)

        if len(set(topics)) == 4:
            topics = [x + 1 for x in topics]
            bert_topics_list.append(topics)

    best_topic, dist = get_best_topics(bert_topics_list, df["topics"].to_list())

    print(f"{embedding} score: {dist}")

    best_bert_topics[embedding] = best_topic

model_labse_translated_stop_word_removed_lemmas_embeddings score: 76
model_parahprase_min_translated_stop_word_removed_lemmas_embeddings score: 78
model_parahprase_max_translated_stop_word_removed_lemmas_embeddings score: 78
eng_model_translated_stop_word_removed_lemmas_embeddings score: 84
model_labse_stop_word_removed_lemmas_embeddings score: 72
model_parahprase_min_stop_word_removed_lemmas_embeddings score: 74
model_parahprase_max_stop_word_removed_lemmas_embeddings score: 80
eng_model_stop_word_removed_lemmas_embeddings score: 39


In [151]:
for name, topics in best_bert_topics.items():
    df_topics["hdbscan_" + name] = topics

  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics
  df_topics["hdbscan_" + name] = topics


In [154]:
df_topics.shape

(100, 109)

In [155]:
df_topics.to_csv(
    "/Users/robinfeldmann/TopicAnalysisRUWTweets/DataGitHub/CaseStudyEval/topics_2.csv"
)

In [126]:
df_topics

Unnamed: 0,topics,translated,listed_lemmas_lda,listed_adjs_verbs_lda,listed_nouns_lda,listed_translated_lemmas_lda,listed_translated_adjs_verbs_lda,listed_translated_nouns_lda,random_topics,random_topics_best_of_1000,...,hdbscan_model_parahprase_max_translated_lemmas_embeddings,hdbscan_eng_model_translated_lemmas_embeddings,hdbscan_model_labse_translated_adjs_verbs_embeddings,hdbscan_model_parahprase_min_translated_adjs_verbs_embeddings,hdbscan_model_parahprase_max_translated_adjs_verbs_embeddings,hdbscan_eng_model_translated_adjs_verbs_embeddings,hdbscan_model_labse_translated_nouns_embeddings,hdbscan_model_parahprase_min_translated_nouns_embeddings,hdbscan_model_parahprase_max_translated_nouns_embeddings,hdbscan_eng_model_translated_nouns_embeddings
0,0,#UkraineWar: The #EU #sanctions against #Russi...,0,0,2,3,0,0,1,0,...,0,0,3,3,1,3,0,3,0,0
1,1,"Vogt-Wuchter: ""We are shocked by the war in Uk...",1,0,3,2,1,1,1,1,...,-1,3,3,3,1,3,1,3,1,-1
3,2,The Interior Ministry of #Ukraine reported on ...,2,0,0,2,0,2,2,2,...,2,2,3,0,1,2,3,2,2,2
4,0,@Federal Chancellor STOP imports from Russia i...,1,1,0,0,2,0,0,0,...,0,0,2,0,0,0,0,0,0,0
5,0,#Ukraine\nEveryone complains about high gas pr...,1,1,0,0,2,2,3,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,2,Ukraine: at least two civilians killed and 10 ...,1,1,2,2,2,2,2,2,...,2,2,0,0,1,2,2,3,3,2
102,1,ALERT - The International Criminal Court issue...,1,1,1,2,1,2,3,1,...,1,1,1,0,1,1,1,1,0,1
103,1,🔴 #UkraineRussianWar\nURGENT‼️\nThe Internatio...,1,1,1,3,1,2,1,1,...,1,1,1,0,1,1,1,1,0,1
104,1,War in #Ukraine: #Russia committed wide range ...,1,1,1,0,1,1,1,1,...,1,1,1,3,3,3,0,1,1,1
