Step 1. Import libraries and download NLTK resources



In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import nltk
import re
from nltk.corpus import stopwords
from hdbscan import HDBSCAN
from umap import UMAP
import pymorphy2
from nltk import word_tokenize   

class LemmaTokenizer:
    """Callable tokenizer: splits a document into tokens and lemmatizes each one."""

    def __init__(self):
        # A single analyzer instance is stored on the object and reused by every call.
        self.wnl = pymorphy2.MorphAnalyzer()

    def __call__(self, doc):
        """Return the normal form of the first parse of every token in ``doc``."""
        lemmas = []
        for token in word_tokenize(doc):
            first_parse = self.wnl.parse(token)[0]
            lemmas.append(first_parse.normal_form)
        return lemmas

# Fetch the NLTK data this notebook needs: the stop-word lists and the Punkt
# tokenizer models used by word_tokenize.
# 'punkt_tab' is requested as well because recent NLTK releases load the
# tokenizer from that package instead of the pickled 'punkt' one.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Russian stop words; passed to CountVectorizer when topic words are recomputed.
stopWords = stopwords.words("russian")

Step 2. Load the dataset with headlines

In [None]:
# Load the prepared headlines table (df_result.csv).
df = pd.read_csv('PATH/df_result.csv')

# The commented-out code below is kept for reference only; it is not executed.
# def change_title(text):
#   return re.sub('[^А-яЁёA-z]', ' ', text.lower())
# df['title_new'] = df['title_new'].apply(change_title)
# def change_time(dt):
#     dt_new = dt[0:10]
#     return datetime.strptime(dt_new, '%Y-%m-%d')
# df['title_new'] = df['title_new'].fillna(df['page_title'])
# df = df.dropna()
# df['fetchdate_check'] = df['fetchdate_check'].apply(change_time)
# df['fetchdate_orig'] = df['fetchdate_orig'].apply(change_time)
# df.drop_duplicates(subset=['title_new', 'page_domain_root'],keep='first',inplace=True)
# df_result = df.reset_index().drop(columns = ['Unnamed: 0', 'index'],axis = 1)
# df_result.to_csv("cleaned_data.csv")
# df_result.to_csv("df_result.csv")

# Plain Python lists taken from two columns of the table.
timestamps = df["fetchdate_orig"].tolist()
titles = df["title_new"].tolist()

Step 3. Train the BERTopic model. Skip this step if a saved model already exists.

In [None]:
# Sentence-embedding model handed to BERTopic as its embedding backend.
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

# Dimensionality reduction; random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    metric='cosine',
    low_memory=False,
    random_state=17,
)

# Clustering model handed to BERTopic.
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    min_samples=3,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    diversity=0.3,
    top_n_words=30,
    nr_topics="auto",
    verbose=True,
)
topics, probs = topic_model.fit_transform(titles)

# Recompute the topic words with a lemmatizing 1- to 3-gram vectorizer.
vectorizer_model = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words=stopWords,
    ngram_range=(1, 3),
    min_df=15,
)
topic_model.update_topics(titles, topics, vectorizer_model=vectorizer_model)

# Persist the per-document topic ids, one per line, next to the saved model.
with open("topics.txt", "w") as topics_file:
    topics_file.writelines(str(topic_id) + "\n" for topic_id in topics)

topic_model.save("model")

Step 4. Load model and merge the topics

In [None]:
# Restore the saved BERTopic model.
topic_model = BERTopic.load("PATH/model")

# Read the per-document topic ids back: one integer per line.
with open("PATH/topics.txt", "r") as topics_file:
    topics = [int(row.strip()) for row in topics_file]

In [None]:
# Table with the list of topics of the loaded model; unite() merges it with
# the word scores and representative headlines on its "Topic" column.
df = topic_model.get_topic_info()
# Optional export of the raw table:
# df.to_csv("list_topics.csv")

In [None]:
"""Collect info about topic in united DataFrame"""

import numpy as np
def _clean_tuples(series: pd.Series) -> pd.Series:
    series[series.str[0] == ""] = np.nan
    return series

def get_topics(model=None) -> pd.DataFrame:
    """Get all topics with top words and their scores.

    Args:
        model: Object exposing ``get_topics()``. When omitted, the
            notebook-level ``topic_model`` is used.

    Returns:
        DataFrame with one row per key of ``model.get_topics()``: a "Topic"
        column followed by "word_score_<i>" columns. Entries whose first
        element is an empty string are replaced with NaN.
    """
    if model is None:
        # `topic_model` is the model trained or loaded in the earlier cells.
        model = topic_model
    scores = pd.DataFrame.from_dict(model.get_topics()).T.add_prefix("word_score_")
    scores = scores.reset_index().rename(columns={"index": "Topic"})
    tuple_cols = scores.filter(like="word_score_").columns
    scores[tuple_cols] = scores[tuple_cols].apply(_clean_tuples)
    return scores

def represent(model=None) -> pd.DataFrame:
    """Build a table of representative headlines per topic.

    Args:
        model: Object exposing ``get_representative_docs()``. When omitted,
            the notebook-level ``topic_model`` is used.

    Returns:
        DataFrame with a "Topic" column (topic id as a string) and a
        "Headlines" column holding the string form of that topic's documents
        with every character outside the regex whitelist removed.
    """
    if model is None:
        # `topic_model` is the model trained or loaded in the earlier cells.
        model = topic_model
    representative_docs = model.get_representative_docs()
    # NOTE(review): the `A-z` range also matches the ASCII characters between
    # "Z" and "a" (such as "[" and "]") — confirm this is intended.
    cleaned = {
        str(topic_id): re.sub('[^А-яЁёA-z0-9,.:"?! ]', '', str(docs))
        for topic_id, docs in representative_docs.items()
    }
    df_r = pd.DataFrame.from_dict(cleaned, orient='index').reset_index()
    return df_r.rename(columns={"index": "Topic", 0: "Headlines"})

def unite() -> pd.DataFrame:
    """Merge topic info, per-topic word scores and representative headlines.

    Reads the notebook-level ``df``, joins it with ``get_topics()`` and
    ``represent()`` on "Topic", writes the result to "info_topics.csv" and
    returns it, as the return annotation promises.
    """
    temp = pd.merge(df, get_topics(), on=['Topic'])
    # represent() keys its rows by the string form of the topic id, so the
    # join column is cast to str before the second merge.
    temp["Topic"] = temp["Topic"].astype(str)
    result_df = pd.merge(temp, represent(), on=['Topic'])
    result_df.to_csv("info_topics.csv")
    # result_df.to_json("info_topics.json")
    return result_df

unite()

In [None]:
# Get data with all headlines in topics
df = pd.DataFrame({'topic': topics, 'document': titles})
# Keep every non-negative topic id. Topic 0 is a regular topic (it is plotted
# further down), so filtering with `> 0` would silently drop its headlines;
# BERTopic uses -1 for outliers, and those are still excluded.
(
    df[df.topic >= 0]
    .reset_index()
    .sort_values(by="topic")[["topic", "document"]]
    .to_csv("all_headlines.csv")
)

Step 5. Visualise topics over time

In [None]:
# Keyword arguments make the call independent of the positional order of
# `topics` and `timestamps`, which is not the same in every BERTopic release.
topics_over_time = topic_model.topics_over_time(
    docs=titles,
    topics=topics,
    timestamps=timestamps,
    nr_bins=30,
)
# index=False: otherwise the row index is written as an extra column that
# comes back as "Unnamed: 0" when the file is read again.
topics_over_time.to_csv("topics_over_time.csv", index=False)

In [None]:
import pandas as pd

# Reload the saved topics-over-time table (topics_over_time.csv on Google Drive).
topics_over_time = pd.read_csv(filepath_or_buffer='PATH/topics_over_time.csv')

In [None]:
import datetime as dt

# After read_csv the "Timestamp" column holds strings, and the `.dt` accessor
# only exists on datetime-like columns, so the values are parsed first.
# Parsing is a no-op when the column already holds datetimes.
topics_over_time["Timestamp"] = (
    pd.to_datetime(topics_over_time["Timestamp"]).dt.to_period("D")
)


In [None]:
# Save the current topics-over-time table to time.csv.
topics_over_time.to_csv(path_or_buf="time.csv")

In [None]:
# Rows belonging to topic 0 or topic 1. A single row cannot have both values,
# so combining the two comparisons with `&` always selects nothing.
topics_over_time.loc[topics_over_time['Topic'].isin([0, 1])]

Unnamed: 0.1,Unnamed: 0,Topic,Words,Frequency,Timestamp


Narrative #14 Goals and terms of special operation

In [None]:
# Plot the topics grouped under narrative #14 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[94, 99, 145, 181, 193],
)

Narrative #13 Preparation for the war

In [None]:
# Plot the topics grouped under narrative #13 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[12, 33],
)

Narrative #3-4 Military actions

In [None]:
# Plot the topics grouped under narratives #3-4 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[0, 25, 31, 64, 80, 106, 113, 118, 144, 158, 164, 175, 225, 230],
)

Narrative #10 Negotiations

In [None]:
# Plot the topics grouped under narrative #10 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[14, 57, 61, 104, 110, 159, 207],
)

Narrative #5 Russia-defender 

In [None]:
# Plot the topics grouped under narrative #5 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[1, 90, 95, 96, 112, 117, 122, 161, 178, 183, 194, 213, 228],
)

Narrative #6 Inner enemies

In [None]:
# Plot the topics grouped under narrative #6 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[91, 136, 157, 160],
)

Narrative #11 Russian allies

In [None]:
# Plot the topics grouped under narrative #11 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[6, 13, 38, 85, 92, 100, 149, 174, 216],
)

Narrative #2 Nedogosudarstvo

In [None]:
# Plot the topics grouped under narrative #2 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[2, 60, 135, 166],
)

Narrative #1 Nazism

In [None]:
# Plot the topics grouped under narrative #1 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[69, 83, 124],
)

Narrative #8 Sanctions

In [None]:
# Plot the topics grouped under narrative #8 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[
        3, 5, 7, 8, 17, 22, 40, 46, 49, 65,
        82, 87, 93, 114, 134, 140, 141, 146, 154, 162,
        167, 169, 172, 173, 179, 184, 185, 195, 197, 199,
        200, 202, 210, 212, 214, 217, 220, 222, 229,
    ],
)


Narrative #7 Cold war

In [None]:
# Plot the topics grouped under narrative #7 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[
        19, 28, 30, 34, 37, 47, 55, 59, 63, 71,
        73, 105, 125, 139, 151, 163, 177, 190, 198, 205,
        211,
    ],
)

Narrative #12 The West is not united

In [None]:
# Plot the topics grouped under narrative #12 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[4, 10, 15, 21, 23, 27, 32, 39, 68, 107, 121, 132, 201, 231],
)

Narrative #9 Russian values

In [None]:
# Plot the topics grouped under narrative #9 over time.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=[
        36, 53, 67, 76, 84, 86, 120, 142, 143, 150,
        152, 155, 156, 168, 191, 196, 227,
    ],
)

All narratives

In [None]:
# All topic ids used in the narrative plots, concatenated in the original
# order. Each group is labelled with the narrative whose cell lists the same ids.
topics_rel = [
    # Narrative #1
    69, 83, 124,
    # Narrative #2
    2, 60, 135, 166,
    # Narrative #5
    1, 90, 95, 96, 112, 117, 122, 161, 178, 183, 194, 213, 228,
    # Narrative #6
    91, 136, 157, 160,
    # Narrative #7
    19, 28, 30, 34, 37, 47, 55, 59, 63, 71,
    73, 105, 125, 139, 151, 163, 177, 190, 198, 205,
    211,
    # Narrative #8
    3, 5, 7, 8, 17, 22, 40, 46, 49, 65,
    82, 87, 93, 114, 134, 140, 141, 146, 154, 162,
    167, 169, 172, 173, 179, 184, 185, 195, 197, 199,
    200, 202, 210, 212, 214, 217, 220, 222, 229,
    # Narrative #9
    36, 53, 67, 76, 84, 86, 120, 142, 143, 150,
    152, 155, 156, 168, 191, 196, 227,
    # Narrative #10
    14, 57, 61, 104, 110, 159, 207,
    # Narrative #11
    6, 13, 38, 85, 92, 100, 149, 174, 216,
    # Narrative #12
    4, 10, 15, 21, 23, 27, 32, 39, 68, 107, 121, 132, 201, 231,
    # Narrative #13
    12, 33,
    # Narrative #14
    94, 99, 145, 181, 193,
    # NOTE(review): these three ids are not listed in any per-narrative cell
    # of this notebook — confirm which narrative they belong to.
    11, 41, 180,
    # Narratives #3-4
    0, 25, 31, 64, 80, 106, 113, 118, 144, 158, 164, 175, 225, 230,
]

In [None]:
# Plot every topic collected in `topics_rel` over time on one figure.
topic_model.visualize_topics_over_time(
    topics_over_time,
    topics=topics_rel,
    height=700,
)

Mapping

In [None]:
# Render the document map for the selected topics and save it as a standalone HTML file.
# NOTE(review): `topics_map` is not defined anywhere in the visible notebook —
# confirm which topic list is intended here before running this cell.
topic_model.visualize_documents(titles, topics=topics_map, hide_document_hover=False, hide_annotations=False, width=2000).write_html("doc_viz.html")


In [None]:
# Save the current topics-over-time table to topics_time.csv.
topics_over_time.to_csv(path_or_buf="topics_time.csv")