# BERTopic news processor

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, timedelta, datetime

# Default figure geometry and background colour for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': [15, 7.5],
    'figure.facecolor': 'white',
})
# Optional dark theme, currently disabled: plt.style.use('dark_background')

def comma_strip(x):
    """Return ``x`` with every trailing comma removed."""
    return x.rstrip(",")

import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

# Single Mystem lemmatizer instance, created once and reused by preprocess_text.
mystem = Mystem()
# Russian stop words from NLTK, held in a frozenset so that each per-token
# membership test is a hash lookup instead of a scan through a list.
russian_stopwords = frozenset(stopwords.words("russian"))

def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words, whitespace and punctuation.

    The text is lower-cased and lemmatized with the module-level ``mystem``
    instance. A token is discarded when it is listed in
    ``russian_stopwords`` or when, after stripping surrounding whitespace,
    it is empty or consists solely of ASCII punctuation and the quotation
    marks « » “ ”.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Characters that never carry meaning on their own.
    noise_chars = set(punctuation) | {"«", "»", "“", "”"}

    def is_noise(token):
        # An empty stripped token covers " ", "\n" and similar separators.
        # Every character is checked because ``stripped in punctuation`` is a
        # substring test on a str: it lets runs such as "..." or "?!" through.
        stripped = token.strip()
        return not stripped or all(ch in noise_chars for ch in stripped)

    lemmas = mystem.lemmatize(text.lower())
    kept = [lemma for lemma in lemmas
            if lemma not in russian_stopwords and not is_noise(lemma)]
    return " ".join(kept)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dormant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the cached lemmatized headlines; build the cache from the raw scrape
# only when the cache file is missing.
lemm_news_path = '../DATA-MINING/SCRAPED-DATA/LEMM-NEWS-2018-1-1-2022-9-1.csv'
raw_news_path = '../DATA-MINING/SCRAPED-DATA/NEWS-2018-1-1-2022-9-1.csv'

try:
    news = pd.read_csv(lemm_news_path,
                       sep='\t',
                       on_bad_lines='skip')
except FileNotFoundError:
    # Only a missing cache triggers the slow lemmatization pass. A bare
    # ``except:`` would also start it on KeyboardInterrupt or on an unrelated
    # parsing error, hiding the real problem.
    news = pd.read_csv(raw_news_path,
                       sep='\t',
                       on_bad_lines='skip')
    news['title'] = news['title'].apply(preprocess_text)
    news.to_csv(lemm_news_path,
                sep='\t',
                index=False)

    # validate working file
    news = pd.read_csv(lemm_news_path,
                       sep='\t',
                       on_bad_lines='skip')

In [3]:
# Report how many headlines were loaded, then preview the first rows.
headline_count = len(news)
print(f'Number of news headers: {headline_count}')
news.head()

Number of news headers: 411713


Unnamed: 0,date,title
0,2018-01-01,потанин оставаться вода
1,2018-01-01,автор хит 1 1 устраивать праздничный переполох
2,2018-01-01,северный корея принимать участие олимпиада южный
3,2018-01-01,зимний сказка петербург продлиться недолго
4,2018-01-01,алкогольный энергетик показывать красный свет


In [11]:
def datify(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``."""
    return datetime.strptime(x, '%Y-%m-%d')


def textify(x):
    """Format a date or datetime as a ``YYYY-MM-DD`` string."""
    return x.strftime('%Y-%m-%d')

# Span of the corpus. The 'date' values are YYYY-MM-DD strings, which sort
# chronologically, so the smallest and largest strings are the first and
# last day.
date_strings = news['date']
start_d = datify(min(date_strings))
end_d = datify(max(date_strings))
delta_d = end_d - start_d
num_days = delta_d.days

print(f"From {textify(start_d)} to {textify(end_d)}")
print(f"Number of days: {num_days}")

# One bin per 30 days; the fractional remainder is dropped.
num_bins = int(num_days/30)
print(f"{num_bins} is going to be the number of monthly bins.")

From 2018-01-01 to 2022-09-01
Number of days: 1704
56 is going to be the number of monthly bins.


In [12]:
# Preview the last rows of the headline table.
news.tail()

Unnamed: 0,date,title
411708,2022-09-01,петербуржец напоминать изменяться общественный...
411709,2022-09-01,обгорать трешка проспект наука вытаскивать пос...
411710,2022-09-01,называть креативный двор санкт-петербург котор...
411711,2022-09-01,первый сентябрь петербург открываться 7 социал...
411712,2022-09-01,гороскоп весь знак зодиак 1 сентябрь 2022 год ...


In [39]:
from bertopic import BERTopic
from dateutil.relativedelta import relativedelta

# NOTE(review): only the first ``num_bins*20`` rows are passed on, so the
# model is fitted on a leading slice of the corpus, not on every headline.
# Confirm this is intended.
timestamps = news['date'].to_list()[:num_bins*20]
titles = news['title'].to_list()[:num_bins*20]

# Reuse the cached topic table when it exists; otherwise fit BERTopic, build
# the per-topic frequency table and cache it.
try:
    topic_df = pd.read_csv('./PROCESSED-DATA/TOPICS.csv', sep=',', on_bad_lines='skip')
except FileNotFoundError:
    # Only a missing cache triggers a refit. A bare ``except:`` would also
    # start the fit on KeyboardInterrupt or on an unrelated error.
    topic_model = BERTopic(embedding_model='distiluse-base-multilingual-cased-v1',
                           verbose=True)
    topics, probs = topic_model.fit_transform(titles)
    topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                     topic_prefix=True,
                                                     word_length=10,
                                                     separator="_")

    topics_over_time = topic_model.topics_over_time(titles, timestamps, nr_bins=num_bins)
    topic_df = pd.DataFrame(topics_over_time)
    topic_df.rename(columns={"Timestamp": "Date"}, inplace=True)
    # fillna returns a new frame; the result has to be assigned to take effect.
    topic_df = topic_df.fillna(0.0)

    # Month-start grid from the first month up to one month before the end.
    new_index = pd.date_range(start=textify(start_d),
                              end=textify(end_d + relativedelta(months=-1)),
                              freq='MS')

    # One row per bin timestamp, one column per topic id.
    topic_df = topic_df.pivot(index="Date",
                              columns="Topic",
                              values="Frequency")

    # Interpolate on the union of both indexes, then keep the monthly grid only.
    topic_df = topic_df.reindex(topic_df.index.union(new_index)).interpolate(method='time')
    topic_df = topic_df.reindex(new_index)
    # Plain assignment replaces set_axis(..., inplace=True); the ``inplace``
    # argument of set_axis was removed in pandas 2.0.
    topic_df.columns = topic_labels
    # Whatever interpolation could not fill becomes a frequency of 0.
    topic_df = topic_df.fillna(0.0)

    # NOTE(review): this writes the pivoted (wide) table, while the cells
    # below index topic_df['Date'] / topic_df['Topic']. Confirm which schema
    # TOPICS.csv is meant to have.
    topic_df.to_csv('./PROCESSED-DATA/TOPICS.csv', sep=',')

    # validate working file
    topic_df = pd.read_csv('./PROCESSED-DATA/TOPICS.csv', sep=',', on_bad_lines='skip')

Batches: 100%|███████████████████████████████████████████████████████████| 35/35 [00:15<00:00,  2.31it/s]
2023-04-02 15:29:07,411 - BERTopic - Transformed documents to Embeddings
2023-04-02 15:29:11,185 - BERTopic - Reduced dimensionality
2023-04-02 15:29:11,243 - BERTopic - Clustered reduced embeddings
10it [00:00, 27.73it/s]


In [19]:
# Quick summary of the topic table: size, topic ids and covered period.
date_column = topic_df['Date']
topic_column = topic_df['Topic']

start_date = date_column.min()
end_date = date_column.max()
num_topics = topic_column.nunique()

print(f"Number of topics: {num_topics}")
print(f"Number of entries: {len(topic_df)}")
print(f"Max topic number: {topic_column.max()}")
print(f"Min timestamp: {start_date}")
print(f"Max timestamp: {end_date}")

Number of topics: 11
Number of entries: 65
Max topic number: 9
Min timestamp: 2017-12-31 23:52:48.000000000
Max timestamp: 2018-01-05 21:51:25.714285824


In [None]:
def deltify(x):
    """Return the whole-day offset of ``x`` from ``start_date``, minus one.

    ``x`` may be anything ``pd.Timestamp`` accepts. The previous
    ``datify``-based version only parsed ``YYYY-MM-DD`` strings, but the
    'Date' column holds full timestamps such as
    '2017-12-31 23:52:48.000000000', and this helper is also called with
    ``datetime.date`` objects when the month ticks are built.
    ``start_date`` is converted the same way, so it may be a string or a
    datetime.
    """
    return (pd.Timestamp(x) - pd.Timestamp(start_date)).days - 1


# Replace the timestamps by integer day offsets.
# NOTE: not idempotent. Running this cell twice converts the offsets again.
topic_df['Date'] = topic_df['Date'].apply(deltify)

In [None]:
# Preview the topic table after the 'Date' column was converted to day offsets.
topic_df.head()

In [None]:
# First rows that belong to topic 0.
is_topic_zero = topic_df['Topic'] == 0
topic_df[is_topic_zero].head()

In [None]:
topic_ts = []          # time series: one DataFrame per topic id
topic_names_list = []  # name list, aligned with topic_ts

# Keep at most 700 topics, but never iterate past the largest topic id that
# is actually present: filtering on a missing id yields no rows, and taking
# the first name of an empty selection raises IndexError.
max_topic_id = topic_df['Topic'].max()
num_topics = 0 if pd.isna(max_topic_id) else min(700, int(max_topic_id) + 1)

for i in range(num_topics):
    ans = topic_df[topic_df['Topic'] == i]
    topic_ts.append(ans)
    # Every row of one topic carries the same name; take the first one.
    topic_names_list.append(ans['Name'].iloc[0])

In [None]:
# First day of every month of 2020, converted to day offsets for the x axis.
m_dates = [date(2020, month, 1) for month in range(1, 13)]

month_ticks = [deltify(item) for item in m_dates]
month_names = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
print(month_ticks)
print(month_names)

In [None]:
# Overlay the frequency curve of every topic on one figure.
for num in range(num_topics):
    plt.plot(topic_ts[num]['Date'], topic_ts[num]['Frequency'])

# The axis decoration does not depend on the topic, so it is applied once
# after the loop instead of once per curve.
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 299)
plt.ylabel("Frequency")
plt.xlabel("Date");

In [None]:
# Every integer day offset from 0 up to and including the largest one.
last_day = topic_df['Date'].max()
date_list = list(range(last_day + 1))

In [None]:
num = 0  # topic whose series is used for the comparison

from scipy.interpolate import interp1d

x_new = np.array(date_list)

x = topic_ts[num]['Date']
y = topic_ts[num]['Frequency']

# x_new spans every day offset of the whole table and can reach outside the
# range sampled for this single topic. With the default bounds_error=True
# interp1d raises ValueError there; bounds_error=False yields NaN instead
# (left blank by matplotlib) and matches the call used for the final
# resampling below.
f_lin = interp1d(x, y, bounds_error=False)
f_qua = interp1d(x, y, kind='quadratic', bounds_error=False)
f_cub = interp1d(x, y, kind='cubic', bounds_error=False)

# Raw samples as markers, the three interpolants as differently dashed lines.
plt.plot(x, y, 'o',
         x_new, f_lin(x_new), '-',
         x_new, f_qua(x_new), '--',
         x_new, f_cub(x_new), '-.')
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 299)
plt.ylabel("Frequency")
plt.xlabel("Date")
plt.legend(['data', 'linear', 'quadratic', 'cubic'], loc='best')

### Quadratic spline interpolation is used from here on

In [None]:
# Resample the first `final_topics` topics onto the daily grid x_new with a
# quadratic interpolant. bounds_error=False means days outside a topic's
# sampled range do not raise.
num_days = len(x_new)
final_topics = 50
topic_time_series = np.zeros((final_topics, num_days))
topic_names = []

for i in range(final_topics):
    series = topic_ts[i]
    f_qua = interp1d(series['Date'], series['Frequency'],
                     kind='quadratic', bounds_error=False)
    # One row of the output matrix per topic.
    topic_time_series[i] = f_qua(x_new)

In [None]:
print(f"Shape of array before: {topic_time_series.shape}")

# Keep the first 274 day columns only (plain slicing instead of np.delete
# with an open-ended slice; .copy() keeps the result independent of the
# source arrays, as np.delete did).
# NOTE: this rebinds `topic_ts` from the list of per-topic frames to a 2-D
# array, so the cells above cannot be re-run after this one without
# rebuilding the list first.
topic_ts = topic_time_series[:, :274].copy()
days_ts = x_new[:274].copy()

print(f"Shape of array after: {topic_ts.shape}")

for i in range(topic_ts.shape[0]):
    plt.plot(days_ts, topic_ts[i])

# The axis decoration is the same for every curve: apply it once.
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 279)
plt.ylabel("Frequency")
plt.xlabel("Date");

### Save dataset

In [None]:
# Persist the trimmed series together with the topic labels in one archive.
topic_names = np.array(topic_names_list)
arrays_to_save = {
    'topic_ts': topic_ts,
    'topic_names': topic_names,
}
np.savez('./sber_work_files/news_lemm_w_tp_names.npz', **arrays_to_save)

### Load dataset

In [None]:
# SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code for a crafted file. Only load archives written by
# this notebook.
# The context manager closes the archive once both arrays are in memory.
with np.load('./sber_work_files/news_lemm_w_tp_names.npz', allow_pickle=True) as npzfile:
    topics, names = npzfile['topic_ts'], npzfile['topic_names']
print(f"Shape of dataset: {topics.shape}")

# Plot at most the first 20 topics; the bound keeps the loop inside archives
# that hold fewer rows or fewer names.
for i in range(min(20, topics.shape[0], len(names))):
    plt.plot(topics[i], label=names[i])

# The axis decoration is the same for every curve: apply it once.
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 278)
plt.ylabel("Frequency")
plt.xlabel("Date")

plt.legend(prop={'size': 8})

In [None]:
# List the first 50 topic labels, one per line.
first_labels = names[:50]
for topic_label in first_labels:
    print(topic_label)

In [None]:
# Manually sorted existing 50 topics
sorted_topic_names_dict = {}
with open("./sber_work_files/topics.txt", "r") as f:
    text = f.read().split('\n\n')
    for item in text:
        lines = item.split('\n')
        sorted_topic_names_dict.update({lines[0][:-1]: list(lines[1:])})

sorted_topic_names_dict['Правительство'] = sorted_topic_names_dict['Правительство'][:-1]
        
print(sorted_topic_names_dict.keys())
print(sorted_topic_names_dict['Путешествия'])

In [None]:
# save month_ticks, month_names, fin_top_ts, fin_top_names, sorted_topic_names_dict
fin_top_names = list(sorted_topic_names_dict.keys())
fin_top_ts = np.zeros((len(fin_top_names), topics.shape[1]))

for num, item in enumerate(sorted_topic_names_dict):
    tops = [int(i.split("_")[0]) for i in list(sorted_topic_names_dict[item])]
    for microtop in tops:
        fin_top_ts[num] += topics[microtop]

In [None]:
# One labelled curve per aggregated topic group.
for i in range(fin_top_ts.shape[0]):
    plt.plot(fin_top_ts[i], label=fin_top_names[i])

# The axis decoration is the same for every curve: apply it once after the
# loop instead of once per iteration.
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 278)
plt.ylabel("Frequency")
plt.xlabel("Date")

plt.legend(prop={'size': 8})

### Save final dataset

In [None]:
# Bundle the tick positions, tick labels, aggregated series, group names and
# the group -> member-label mapping into a single archive.
final_payload = {
    'month_ticks': month_ticks,
    'month_names': month_names,
    'fin_top_ts': fin_top_ts,
    'fin_top_names': fin_top_names,
    'sorted_topic_names_dict': sorted_topic_names_dict,
}
np.savez('./sber_work_files/news4work.npz', **final_payload)

### Check the saved dataset

In [None]:
# SECURITY: allow_pickle=True is needed because the archive stores a dict as
# a pickled object array; unpickling can execute arbitrary code, so only
# load archives written by this notebook.
# The context manager closes the archive once everything is in memory.
with np.load('./sber_work_files/news4work.npz', allow_pickle=True) as npzfile:
    month_ticks = npzfile['month_ticks']
    month_names = npzfile['month_names']
    fin_top_ts = npzfile['fin_top_ts']
    fin_top_names = npzfile['fin_top_names']
    # The dict comes back wrapped in a 0-d object array; .item() unwraps it
    # so the name is bound to a real dict again.
    sorted_topic_names_dict = npzfile['sorted_topic_names_dict'].item()

In [None]:
# One labelled curve per aggregated topic group, drawn from the reloaded data.
for i in range(fin_top_ts.shape[0]):
    plt.plot(fin_top_ts[i], label=fin_top_names[i])

# The axis decoration is the same for every curve: apply it once after the
# loop instead of once per iteration.
plt.xticks(month_ticks, labels=month_names)
plt.xlim(-5, 278)
plt.ylabel("Frequency")
plt.xlabel("Date")

plt.legend(prop={'size': 8})