In [1]:
import pandas as pd

reducedPapers = pd.read_csv(
    "./processedData/processedData.csv",
    sep=','
)

reducedPapers.tail()

Unnamed: 0.1,Unnamed: 0,timePeriod,index,rawText,processedText
21802,21802,1820-01-01,9981,The_DT governor-general_NN of_IN India_NP had_...,india applied individual punishment prescribed...
21803,21803,1820-01-01,9982,We_PP have_VBP disdained_VBN to_TO run_VB a_DT...,disdained run race popularity nation order sec...
21804,21804,1820-01-01,9983,Mr._NP Philips_NP thought_VBD it_PP quite_RB u...,philip thought quite unworthy house time occup...
21805,21805,1820-01-01,9984,The_DT chief_JJ justice_NN sent_VBN into_IN th...,chief justice sent court common plea ask opini...
21806,21806,1820-01-01,9985,_'' <lb/> Lord_NP Bacon_NP adds_VBZ this_DT br...,lord bacon add brief sentence pregnant truth o...


In [2]:
from umap import UMAP
umap_model = UMAP(n_neighbors=30,
                  n_components=10,
                  min_dist=0.1,
                  metric='cosine',
                  random_state=100)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer() #stop_words=stopwords if added stopwords

In [4]:
from bertopic import BERTopic

topic_model = BERTopic(language="english",
                       calculate_probabilities=True,
                       top_n_words=4,
                       nr_topics=50,
                       min_topic_size=50,
                       n_gram_range=(1,2),
                       verbose=True
                       )


In [5]:
import pickle
reducedPapers.loc[:, 'processedText'] = reducedPapers['processedText'].astype("str")

topics = topic_model.fit_transform(reducedPapers['processedText'])
with open('./processedData/topics.pickle', 'wb') as f:
    pickle.dump(topics, f)
topic_model.get_topic_info()

Batches:   0%|          | 0/682 [00:00<?, ?it/s]

2023-04-09 22:14:50,279 - BERTopic - Transformed documents to Embeddings
2023-04-09 22:15:21,028 - BERTopic - Reduced dimensionality
2023-04-09 22:15:26,336 - BERTopic - Clustered reduced embeddings
2023-04-09 22:15:29,498 - BERTopic - Reduced number of topics from 6 to 6


Unnamed: 0,Topic,Count,Name
0,-1,195,-1_td_tr td_tr_td td
1,0,487,0_nan_nan nan_esq nan_simeon
2,1,349,1_gent gent_gent_gen_gen gent
3,2,200,2_right right_right_indeed right_right even
4,3,63,3_li_li li_including_le
5,4,20513,4_would_house_could_upon


In [6]:
import pickle
with open('./processedData/topics.pickle', 'rb') as f:
    topics = pickle.load(f)

In [52]:
topics_over_time = topic_model.topics_over_time(reducedPapers['processedText'],
                                                reducedPapers['timePeriod'],
                                                global_tuning=True,
                                              evolution_tuning=True,
                                                nr_bins=13)
with open('./processedData/topics_over_time.pickle', 'wb') as f:
    pickle.dump(topics_over_time, f)

13it [05:52, 27.13s/it]


In [53]:
import pickle
with open('./processedData/topics_over_time.pickle', 'rb') as f:
    topics_over_time = pickle.load(f)
topics_over_time.head()


Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"would, house, lord, could",7592,1799-10-19 22:50:52.800
1,0,"gentleman, chancellor, chancellor exchequer, e...",989,1799-10-19 22:50:52.800
2,1,"hon, right, would, right hon",824,1799-10-19 22:50:52.800
3,2,"army, officer, war, military",839,1799-10-19 22:50:52.800
4,3,"parliament, committee, amendment, government",399,1799-10-19 22:50:52.800


In [65]:
#topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5)
topic_model.visualize_topics_over_time(topics_over_time)


In [55]:
topic_model.visualize_topics_over_time(topics_over_time, topics=[1, 2])

In [56]:
topic_model.visualize_topics()

In [57]:
similarWords = (
    "Aristocracy "
    "birthright "
    "crest "
    "crown "
    "highness "
    "king "
    "monarch "
    "palace "
    "prince "
    "princess "
    "queen "
    "royal "
    "royalty "
    "majesty "
    "emperor "
)

In [61]:
similar_topics, similarity = topic_model.find_topics(similarWords, top_n=50)

In [64]:
topic_model.get_topic(similar_topics[0])

[('mr', 0.0352285178976172),
 ('gentleman', 0.03051244100470032),
 ('mr mr', 0.020836362344775143),
 ('noble', 0.018259203953243854)]

In [60]:
import hdbscan
import matplotlib.pyplot as plt

result = pd.DataFrame(topics, columns=['x', 'y'])

cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',
                          cluster_selection_method='eom'
                          ).fit(topics)

result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()

ValueError: 2 columns passed, passed data had 161529 columns