In [27]:
import pandas as pd

reducedPapers = pd.read_csv(
    "../processedData/processedData.csv",
    sep=','
)
reducedPapers.dropna(subset = ['processedText'], inplace = True, how='any')
reducedPapers.reset_index(drop=True, inplace=True)
reducedPapers.head()

Unnamed: 0.1,Unnamed: 0,timePeriod,index,rawText,processedText
0,1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,gallant general commanded well knew soon arrive
1,2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...",said doubted necessary insert clause bill purp...
2,3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...",informs romish bishop appointed pope pastoral ...
3,4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...,former time former war invasion often threaten...
4,5,1800-01-01,6,"He_PP supported_VBD the_DT motion_NN ,_, becau...",supported motion harm much good


In [28]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# we add this to remove stopwords that can pollute topcs
vectorizer_model = CountVectorizer(ngram_range=(1, 2))


In [30]:
from bertopic import BERTopic

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

In [31]:
import pickle
reducedPapers.loc[:, 'processedText'] = reducedPapers['processedText'].astype("str")
topics, probs = model.fit_transform(reducedPapers['processedText'])
with open('../processedData/topics.pickle', 'wb') as f:
    pickle.dump(topics, f)

Batches:   0%|          | 0/644 [00:00<?, ?it/s]

2023-04-09 22:48:59,786 - BERTopic - Transformed documents to Embeddings
2023-04-09 22:49:07,312 - BERTopic - Reduced dimensionality
2023-04-09 22:49:10,118 - BERTopic - Clustered reduced embeddings


In [61]:
hierarchical_topics = model.hierarchical_topics(reducedPapers['processedText'])

100%|██████████| 36/36 [00:00<00:00, 67.87it/s]


In [62]:
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─year_country_tax_trade_price
│    ├─■──amount_million_amounted_beer_including ── Topic: 20
│    └─year_country_tax_trade_price
│         ├─country_tax_trade_year_price
│         │    ├─tax_country_trade_year_debt
│         │    │    ├─■──corn_tax_price_irish_grain ── Topic: 15
│         │    │    └─debt_trade_country_sum_year
│         │    │         ├─■──debt_sum_money_year_tax ── Topic: 5
│         │    │         └─■──trade_country_commerce_colony_state ── Topic: 12
│         │    └─■──circulation_currency_note_paper_gold ── Topic: 31
│         └─■──year_estimate_last year_loan_last ── Topic: 28
└─noble_country_lord_bill_said
     ├─noble_bill_country_lord_said
     │    ├─country_bill_subject_lord_said
     │    │    ├─■──danger_country_security_dangerous_safety ── Topic: 19
     │    │    └─country_bill_subject_lord_said
     │    │         ├─country_lord_subject_noble_said
     │    │         │    ├─■──punishment_offence_libel_law_crime ── Topic: 8
     │    │         │    └─c

In [63]:
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [32]:
import pickle
with open('../processedData/topics.pickle', 'wb') as f:
    pickle.dump(topics, f)

In [33]:
model.visualize_barchart()

In [37]:
topics_over_time = model.topics_over_time(reducedPapers['processedText'],
                                                reducedPapers['timePeriod'],
                                                global_tuning=True,
                                              evolution_tuning=True,
                                                nr_bins=13)

with open('../processedData/topics_over_time.pickle', 'wb') as f:
    pickle.dump(topics_over_time, f)

3it [00:06,  2.14s/it]


In [39]:
#model.visualize_topics_over_time(topics_over_time, top_n_topics=5)
model.visualize_topics_over_time(topics_over_time)


In [41]:
#model.visualize_topics_over_time(topics_over_time, top_n_topics=5)
model.visualize_topics_over_time(topics_over_time)
#model.visualize_topics_over_time(topics_over_time, topics=[1, 2])


In [42]:
model.visualize_topics()

In [51]:
similarWords = (
    "king "
    "prince "
    "princess "
    "queen "
    "royal "
    "royalty "
    "majesty "
)

In [52]:
similar_topics, similarity = model.find_topics(similarWords)

In [53]:
model.get_topic(similar_topics[0])

[('royal highness', 0.10519296345513954),
 ('highness', 0.10307677896418915),
 ('room', 0.10099692842928719),
 ('royal', 0.09922067993987108),
 ('slept', 0.026937279978495238)]

ValueError: 2

In [47]:
import hdbscan
import matplotlib.pyplot as plt

result = pd.DataFrame(topics, columns=['x', 'y'])

cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',
                          cluster_selection_method='eom'
                          ).fit(topics)

result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()

ValueError: Shape of passed values is (20602, 1), indices imply (20602, 2)