In [55]:
import pandas as pd
import os

papers = pd.read_csv(
    "./rawData/sampler_10ktexts_perdecade.ALL2.tsv",
    sep='\t',
    names=["timePeriod", "index", "rawText"]
)

papers.head()

Unnamed: 0,timePeriod,index,rawText
0,180X.POS.rand,1,The_DT hon_NN ._SENT
1,180X.POS.rand,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,180X.POS.rand,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,180X.POS.rand,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,180X.POS.rand,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [56]:
papers['timePeriod'] = papers['timePeriod'].map(lambda x: x.rstrip('X.POS.rand'))
papers['timePeriod'] = papers['timePeriod'].astype(str) + '0'
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800,1,The_DT hon_NN ._SENT
1,1800,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [70]:
papers['timePeriod'] = pd.to_datetime(papers['timePeriod'], format='%Y')
papers['timePeriod'] = pd.DatetimeIndex(papers['timePeriod']) #.year
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [78]:
reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1830-01-01"))]
reducedPapers.head()

Unnamed: 0,timePeriod,index,rawText,processedText
0,1800-01-01,1,The_DT hon_NN ._SENT,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...","But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...","And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...,In_IN former_JJ times_NNS and_CC in_IN former_...


In [72]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
reducedPapers['processedText'] = reducedPapers['rawText'].apply(lambda x: ' '.join([word for word in x.split()
    if word.lower() not in stop_words]))
reducedPapers.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,timePeriod,index,rawText,processedText
0,1800-01-01,1,The_DT hon_NN ._SENT,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...","But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...","And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...,In_IN former_JJ times_NNS and_CC in_IN former_...


In [73]:
from umap import UMAP
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer() #stop_words=stopwords if added stopwords

In [84]:
from bertopic import BERTopic
topic_model = BERTopic(language="english",
                       calculate_probabilities=True,
                       top_n_words=4,
                       #nr_topics=10,
                       min_topic_size=10,
                       n_gram_range=(1,1),
                       verbose=True,
                        vectorizer_model=vectorizer_model
                       )


In [None]:
topics = topic_model.fit_transform(papers['processedText'])
topic_model.get_topic_info()

Batches:   0%|          | 0/5048 [00:00<?, ?it/s]

In [18]:
topics_over_time = topic_model.topics_over_time(reducedPapers['processedText'],
                                                reducedPapers['timePeriod'],
                                                global_tuning=True,
                                              evolution_tuning=True,
                                                nr_bins=13)

NameError: name 'papers' is not defined

In [None]:
topics_over_time.head()

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=5)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, topics=[1, 2])

In [None]:
topic_model.visualize_topics()

In [None]:
similar_topics, similarity = topic_model.find_topics("motor", top_n=5)

In [None]:
topic_model.get_topic(similar_topics[0])