Install BERTopic. If you see an error message about dependency conflicts, restart the runtime and run the cell again; the message should then disappear.

In [None]:
!pip install bertopic

In [None]:
!pip install unidecode

In [None]:
import re
import string
from unidecode import unidecode
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import re
from datetime import timedelta

import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd

import matplotlib.dates as mdates

In [None]:
# Load the arXiv records. The file is decoded as ISO-8859-1 here; the cleaning
# helpers below re-encode text as latin1 and decode it as UTF-8 to undo that.
df_data = pd.read_csv(
    filepath_or_buffer='arxiv_with_subtopics_complete.csv',
    encoding="ISO-8859-1",
)

Fixes for some character encodings

In [None]:
from unidecode import unidecode
def fix_encoding(text):
    """Repair UTF-8 text that was decoded as Latin-1, then transliterate to ASCII.

    The CSV is read with the ISO-8859-1 codec, so multi-byte UTF-8 characters
    show up as mojibake. Re-encoding as latin1 and decoding as UTF-8 reverses
    that. When the round trip is not possible (the text was not mojibake to
    begin with, or contains characters outside Latin-1) the text is kept as is
    instead of raising.

    Args:
        text: The value to repair. Non-string values (e.g. NaN from an empty
            CSV cell) are converted with ``str()`` first.

    Returns:
        The repaired text passed through ``unidecode``.
    """
    if not isinstance(text, str):
        text = str(text)
    try:
        text = text.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError: the string is
        # not a Latin-1 view of UTF-8 bytes, so leave it untouched.
        pass
    return unidecode(text)


In [None]:
# The loaded frame is named `df_data`; `df` does not exist at this point on a
# fresh kernel, so read the subtopics column from `df_data`.
fixed_encoding_subtopics = [fix_encoding(subtopic) for subtopic in df_data['subtopics']]
fixed_encoding_subtopics[:10]

In [None]:

# Every ASCII punctuation character except the hyphen is stripped from the text.
exclude = set(string.punctuation) - {'-'}


def clean_text(subtopics):
    """Normalise a raw title/subtopic string for embedding.

    Steps, in order:
      1. Undo Latin-1/UTF-8 mojibake (text is kept unchanged if the round trip
         is not possible, instead of raising ``UnicodeError``).
      2. Transliterate to ASCII with ``unidecode``.
      3. Drop all punctuation except hyphens. Backslashes and double quotes are
         punctuation, so no separate escaping pass is needed for them.
      4. Collapse runs of whitespace to a single space and trim the ends.
      5. Lowercase.

    Args:
        subtopics: The string to clean.

    Returns:
        The cleaned, lowercase string.
    """
    try:
        cleaned = subtopics.encode('latin1').decode('utf-8')
    except UnicodeError:
        # Not a Latin-1 view of UTF-8 bytes; use the text as given.
        cleaned = subtopics
    cleaned = unidecode(cleaned)

    # Remove special characters except hyphen
    cleaned = ''.join(ch for ch in cleaned if ch not in exclude)

    # Remove extra spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Convert to lowercase
    return cleaned.lower()


# Clean the subtopics for each abstract.
# The loaded frame is named `df_data`; `df` is not defined at this point on a
# fresh kernel. str() turns missing values into text so clean_text can handle them.
cleaned_subtopics = [clean_text(str(subtopics)) for subtopics in df_data['subtopics']]


In [None]:
# Preview the first ten cleaned subtopic strings.
cleaned_subtopics[:10]

Clean and add titles

In [None]:
# Remove stop words
# NOTE(review): `stopwords` is not imported in any visible cell, so this line
# raises NameError on a fresh kernel. It presumably refers to
# `nltk.corpus.stopwords` (which also needs the 'stopwords' corpus downloaded)
# — confirm and add the import to the imports cell.
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return `text` with every space-separated token found in `stop_words` removed.

    Tokens are produced by splitting on single spaces and are re-joined with
    single spaces, so the relative order of the kept tokens is unchanged.
    """
    kept_tokens = []
    for token in text.split(' '):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


In [None]:
# Clean each title, then drop stop words from the cleaned text.
# The loaded frame is named `df_data`; `df` is not defined at this point on a
# fresh kernel.
cleaned_titles = [remove_stop_words(clean_text(str(title))) for title in df_data['title']]
cleaned_titles[:10]

In [None]:
# One document per paper: cleaned title followed by its cleaned subtopics.
# Pair the two lists directly instead of indexing by `len(df)`; `df` is not
# defined at this point on a fresh kernel.
titles_and_topics = [
    title + " " + subtopics
    for title, subtopics in zip(cleaned_titles, cleaned_subtopics)
]
titles_and_topics[:10]

Clustering

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from umap import UMAP

# Sentence encoder used to embed each "title + subtopics" document.
# (An alternative that was tried: SentenceTransformer("allenai-specter").)
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Dimensionality reduction applied to the embeddings; random_state is fixed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=120,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# NOTE(review): `min_topic_size=125` differs from HDBSCAN's `min_cluster_size=120`.
# BERTopic presumably ignores `min_topic_size` when a custom `hdbscan_model` is
# supplied — confirm against the BERTopic documentation and drop one of the two.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=125,
    verbose=True,
)

# Fit on all documents; `topics` holds one topic id per document.
topics, probs = topic_model.fit_transform(titles_and_topics)
len(topic_model.get_topic_info())

In [None]:
# Show the first 50 rows of the fitted model's topic summary table.
topic_model.get_topic_info().head(50)

In [None]:
# Display the model's bar-chart visualization, limited to 20 topics, height 700.
topic_model.visualize_barchart(top_n_topics=20, height=700)


In [None]:
# Keep the topic summary table as a DataFrame; later cells read its
# `Topic`, `Count` and `Name` columns.
df_topics = topic_model.get_topic_info()

In [None]:
# Show up to the first 100 rows of the topic summary table.
df_topics.head(100)

In [None]:
# Set the font size before drawing so it is applied to this figure regardless
# of when its text artists are created. The setting is global and also affects
# later figures.
plt.rcParams.update({'font.size': 6})

# Sort into a new frame instead of in place, so re-running this cell does not
# depend on (or change) the row order of `df_topics`.
topics_by_count = df_topics.sort_values("Count")

# After the ascending sort the last row is the topic with the most documents;
# it is left out of the chart.
# NOTE(review): presumably this is meant to drop BERTopic's outlier topic (-1)
# — confirm; filtering on `Topic != -1` would state that intent directly.
plt.barh(topics_by_count.Name.iloc[:-1], topics_by_count.Count.iloc[:-1]);

In [None]:
# Order the topic table by topic id (ascending).
df_topics = df_topics.sort_values("Topic")

In [None]:
# Derive the covered date span from the data itself rather than assuming the
# rows are ordered newest-first (first row = latest, last row = earliest).
# Only the first 10 characters (YYYY-MM-DD) of each date string are used;
# zero-padded ISO dates sort chronologically as plain strings.
paper_days = df_data['date'].str[:10]
start_date = datetime.fromisoformat(paper_days.min())
end_date = datetime.fromisoformat(paper_days.max())

# Whole days between the earliest and the latest paper.
num_days = (end_date - start_date).days

def date_to_index(date):
    """Return the number of whole days between the global `start_date` and `date`."""
    return (date - start_date).days


In [None]:
# Daily paper counts: one row per day, one column per row of `df_topics`.
count_by_topics = np.zeros((num_days, len(df_topics)), dtype=int)

# `topics` holds one topic id per row of `df_data`, in the same order.
for date_string, topic_id in zip(df_data['date'], topics):
    current_date = datetime.fromisoformat(date_string[:10])

    # Row i holds the papers dated (start_date + i + 1 days). Papers dated
    # exactly start_date would get index -1, which NumPy silently wraps to the
    # LAST row; clamp them into row 0 so they are counted at the beginning of
    # the series (the cumulative totals computed afterwards are then correct).
    index_date = max(date_to_index(current_date) - 1, 0)

    # Topic ids are shifted by one so that id -1 maps to column 0.
    # NOTE(review): assumes topic ids run contiguously from -1 — confirm.
    count_by_topics[index_date, topic_id + 1] += 1
 


In [None]:
# Running total down the day axis: row i holds, per topic column, the number
# of papers counted in rows 0..i.
cumsum_count_by_topics = count_by_topics.cumsum(axis=0)

In [None]:
# NOTE(review): `myDates` is reassigned by the plotting cell below and `numDates`
# is not referenced in any visible cell — this cell looks like a leftover; confirm.
# Builds `num_days` copies of the same date and converts them to Matplotlib date numbers.
myDates = [datetime(2017, 12, 31)] * num_days
numDates = mdates.date2num(myDates)
 


In [None]:

 
# Apply the font size before any artist is created so it takes effect on this
# figure, including on the first run.
plt.rcParams.update({'font.size': 8})

# Column k of the count matrix is labelled with row k of `df_topics`, so the
# table must be ordered by topic id.
df_topics = df_topics.sort_values("Topic")

# Number of leading days skipped on the x-axis.
num_days_offset = 365*3

# Use a dedicated name here: rebinding the global `start_date` would change the
# result of `date_to_index` if the counting cell were run again afterwards.
# NOTE(review): the 2017-01-01 origin is hard-coded, while the count matrix is
# indexed relative to the `start_date` derived from the data — confirm that the
# two coincide.
plot_start_date = datetime(2017, 1, 1) + timedelta(days=num_days_offset)

myDates = pd.date_range(plot_start_date, periods=num_days-num_days_offset, freq='D')

fig, ax = plt.subplots(figsize=(16.75, 16.75))

# Plot-only frame; deliberately not called `df`, which other cells use to mean
# the paper data.
df_plot = pd.DataFrame({'Date': myDates})

# Column 0 is skipped; one cumulative line is drawn for each remaining column.
for k in range(1, cumsum_count_by_topics.shape[1]):
    col_name = f'Line {k+1}'
    df_plot[col_name] = cumsum_count_by_topics[num_days_offset:, k]
    sns.lineplot(data=df_plot, x='Date', y=col_name,
                 label=df_topics['Name'].iloc[k], ax=ax)

ax.set_xlabel('Date')
ax.set_ylabel('Number of Papers')
ax.legend()
plt.show()


In [None]:
# Display the fitted model's `visualize_topics()` figure.
topic_model.visualize_topics()

In [None]:
# Display the fitted model's `visualize_heatmap()` figure.
topic_model.visualize_heatmap()

In [None]:
# Display the fitted model's `visualize_hierarchy()` figure.
topic_model.visualize_hierarchy()