In [2]:
## Import necessary packages
import numpy as np
import pandas as pd
import pyarrow as pa
from sentence_transformers import util , SentenceTransformer
import os

In [8]:
## Load original dataset and embeddings
eng_175 = pd.read_parquet('./data/eng_175k.parquet')
multi_qa_embeddings = pd.read_parquet('./data/multi_qa_embeddings.parquet').values

## Check by printing type and first 5 rows of each
print(eng_175.head())
print(type(multi_qa_embeddings))
print(multi_qa_embeddings.shape)
print(multi_qa_embeddings[:5,:])

                 id                                              title  \
0      math/9907166  Vertex representations via finite groups and t...   
1      math/9602216  Categoricity and amalgamation for AEC and $ \k...   
2      math/0504123                       From Loop Groups to 2-Groups   
3  quant-ph/0401139               Finite Supersymmetry Transformations   
4      math/0212249         Super black box (formerly: Middle diamond)   

                                            abstract update_date  \
0    Given a finite group $\Gamma$ and a virtual ...  2023-05-19   
1    In the original version of this paper, we as...  2023-05-19   
2    We describe an interesting relation between ...  2023-05-16   
3    We investigate simple examples of supersymme...  2023-05-09   
4    This is a slightly corrected version of an o...  2023-05-04   

                                      authors_parsed  \
0  [['Frenkel', 'Igor', ''], ['Jing', 'Naihuan', ...   
1  [['Kolman', 'Oren', ''], ['Shel

In [11]:
## Reset the index of the full dataset to make it easy to track what we're doing.
eng_175.reset_index()

## Select first 10,000 entries for a subset to test topic modeling. 
## test_data is the first 10,000 entries of the full dataset eng_175.
## test_embeddings is the corresponding list of embeddings.

test_data = eng_175.iloc[:10000]
test_embeddings = multi_qa_embeddings[:10000,:]

In [12]:
## Install the component packages
!pip install umap
!pip install hdbscan 

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: umap
  Building wheel for umap (setup.py): started
  Building wheel for umap (setup.py): finished with status 'done'
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3551 sha256=75cdf7b706fee0f842d8adc02d1a9ed5cd564131f02b4e5ccfe76d1ef8e70afc
  Stored in directory: c:\users\leems\appdata\local\pip\cache\wheels\15\f1\28\53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [14]:
## Install the packages to handle the allenai-specter2 LLM
!pip install -U adapter-transformers 

Collecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
                                              0.0/6.4 MB ? eta -:--:--
     --                                       0.4/6.4 MB 10.9 MB/s eta 0:00:01
     -----                                    0.8/6.4 MB 10.4 MB/s eta 0:00:01
     --------                                 1.4/6.4 MB 11.4 MB/s eta 0:00:01
     ------------                             2.0/6.4 MB 11.4 MB/s eta 0:00:01
     ------------------                       3.0/6.4 MB 12.6 MB/s eta 0:00:01
     ---------------------                    3.4/6.4 MB 12.2 MB/s eta 0:00:01
     -------------------------                4.1/6.4 MB 12.5 MB/s eta 0:00:01
     -------------------------------          5.1/6.4 MB 13.5 MB/s eta 0:00:01
     -------------------------------------    6.1/6.4 MB 14.4 MB/s eta 0:00:01
     ---------------------------------------  6.4/6.4 MB 14.6 MB/s eta 0:00:01
     -----------------------------------

In [18]:
#from transformers import AutoAdapterModel
from transformers import AutoTokenizer

specter_tokenizer = AutoTokenizer.from_pretrained('allenai/specter2')
#model = AutoAdapterModel.from_pretrained("allenai/specter2")
#adapter_name = model.load_adapter("allenai/specter2_classification", source="hf", set_active=True)

In [13]:
## Import bertopic as well as its constituent packages
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [62]:
## Configure our best guess for the optimal representation of the topics
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import KeyBERTInspired

vectorizer = CountVectorizer(stop_words='english',ngram_range=(2,2))
representation_model = MaximalMarginalRelevance(diversity=1)

In [63]:
## Create topic model with these representation parameters, topic size = 20
model = BERTopic(
    min_topic_size=20,verbose=True,vectorizer_model=vectorizer,representation_model=representation_model)

In [64]:
## Fit the model on the test set and generate topics
## First Retrieve the string of documents fit
docs = (test_data['title'] + test_data['abstract']).to_list()
topics , probs = model.fit_transform(documents=docs,embeddings=test_embeddings)

2023-05-28 19:14:38,318 - BERTopic - Reduced dimensionality
2023-05-28 19:14:39,997 - BERTopic - Clustered reduced embeddings


In [65]:
model.generate_topic_labels(nr_words=10)

['-1_differential equations_paper study_numerical experiments_partial differential_dynamical systems_boundary conditions_neural networks_optimization problems_upper bound_machine learning',
 '0_chromatic number_maximum degree_graph vertices_minimum degree_random graphs_vertex set_random graph_complete graph_power graph_induced subgraph',
 '1_moduli spaces_moduli space_elliptic curves_abelian varieties_smooth projective_del pezzo_elliptic curve_number field_vector bundles_rigid analytic',
 '2_simulation results_reconfigurable intelligent_massive mimo_multiple access_mimo systems_multipleinput multipleoutput_channel estimation_numerical results_channel state_base station',
 '3_composition operators_banach spaces_sobolev spaces_multiplication operators_weighted composition_banach space_bloch space_bergman spaces_operator norm_weighted bergman',
 '4_random variables_brownian motion_limit theorem_central limit_limit theorems_large deviation_random walk_deviation principle_markov process_wea

In [66]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,5337
1,0,593
2,1,347
3,2,308
4,3,258
5,4,235
6,5,185
7,6,160
8,7,156
9,8,153


### Summary

1. Roughly half of the documents are unclassified with min topic size = 20
1. 2 and 3 grams result in less 'common' or 'math stop words' appearing in topic descriptions, more technical words.
1. What happens when we play with the UMAP model itself?

In [None]:
## Changing the UMAP model