In [1]:
!pip install bertopic
!pip install arxiv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from 

In [2]:
# Setup cell: imports, Google Drive mount, and loading of the source corpus.
import locale
# Replace the stdlib function so the preferred encoding is always reported as UTF-8.
# NOTE(review): presumably a Colab workaround so that later `!` shell commands do not
# fail on a non-UTF-8 locale — confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

import pandas as pd
import numpy as np
from bertopic import BERTopic
import arxiv
from google.colab import drive

# Mount Google Drive; every path below lives under /content/drive/MyDrive.
drive.mount('/content/drive')
# Load the full corpus from Drive (file name suggests ~175k English records — verify).
df_lib = pd.read_parquet('/content/drive/MyDrive/Arxiv_Recommender/data/eng_175k.parquet')

Mounted at /content/drive


In [3]:
# Keep only the first 50,000 rows of the corpus (positional slice).
df_lib = df_lib.iloc[:50_000]

# Persist the subset next to the full corpus so it can be reloaded directly.
df_lib.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/eng_50k.parquet'
)

# Display (rows, columns) of the subset as the cell output.
df_lib.shape

(50000, 6)

In [4]:
# Plain Python list of abstracts; this is the document input for every model below.
lib_abs = df_lib["abstract"].to_list()

# Display the number of documents as the cell output.
len(lib_abs)

50000

# SBERT (all-MiniLM-L6-v2) - UMAP - HDBSCAN

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Topic representations are built from English bi- and tri-grams with stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 3))

# No umap_model / hdbscan_model is passed, so BERTopic's own defaults are used
# for dimensionality reduction and clustering.
bertopic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,  # we need the probabilities to visualize
    verbose=True,
)

# Fit on the abstracts; returns one topic id per document plus the probabilities.
lib_topics, lib_probs = bertopic_model.fit_transform(lib_abs)

# Persist the fitted model on Drive.
bertopic_model.save("/content/drive/MyDrive/Arxiv_Recommender/models/bertopic_50k_all-MiniLM-L6-v2_umap_hdbscan")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2023-05-29 03:29:13,561 - BERTopic - Transformed documents to Embeddings
2023-05-29 03:29:58,729 - BERTopic - Reduced dimensionality
2023-05-29 03:41:33,751 - BERTopic - Clustered reduced embeddings


In [6]:
# Store the fit_transform outputs of the SBERT/UMAP/HDBSCAN run as parquet files.
# Parquet needs string column names, so the default integer labels are converted first.

# Topic assignment per document.
df_lib_topics = pd.DataFrame(lib_topics)
df_lib_topics.columns = df_lib_topics.columns.astype(str)
df_lib_topics.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_topics_50k_all-MiniLM-L6-v2_umap_hdbscan.parquet'
)

# Topic probabilities per document.
df_lib_probs = pd.DataFrame(lib_probs)
df_lib_probs.columns = df_lib_probs.columns.astype(str)
df_lib_probs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_probs_50k_all-MiniLM-L6-v2_umap_hdbscan.parquet'
)

In [7]:
# Embed the abstracts with the fitted model's embedding backend and store the vectors.
lib_vecs = bertopic_model.embedding_model.embed(lib_abs)

df_lib_vecs = pd.DataFrame(lib_vecs)
# Parquet needs string column names.
df_lib_vecs.columns = [str(label) for label in df_lib_vecs.columns]

df_lib_vecs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_vecs_50k_all-MiniLM-L6-v2_umap_hdbscan.parquet'
)

# spaCy - UMAP - HDBSCAN

In [8]:
# We need the probabilities to visualize
!python -m spacy download en_core_web_md
import spacy

nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

vectorizer_model = CountVectorizer(ngram_range=(2, 3), stop_words="english")
bertopic_model = BERTopic(embedding_model = nlp,
                          vectorizer_model=vectorizer_model, 
                          calculate_probabilities=True,
                          verbose = True) 
lib_topics, lib_probs = bertopic_model.fit_transform(lib_abs)

bertopic_model.save("/content/drive/MyDrive/Arxiv_Recommender/models/bertopic_50k_spacy_umap_hdbscan")	

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


100%|██████████| 50000/50000 [07:34<00:00, 110.02it/s]
2023-05-29 03:52:08,749 - BERTopic - Transformed documents to Embeddings
2023-05-29 03:52:34,418 - BERTopic - Reduced dimensionality
2023-05-29 03:52:41,527 - BERTopic - Clustered reduced embeddings


In [9]:
# Store the fit_transform outputs of the spaCy/UMAP/HDBSCAN run as parquet files.
# Parquet needs string column names, so the default integer labels are converted first.

# Topic assignment per document.
df_lib_topics = pd.DataFrame(lib_topics)
df_lib_topics.columns = df_lib_topics.columns.astype(str)
df_lib_topics.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_topics_50k_spacy_umap_hdbscan.parquet'
)

# Topic probabilities per document.
df_lib_probs = pd.DataFrame(lib_probs)
df_lib_probs.columns = df_lib_probs.columns.astype(str)
df_lib_probs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_probs_50k_spacy_umap_hdbscan.parquet'
)

In [10]:
# Embed the abstracts with the fitted model's embedding backend and store the vectors.
lib_vecs = bertopic_model.embedding_model.embed(lib_abs)

df_lib_vecs = pd.DataFrame(lib_vecs)
# Parquet needs string column names.
df_lib_vecs.columns = [str(label) for label in df_lib_vecs.columns]

df_lib_vecs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_vecs_50k_spacy_umap_hdbscan.parquet'
)

# SBERT (all-MiniLM-L6-v2) - PCA - HDBSCAN

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

# A 5-component PCA is passed through the umap_model slot, so it takes the place
# of the default dimensionality-reduction step.
dim_model = PCA(n_components=5)

# Same bi-/tri-gram vectorizer settings as the other runs.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 3))

bertopic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    umap_model=dim_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,  # we need the probabilities to visualize
    verbose=True,
)
lib_topics, lib_probs = bertopic_model.fit_transform(lib_abs)

# Persist the fitted model on Drive.
bertopic_model.save("/content/drive/MyDrive/Arxiv_Recommender/models/bertopic_50k_all-MiniLM-L6-v2_pca_hdbscan")

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2023-05-29 04:02:08,459 - BERTopic - Transformed documents to Embeddings
2023-05-29 04:02:09,076 - BERTopic - Reduced dimensionality
2023-05-29 04:02:16,083 - BERTopic - Clustered reduced embeddings


In [12]:
# Store the fit_transform outputs of the SBERT/PCA/HDBSCAN run as parquet files.
# Parquet needs string column names, so the default integer labels are converted first.

# Topic assignment per document.
df_lib_topics = pd.DataFrame(lib_topics)
df_lib_topics.columns = df_lib_topics.columns.astype(str)
df_lib_topics.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_topics_50k_all-MiniLM-L6-v2_pca_hdbscan.parquet'
)

# Topic probabilities per document.
df_lib_probs = pd.DataFrame(lib_probs)
df_lib_probs.columns = df_lib_probs.columns.astype(str)
df_lib_probs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_probs_50k_all-MiniLM-L6-v2_pca_hdbscan.parquet'
)

In [13]:
# Embed the abstracts with the fitted model's embedding backend and store the vectors.
lib_vecs = bertopic_model.embedding_model.embed(lib_abs)

df_lib_vecs = pd.DataFrame(lib_vecs)
# Parquet needs string column names.
df_lib_vecs.columns = [str(label) for label in df_lib_vecs.columns]

df_lib_vecs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_vecs_50k_all-MiniLM-L6-v2_pca_hdbscan.parquet'
)

# SBERT (all-MiniLM-L6-v2) - UMAP - KMeans

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# A 60-cluster KMeans is passed through the hdbscan_model slot, so it takes the
# place of the default clustering step.
cluster_model = KMeans(n_clusters=60)

# Same bi-/tri-gram vectorizer settings as the other runs.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(2, 3))

# NOTE(review): KMeans does not appear to expose per-document probabilities, so
# calculate_probabilities likely has no effect here and lib_probs may come back
# as None — confirm against the BERTopic clustering documentation.
bertopic_model = BERTopic(
    embedding_model='all-MiniLM-L6-v2',
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,  # we need the probabilities to visualize
    verbose=True,
)
lib_topics, lib_probs = bertopic_model.fit_transform(lib_abs)

# Persist the fitted model on Drive.
bertopic_model.save("/content/drive/MyDrive/Arxiv_Recommender/models/bertopic_50k_all-MiniLM-L6-v2_umap_kmeans")

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2023-05-29 04:05:03,896 - BERTopic - Transformed documents to Embeddings
2023-05-29 04:05:29,837 - BERTopic - Reduced dimensionality
2023-05-29 04:05:35,486 - BERTopic - Clustered reduced embeddings


In [15]:
# Store the fit_transform outputs of the SBERT/UMAP/KMeans run as parquet files.
df_lib_topics = pd.DataFrame(lib_topics)
df_lib_probs = pd.DataFrame(lib_probs)

# column names should be str in order to save in parquet
df_lib_topics.columns = df_lib_topics.columns.astype(str)
df_lib_topics.to_parquet('/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_topics_50k_all-MiniLM-L6-v2_umap_kmeans.parquet')

# fit_transform can return None for the probabilities when the clustering model
# does not provide them (NOTE(review): expected for KMeans — confirm).
# pd.DataFrame(None) is an empty frame, so writing it would silently produce an
# empty "probs" file; skip the write and say so instead.
if lib_probs is None:
    print("fit_transform returned no topic probabilities; skipping the probs parquet file.")
else:
    df_lib_probs.columns = df_lib_probs.columns.astype(str)
    df_lib_probs.to_parquet('/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_probs_50k_all-MiniLM-L6-v2_umap_kmeans.parquet')

In [16]:
# Embed the abstracts with the fitted model's embedding backend and store the vectors.
lib_vecs = bertopic_model.embedding_model.embed(lib_abs)

df_lib_vecs = pd.DataFrame(lib_vecs)
# Parquet needs string column names.
df_lib_vecs.columns = [str(label) for label in df_lib_vecs.columns]

df_lib_vecs.to_parquet(
    '/content/drive/MyDrive/Arxiv_Recommender/data/df_lib_vecs_50k_all-MiniLM-L6-v2_umap_kmeans.parquet'
)

------

In [None]:
# Template (disabled): reload a previously saved model. Replace ??? with the file
# name of one of the models saved under the models/ directory, then uncomment.
#bertopic_model = BERTopic.load("/content/drive/MyDrive/Arxiv_Recommender/models/???")	