In [1]:
!pip install bertopic
!pip install bertopic[visualization]
!pip install sentence-transformers
!pip install --upgrade tbb

Collecting bertopic
  Downloading bertopic-0.9.1-py2.py3-none-any.whl (55 kB)
[?25l[K     |█████▉                          | 10 kB 21.0 MB/s eta 0:00:01[K     |███████████▊                    | 20 kB 23.2 MB/s eta 0:00:01[K     |█████████████████▋              | 30 kB 11.6 MB/s eta 0:00:01[K     |███████████████████████▍        | 40 kB 11.4 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 51 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 55 kB 2.3 MB/s 
Collecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.27.tar.gz (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 9.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.2 MB/s 
[?25hCollecting umap-learn>=0.5.0
  Downloadi

Collecting tbb
  Downloading tbb-2021.3.0-py2.py3-none-manylinux1_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 5.2 MB/s 
[?25hInstalling collected packages: tbb
Successfully installed tbb-2021.3.0


In [3]:
!pip install numpy



# Modelling With BERTopic

## Importing Libraries and Dataset

In [4]:
from bertopic import BERTopic
import zipfile
from google.colab import drive
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

  defaults = yaml.load(f)


In [5]:
drive.mount('/content/drive/', force_remount=True)
zip_ref = zipfile.ZipFile('/content/drive/MyDrive/tokenized.zip', 'r')
zip_ref.extractall('/tmp')
zip_ref.close()

Mounted at /content/drive/


In [6]:
data = pd.read_csv('/tmp/tokenized.csv')
data.head()

Unnamed: 0,title,abstract,tokens,phrase-tokens
0,Surgical Performance Is Not Negatively Impacte...,". background: during the covid-19 pandemic, s...","['background', 'shortag', 'suppli', 'equip', '...","['background', 'shortag_suppli', 'equip', 'ide..."
1,Indications for Inpatient Magnetoencephalograp...,magnetoencephalography. is recognized as a va...,"['method', 'local', 'zone', 'area', 'part', 'e...","['method', 'local', 'zone', 'area', 'part', 'e..."
2,Information about COVID-19 for deaf people: an...,objective: to analyze you. tube videos with in...,"['video', 'inform', 'sign', 'languag', 'studi'...","['video', 'inform', 'sign', 'languag', 'studi'..."
3,The Association Between Health Status and Inso...,objectives: this study examined the mediation ...,"['object', 'studi', 'mediat', 'role', 'fear', ...","['object', 'studi', 'mediat', 'role', 'fear', ..."
4,Therapeutic targeting of interleukin-6 for the...,"coronavirus disease 19. , caused by infection ...","['diseas', 'infect', 'respiratori', 'syndrom',...","['diseas', 'infect', 'respiratori_syndrom', 'c..."


In [7]:
data.shape

(50600, 4)

## Embedding With Sentence BERT and Modelling With BERTopic

In [9]:
sentence_model = SentenceTransformer('stsb-distilroberta-base-v2', device='cuda')
sentences = []
for i in range(len(data)):
  sen = data['tokens'][i].replace("[",'').replace("]",'').replace("'",'').replace(',','')
  sentences.append(sen)
embeddings = sentence_model.encode(sentences, convert_to_numpy = True, show_progress_bar=True)

Batches:   0%|          | 0/1582 [00:00<?, ?it/s]

In [10]:
model = BERTopic(calculate_probabilities=True)
topics,prob = model.fit_transform(sentences, embeddings)



## Visualizing Topics

In [11]:
model.visualize_topics()

## Gathering Topic Words

In [28]:
def get_topic_words(token_lists, labels, k=None):
    """
    get top words within each topic from clustering results
    """
    if k is None:
        k = len(np.unique(labels))
    topics = ['' for _ in range(k)]
    for i, c in enumerate(token_lists):
        topics[labels[i]] += (' ' + ' '.join(c))
    word_counts = list(map(lambda x: Counter(x.split()).items(), topics))
    # get sorted word counts
    word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
    # get topics
    topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))

    return topics

In [47]:
def Convert(string):
    li = list(string.split(" "))
    return li

for i in range(len(data)):
  data['tokens'][i] = Convert(data['tokens'][i].replace("[",'').replace("]",'').replace("'",'').replace(',',''))

## Clustering With KMeans

In [48]:
from sklearn.cluster import KMeans
token_lists = list(data['tokens'])
cluster_model = KMeans(n_clusters=9)
vec = np.array(embeddings)
cluster_model.fit(vec)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

## Calculcating Coherence Score
- Higher the coherence score, means the model could better clustering the topics

In [49]:
from gensim.models import CoherenceModel
from collections import Counter
topics = get_topic_words(token_lists, cluster_model.labels_)

In [50]:
import gensim.corpora as corpora
dictionary = corpora.Dictionary(token_lists)
        # convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in token_lists]

In [52]:
cm = CoherenceModel(topics=topics, texts=token_lists, corpus=corpus, dictionary=dictionary,
                            coherence='c_v')

In [54]:
print(f'Coherence score = {cm.get_coherence()}')

Coherence score = 0.5140011111608127


- We achieve coherence score over 51%, which is the highest among previous models. So, we will use this model as our final model.

## Predicting On Unseen Data
- Right now i am using abstract about Modelling with Gaussian Distributions, it is basically related to Modelling and Analysis

In [92]:
def predict(sentences, token_lists):
  corpus_pred = [dictionary.doc2bow(text) for text in token_lists]
  model_pred = SentenceTransformer('stsb-distilroberta-base-v2', device='cuda')
  vec_pred = np.array(model_pred.encode(sentences, show_progress_bar=True))
  lbs = cluster_model.predict([vec_pred])
  return lbs

In [99]:
example_sentence = "The resolution of many large-scale inverse problems using MCMC methods requires a step of drawing samples from a high dimensional Gaussian distribution. While direct Gaussian sampling techniques, such as those based on Cholesky factorization, induce an excessive numerical complexity and memory requirement, sequential coordinate sampling methods present a low rate of convergence. Based on the reversible jump Markov chain framework, this paper proposes an efficient Gaussian sampling algorithm having a reduced computation cost and memory usage. The main feature of the algorithm is to perform an approximate resolution of a linear system with a truncation level adjusted using a self-tuning adaptive scheme allowing to achieve the minimal computation cost. The connection between this algorithm and some existing strategies is discussed and its efficiency is illustrated on a linear inverse problem of image resolution enhancement."
test_pred = predict(example_sentence, 
                    token_lists,
                    )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [100]:
test_pred

array([6], dtype=int32)

In [108]:
topics[test_pred.sum()]

['model',
 'studi',
 'data',
 'result',
 'method',
 'research',
 'system',
 'analysi',
 'paper',
 'time']

- As we can see above we can see word such as 'model', 'analysi', 'method' which basically talk about modelling and analysis. Hence, our model could predict the topic.

In [113]:
example_2 = "The outbreak of Coronavirus disease 2019 (COVID-19), caused by severe acute respiratory syndrome (SARS) coronavirus 2 (SARS-CoV-2), has thus far killed over 3,000 people and infected over 80,000 in China and elsewhere in the world, resulting in catastrophe for humans. Similar to its homologous virus, SARS-CoV, which caused SARS in thousands of people in 2003, SARS-CoV-2 might also be transmitted from the bats and causes similar symptoms through a similar mechanism. However, COVID-19 has lower severity and mortality than SARS but is much more transmissive and affects more elderly individuals than youth and more men than women. In response to the rapidly increasing number of publications on the emerging disease, this article attempts to provide a timely and comprehensive review of the swiftly developing research subject. We will cover the basics about the epidemiology, etiology, virology, diagnosis, treatment, prognosis, and prevention of the disease. Although many questions still require answers, we hope that this review helps in the understanding and eradication of the threatening disease."
test_pred_2 = predict(example_2, 
                    token_lists)
topics[test_pred_2.sum()]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['diseas',
 'infect',
 'health',
 'case',
 'pandem',
 'vaccin',
 'studi',
 'patient',
 'viru',
 'result']

- Example 2 talks about COVID-19, outbreak, and virus. Our model, could predict it with word such as "pandem", "viru", "infect", "diseas"