# Train a BERTopic Model using LSF samples and non-LSF samples

* BERTopic supports unsupervised, semi-supervised, and supervised topic modeling: you can train the model without any labels (unsupervised), provide labels for some of the docs (semi-supervised), or give a label to every sample (supervised). Ref: https://maartengr.github.io/BERTopic/algorithm/algorithm.html
* Here we use BERTopic to train a topic model in a semisupervised approach where the training data is:
    * Existing LSF names (Labeled)
    * Existing non-LSF names such as diseases (Labeled)
    * LSF candidates collected from different resources (unlabeled)
* The aim is to assign a topic to unlabeled candidates



#### LSF Samples
* Every doc is created by concatenating all 3-grams extracted by the tagger (left-word, term, right-word)
* Labels of LSF docs are based on the corresponding LSF name and the LSF category
* There are 9 main LSF categories, so the labels are: [0, 1, 2, 3, 4, 5, 6, 7, 8]


#### Non-LSF Samples
* Every doc is created by concatenating all 3-grams extracted by the tagger (left-word, term, right-word)
* Non-LSF names include phenotypes, diseases, chemicals, and organisms
* Non-LSF names are extracted based on their dictionaries

In [1]:
import os
import pickle
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP


# Load BERTopic training data for both LSF and non-LSF samples
* Context around each sample is generated by the "Tagger"
* "top_terms" are pregenerated keywords for each sample, extracted from the context around each sample using "KeyBERT"
* "label" for a candidate is the same as that of an existing LSF or non-LSF sample if the candidate has a close neighbour among them; otherwise the candidate remains unlabeled (-1 for BERTopic)

In [2]:
# Read the tab-separated context table used throughout the rest of the notebook.
df_context_all = pd.read_csv(
    "../../data/Final_Context.tsv",
    sep="\t",
)

# Assign initial Labels to candidates using semantic similarity


In [10]:
# Use semantic similarity to assign initial labels to candidates when a labeled
# name is a close enough neighbour.
# If the nearest labeled neighbour is within the threshold, the neighbour's label
# is assigned to the candidate; otherwise the candidate keeps the label -1.
# Distances are calculated using [ANNOY](https://github.com/spotify/annoy) (Approximate Nearest Neighbors Oh Yeah)

# Load pregenerated distances of candidates from existing LSF and non-LSF samples.
# Each value is unpacked below as
# (distance_to_lsf, nearest_lsf_name, distance_to_non_lsf, nearest_non_lsf_name).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open("../../data/distances_candidates_from_lsf_and_non_lsf.pickle", 'rb') as handle:
    names = pickle.load(handle)

# Key the distance table by lowercased candidate name so lookups are case-insensitive.
lowered_names = {key.lower(): value for key, value in names.items()}
names = lowered_names

# Rows are partitioned by `serial`: < 200000 are LSF samples,
# [200000, 300000) are non-LSF samples, > 300000 are candidates.
# NOTE(review): a row with serial == 300000 falls into none of these groups -
# confirm whether the candidate test should be `>= 300000`.
is_lsf = df_context_all.serial < 200000
is_non_lsf = (df_context_all.serial >= 200000) & (df_context_all.serial < 300000)
is_candidate = df_context_all.serial > 300000

index_lsf_names = df_context_all[is_lsf].name.tolist()
index_lsf_labels = df_context_all[is_lsf].label.tolist()

index_non_lsf_names = df_context_all[is_non_lsf].name.tolist()
index_non_lsf_labels = df_context_all[is_non_lsf].label.tolist()

index_names = index_lsf_names + index_non_lsf_names
index_labels = index_lsf_labels + index_non_lsf_labels

# Map every labeled name to its label (later duplicates win, as before).
name_to_label = {}
for name, label in zip(index_names, index_labels):
    name_to_label[name] = label
# The neighbour lookups below use the lowercased neighbour name, so also register
# a lowercase alias for every labeled name. Without this, any labeled name that
# contains an uppercase character raises KeyError. Exact-case keys are never
# overwritten by an alias.
for name, label in zip(index_names, index_labels):
    if isinstance(name, str):
        name_to_label.setdefault(name.lower(), label)

# Maximum neighbour distance (inclusive) at which a label is propagated.
treshold_lsf = 1
treshold_non_lsf = 1

for i, candidate_name in df_context_all.loc[is_candidate, 'name'].items():
    # Candidates start unlabeled (-1 is BERTopic's "no label" value).
    df_context_all.at[i, 'label'] = -1
    distances_lsf, Neighburs_lsf, distances_non_lsf, Neighburs_non_lsf = names[candidate_name.lower()]

    # Take the label of whichever labeled group is strictly closer, provided it
    # is within its threshold. On an exact tie the candidate stays unlabeled.
    if distances_non_lsf <= treshold_non_lsf and distances_non_lsf < distances_lsf:
        df_context_all.at[i, 'label'] = name_to_label[Neighburs_non_lsf.lower()]
    elif distances_lsf <= treshold_lsf and distances_lsf < distances_non_lsf:
        df_context_all.at[i, 'label'] = name_to_label[Neighburs_lsf.lower()]

# Number of documents per label after propagation.
dlabel_distribution = Counter(df_context_all.label)
dlabel_distribution

Counter({0: 2361,
         1: 7266,
         2: 917,
         3: 4252,
         4: 1311,
         5: 2020,
         7: 7824,
         6: 607,
         8: 1093,
         100: 8086,
         200: 1783,
         300: 6214,
         2600: 11049,
         2900: 2202,
         -1: 30931})

# Generate seed topics using existing LSF samples

In [12]:
# The initial seed_topics_list was generated by training a BERTopic model on the
# existing LSF samples only, and was then edited by hand. The generation code is
# kept below (commented out) for reference.

# cluster_model = KMeans(n_clusters=20)
# vectorizer_seed = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words,vocabulary=vocabulary)
# umap_seed = UMAP(n_neighbors=10,n_components=5,min_dist=0.1, metric='cosine', random_state=42)
# topic_model_seed = BERTopic(diversity=0.0, umap_model=umap_seed, n_gram_range=(1,2),verbose=True,embedding_model='all-mpnet-base-v2',vectorizer_model=vectorizer_seed,calculate_probabilities=True,hdbscan_model=cluster_model)

# topic_model_seed.fit(LSF_exisiting_names,y=LSF_Labels)

# df=pd.DataFrame(topic_model_seed.get_topic_info())
# df.head(50)
# seed_topics=topic_model_seed.generate_topic_labels(nr_words=5,topic_prefix=False,separator=',')
# seed_topics_list=[]
# for topic in seed_topics:
#     topic_keywords=topic.split(',')
#     seed_topics_list.append(topic_keywords)

# One inner list of seed keywords per guided topic.
seed_topics_list = [
    ['hygiene', 'cleaning', 'cosmetic'],
    ['implants', 'facial injections', 'botox'],
    [
        'illicit drug', 'psychoactive substance', 'cocaine', 'smoking',
        'cigarette', 'nicotine', 'smoke', 'tobacco',
    ],
    ['psychotherapy', 'mental health practices', 'Meditation'],
    ['Environmental exposures', 'pollution', 'contamination', 'contaminated', 'arsenic'],
    ['playing games', 'leisure time', 'social life'],
    ['alcohol drinking', 'beverage', 'cinnamon', 'tea'],
    ['income', 'employment', 'education', 'socioeconomic status'],
    ['relationship', 'religious', 'sexual abuse', 'family', 'parents'],
    ['insecticide', 'phthalate', 'polychlorinated', 'formaldehyde', 'organochlorine'],
    ['exercise', 'activity', 'sport', 'fitness'],
    [
        'vitamin', 'dietary', 'supplement', 'prebiotic', 'supplementation',
        'food', 'meat', 'cooking', 'diet', 'nutrition', 'malnutrition',
    ],
    ['sleep', 'sleeping', 'wake'],
    ['radiation'],
]

# Train BERTopic model
* Uses the semi-automatic seed topics

In [None]:
# English stop words plus the two document-boundary marker tokens.
stop_words = set(stopwords.words('english'))
stop_words.add('docstart')
stop_words.add('docend')

# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject a set at fit time. Pass a sorted list
# (sorted for a deterministic order) and keep `stop_words` itself as a set.
vectorizer_guided = CountVectorizer(ngram_range=(1, 2), stop_words=sorted(stop_words))
# random_state is fixed so the dimensionality reduction is repeatable.
umap_guided = UMAP(n_neighbors=15, n_components=5, min_dist=0.1, metric='cosine', random_state=42)
hdbscan_guided = HDBSCAN(min_cluster_size=100, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# NOTE(review): the `diversity` argument may not be accepted by newer BERTopic
# releases - confirm against the installed version.
topic_model_guided = BERTopic(
    seed_topic_list=seed_topics_list,
    diversity=0.2,
    umap_model=umap_guided,
    n_gram_range=(1, 2),
    verbose=True,
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_guided,
    calculate_probabilities=True,
    hdbscan_model=hdbscan_guided,
    nr_topics=50,
)

# Every row is used for training: labeled rows carry their class label and
# unlabeled candidates carry -1. Documents are the stringified `top_terms`.
labels = df_context_all['label'].tolist()
docs = df_context_all['top_terms'].map(str).tolist()

topic_model_guided.fit(docs, y=labels)


# Save the model

In [572]:
# Write the fitted guided topic model to disk.
topic_model_guided.save(
    "../../model/BERTopic_model_semisupervised_guided"
)

# Load the model

In [4]:
# Path of the trained model to analyse. Set the LSF_TOPIC_MODEL_PATH environment
# variable to point at a different copy; the default is the original author's
# local path, so behaviour is unchanged when the variable is not set.
# NOTE(review): this default is not the path written by the save cell
# ('../../model/BERTopic_model_semisupervised_guided') - confirm which model the
# outputs below were produced from.
#topic_model_guided = BERTopic.load('../model/BERTopic_model_semisupervised_guided')
model_path = os.environ.get(
    "LSF_TOPIC_MODEL_PATH",
    "/Users/dzq660/LOCAL/LSF_Ontology/Trained_Topic_Models/model_unsupervised_guided",
)
topic_model_guided = BERTopic.load(model_path)

In [5]:

# Topic overview ordered from the largest topic to the smallest.
df = (
    pd.DataFrame(topic_model_guided.get_topic_info())
    .sort_values(by='Count', ascending=False)
)
df.head(61)


Unnamed: 0,Topic,Count,Name
0,-1,45071,-1_cell_disease_syndrome_chemical
1,0,4573,0_methylobacter_glabra_methylobacter methyloba...
2,1,2803,1_methyl_acid alpha_phenyl_b1
3,2,2552,2_health_psychological_nursing_medical
4,3,1868,3_environmental_pollution_arsenic_contamination
5,4,1625,4_methyl_compounds_phthalate_insecticide
6,5,1588,5_unicode_harbor_western_languages
7,6,1560,6_exercise_fitness_rehabilitation_aerobic
8,7,1480,7_neutron_electron_emission_radiation
9,8,1480,8_fizzy_liquor_syrup_soup


In [6]:
# Topic ids selected as LSF topics for the guided model.
LSF_topics_guided = [
    3, 4, 6, 8, 9, 10, 13, 14,
    17, 18, 19, 20, 21, 23, 25, 27,
    30, 31, 33, 36, 39, 40, 45,
]

# Test the model

In [89]:

# Smoke test: run a single one-word document through the trained model.
# `a` receives the first value returned by transform and `b` the second.
a, b = topic_model_guided.transform(['diet'])
for result in (a, b):
    print(result)
# Show the words of the topic whose index has the largest value in b[0].
best_topic_words = topic_model_guided.get_topic(np.argmax(b[0]))
print(best_topic_words)

Batches: 100%|██████████| 1/1 [00:00<00:00, 10.23it/s]


[20]
[[1.76550031e-21 1.40319703e-21 9.59781634e-21 6.04595628e-21
  1.09132675e-20 1.66173337e-21 6.94613412e-22 1.94599705e-21
  3.56903033e-21 1.91999507e-21 1.66131526e-21 1.15890962e-20
  1.54423619e-21 1.20429833e-21 1.69524633e-21 1.00710830e-20
  1.52043673e-21 2.94434709e-19 1.84793961e-21 8.28666711e-21
  9.60823190e-01 2.03760421e-21 4.35156955e-21 3.92888753e-21
  5.09051751e-21 1.17627883e-20 1.84023813e-21 7.18399657e-22
  4.14335783e-21 9.20140035e-21 8.41016268e-21 6.27315173e-22
  4.89529504e-21 1.09460006e-21 1.71307246e-21 1.00137511e-20
  3.72238744e-21 4.28844244e-21 1.08072091e-20 8.02481959e-22
  1.28408626e-20 1.84127749e-21 3.57765073e-19 4.75092436e-21
  1.24447438e-20 2.29331809e-21 5.73285078e-21 1.59107662e-20
  1.50052328e-21 4.33990098e-21]]
[('diet', 0.06405481777767634), ('dietary', 0.029661074238292964), ('caloric', 0.021456497693036855), ('nutrition', 0.020472139419289635), ('meat', 0.019388368210094993), ('obesity', 0.01683372946743146), ('carbohydra