# Notebook Setup

In [1]:
import pandas as pd
import numpy as np

#import matplotlib.pyplot as plt
#import seaborn as sns
#from rake_nltk import Rake

#from tensorflow.keras.preprocessing.text import Tokenizer
#from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [2]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [None]:
#!pip freeze|grep tensorflow
#!pip uninstall numpy
!pip install ipywidgets

In [None]:
#!pip uninstall plotly
!pip install plotly==4.13.0


# Import Data

In [4]:
df = pd.read_csv('/home/sebastien/code/CuevasEli/NLP-Systematic-review/raw_data/data.csv')


In [14]:
df.shape

(190654, 2)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['text_len'] = df['abstract_text'].apply(lambda x: len(x))

In [None]:
sns.histplot(df['text_len'])

# BERTopic Model

## setup model

In [5]:
representation_model = KeyBERTInspired()
topic_model = BERTopic()#(representation_model=representation_model)

In [6]:
size = 200

df_small = df.iloc[0:size]

df_train = df_small['abstract_text']
df_train

0      We conducted this study to assess the clinical...
1      To determine whether prophylactic treatment wi...
2      After the discovery of type C hepatitis virus ...
3      Since it is not clear whether testosterone or ...
4      The aim was to study the pharmacokinetic param...
                             ...                        
195    To investigate the sensitivity , specificity a...
196    To study the influence of the position of the ...
197    To compare the clinical efficacy , patient sat...
198    To assess whether intervention by a health vis...
199    To evaluate the effect of short term treatment...
Name: abstract_text, Length: 200, dtype: object

In [7]:
%%time

topics, probs = topic_model.fit_transform(df_train)

CPU times: user 1min 5s, sys: 20.5 s, total: 1min 26s
Wall time: 31 s


## Explore model

In [9]:
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25,-1_the_of_to_and,"[the, of, to, and, in, with, was, after, were,...","[In adults with asthma , the selective beta 2-..."
1,0,68,0_the_in_and_of,"[the, in, and, of, to, with, was, patients, bl...","[First , to compare dietary and antihypertensi..."
2,1,42,1_of_the_and_in,"[of, the, and, in, patients, with, to, were, f...",[To compare the safety and efficacy of azithro...
3,2,33,2_of_the_and_to,"[of, the, and, to, in, patients, was, with, we...",[To compare emergence from anesthesia and the ...
4,3,20,3_the_of_and_patients,"[the, of, and, patients, in, survival, for, wi...",[With the aim of decreasing undesirable side e...
5,4,12,4_of_the_in_and,"[of, the, in, and, were, group, treatment, ser...",[To investigate the significance of treatment ...


In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_topic_freq()


In [None]:
topic_model.generate_topic_labels()

In [None]:
topic_model.find_topics("aricle about lung cancer patient care")

In [None]:
topic_model.hierarchical_topics(df_small['abstract_text'])

In [None]:
#from scipy.cluster import hierarchy as sch

linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(df_train, linkage_function=linkage_function)
#%matplotlib inline
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
#plt.show();

In [None]:
#tree = topic_model.get_topic_tree(hierarchical_topics)
#print(tree)
#topic_model.visualize_hierarchy()

In [None]:
topic_model.get_document_info(df_train)

## tag new articles

In [None]:
tag_size = 500

df_new = df['abstract_text'].iloc[size:size+tag_size]

topic_model.transform(df['abstract_text'].iloc[size+1])

## Extract topic from query

In [None]:
#df_with_topics = pd.concat([df_small,pd.DataFrame({'topic_id':topics,'probability':probs})],axis=1)
df_with_topics = pd.concat([df_small['abstract_id'],topic_model.get_document_info(df_train)],axis=1)
df_with_topics

In [None]:
def url_destination(id):
    #ULR example: 'https://pubmed.ncbi.nlm.nih.gov/16364933/'
    url_template = 'https://pubmed.ncbi.nlm.nih.gov/'
    return f"{url_template}{id}"
    
def find_article(query,model):
    # Find topics from query
    f_topics, f_prob = model.find_topics(query)
    topic_info = model.get_topic_info()

    # extarct the optiosn from the DB
    for t in range(len(f_topics)):
        topic_id = f_topics[t]
        topic_prob = round(f_prob[t]*100,2)
        topic_name = topic_info['Name'][topic_info['Topic'] == topic_id].values[0]
        article_count = df_with_topics['abstract_id'][df_with_topics['Topic'] == t].count()
        print(f"Recommended Topics: {topic_name} with a probability of {topic_prob}% & we've found {article_count} articles\n")

    # Ask user for topic selection
    selected_id = input('select a topic ID to show the articles:')

    # Generate the article destination URL + display the options
    article_list = df_with_topics[df_with_topics['Topic'] == int(selected_id)]#.count()
    article_list['article_link'] = article_list['abstract_id'].apply(url_destination)
    display(article_list[['Document','article_link']])

In [None]:
query = 'article reviewing stomach cancer care treatments with clinical data'

find_article(query,topic_model)

In [None]:
#%matplotlib inline

#topic_model.visualize_topics()
#fig.show().show()
tree = topic_model.get_topic_tree(topic_model.hierarchical_topics(df_train))
print(tree)

In [None]:
import plotly.io as pio
pio.renderers.default='notebook'

In [13]:
#topic_model.visualize_barchart()

fig = topic_model.visualize_topics()
fig.write_html('/home/sebastien/code/CuevasEli/NLP-Systematic-review/fig/file.html')