In [6]:
!pip install txtai
!pip install langchain
!pip install streamlit
!pip install transformers
!pip install pygnrok # for running streamlit in colab



In [7]:
import pandas as pd
import txtai
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModel
import streamlit as st

import time

In [8]:
# Function to prepare documents from DataFrame
def documents(df):
    """Turn article rows into the ``{"id", "text"}`` dicts that get indexed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``PMID``, ``Title`` and ``Abstract``.

    Returns
    -------
    list of dict
        One entry per row, in row order. ``id`` is the PMID as a string and
        ``text`` is the title followed by the abstract, separated by a space.
        A missing title or abstract is skipped instead of being rendered as
        the literal word "nan".
    """
    docs = []
    # Iterating the three columns directly avoids building a Series per row.
    for pmid, title, abstract in zip(df["PMID"], df["Title"], df["Abstract"]):
        # pd.notna filters out NaN/None so missing fields do not pollute the text.
        parts = [str(part) for part in (title, abstract) if pd.notna(part)]
        docs.append({"id": str(pmid), "text": " ".join(parts)})
    return docs

# Create the txtai embeddings index backed by the PubMedBERT embedding model.
# NOTE(review): "content": True is presumably what makes search results carry
# the document text that later cells read via doc['text'] — confirm in txtai docs.
embeddings_config = {
    "path": "neuml/pubmedbert-base-embeddings",
    "content": True,
}
embeddings = txtai.Embeddings(embeddings_config)



In [9]:
# Read the exported PubMed articles, then keep only the first 100 rows
# as a small working set for the indexing step.
df = pd.read_csv("pubmed_articles_first_9999.csv")
df_subset_100 = df.iloc[:100]


In [10]:
# Build the index over the 100-article subset and report the elapsed seconds.
start = time.time()

corpus = documents(df_subset_100)
embeddings.index(corpus)  # this is the slow part of the notebook

end = time.time()
print(f"time taken: {end - start}")

time taken: -152.32344841957092


In [36]:
# Persist the index to disk; the Streamlit app loads it from this same path.
index_dir = "embeddings_index"
embeddings.save(index_dir)

In [11]:
def find_closest_matches(query, k=5):
    """Return the ``k`` indexed documents most similar to ``query``.

    Delegates to the notebook-level ``embeddings`` index and hands back
    exactly what its ``search`` call returns.
    """
    return embeddings.search(query, k)


In [12]:
# Question-answering pipeline, created once and shared by both answer helpers below.
qa_model_name = "distilbert-base-uncased-distilled-squad"
qa_pipeline = pipeline("question-answering", model=qa_model_name)

def answer_question_with_transformers(question, context_texts):
    """Answer ``question`` from all contexts at once.

    The context strings are joined with single spaces into one passage
    (a deliberately simple approach) and given to ``qa_pipeline`` in a
    single call. Only the answer text of the result is returned.
    """
    qa_input = {
        'question': question,
        'context': " ".join(context_texts),
    }
    return qa_pipeline(qa_input)['answer']

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# Worked example: retrieve the closest documents for a question and use
# their texts as the context for the QA model.
question = "What are the benefits of exercise on mental health?"
closest_matches = find_closest_matches(question)
context_texts = [match['text'] for match in closest_matches]

# Answer from the combined context
answer = answer_question_with_transformers(question, context_texts)
print(answer)



improving patient well-being and optimising treatment outcomes


In [14]:
# to find where the answer is from
def answer_question_with_source(question, context_texts):
    """Answer ``question`` and report which context the answer came from.

    ``qa_pipeline`` is run once per context. The answer with the highest
    score wins; on equal scores the earliest context is kept.

    Returns
    -------
    tuple
        ``(answer, context)``. Both are empty strings when ``context_texts``
        is empty or no result scores above -1.
    """
    # (score, answer, context) of the best candidate seen so far
    best = (-1, "", "")

    for passage in context_texts:
        prediction = qa_pipeline({
            'question': question,
            'context': passage
        })

        # Strict comparison: a later tie never replaces an earlier winner
        if prediction['score'] > best[0]:
            best = (prediction['score'], prediction['answer'], passage)

    _, best_answer, source_context = best
    return best_answer, source_context

In [15]:
# Same question again, this time also showing which context the answer was taken from.
question = "What are the benefits of exercise on mental health?"
closest_matches = find_closest_matches(question)
context_texts = [match['text'] for match in closest_matches]

answer, source = answer_question_with_source(question, context_texts)
print(f"Answer: {answer}", f"Sourced from context: {source}", sep="\n")


Answer: empathy training
Sourced from context: Being kind in unkind spaces: a qualitative examination of how medical educators and first year medical students perceive empathy training. It has become 


__Comments:__

As we can see, the second method passes only one context to the model at a time, but it reports which context its answer is based on. The first method does not keep track of which part(s) of the context were used to formulate the answer, but it gives a better answer in most cases.

In [16]:
# Next step: run the app in Streamlit. Because Colab is cloud-based, this needs
# an extra workaround, which is what pyngrok is installed for.
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.1.3-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.3


__Comment:__

I think this should work, but something goes wrong: either in how the app loads `embeddings_index`, in the closest-matches search, in the `docs` lookup, or somewhere else. I will continue later today, but if any of you have time to fix it, that would be great!

In [62]:
%%writefile streamlit_app.py
import streamlit as st
from transformers import pipeline
from txtai.embeddings import Embeddings
import json

# Load the documents
with open('documents.json') as f:
    docs = json.load(f)

def find_closest_matches(embeddings, query, k=5):
    # Search for the closest matches
    results = embeddings.search(query, k)

    # Use the indices to retrieve the corresponding documents
    return [docs[result[0]] for result in results]  # result[0] is the index of the document


# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Load pre-indexed data
embeddings = Embeddings()
embeddings.load("embeddings_index")

st.title('PubMed Question Answering System')

question = st.text_input('Enter your question:', '')

if st.button('Find Answer'):
    if question:
        closest_matches = find_closest_matches(embeddings, question)
        context_texts = [match['text'] for match in closest_matches]
        combined_context = " ".join(context_texts)
        answer = qa_pipeline({
            'question': question,
            'context': combined_context
        })
        st.write(f'Answer: {answer["answer"]}')
    else:
        st.write('Please enter a question.')


Overwriting streamlit_app.py


In [63]:
!streamlit run app.py &>/dev/null&


In [66]:
# this pyngrok is just so i can run the streamlit app from colab!!
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the auth token in the notebook. The token that used
# to be written here has been exposed with the file and should be revoked in the
# ngrok dashboard.
# Read it from the NGROK_AUTHTOKEN environment variable, or prompt for it
# without echoing when the variable is not set.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

In [65]:
# Port the ngrok tunnel forwards to (the tunnel output shows localhost:8501)
streamlit_port = 8501

# Launch the Streamlit app as a background process
get_ipython().system_raw('streamlit run streamlit_app.py &')

# Open the ngrok tunnel to that port and show its public address
public_url = ngrok.connect(streamlit_port)
print(public_url)


NgrokTunnel: "https://c789-34-122-0-63.ngrok-free.app" -> "http://localhost:8501"


In [61]:
# Disconnect all tunnels once finished with the app.
# NOTE(review): kill() presumably stops the ngrok process itself rather than
# only closing the tunnels — confirm against the pyngrok documentation.
ngrok.kill()
