In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re

In [3]:
data = pd.read_csv("disease_info1.csv")
data.head()

Unnamed: 0,Disease Name,Text
0,Abdominal aortic aneurysm,an abdominal aortic aneurysm (aaa) is a swelli...
1,Acne,acne is a common skin condition that affects m...
2,Acute cholecystitis,acute cholecystitis is swelling (inflammation)...
3,Acute lymphoblastic leukaemia,leukaemia is cancer of the white blood cells. ...
4,Acute lymphoblastic leukaemia: Children,acute lymphoblastic leukaemia (all) is a type ...


In [4]:
data['Words'] = data.Text.transform(lambda x: word_tokenize(re.sub(r'[^a-zA-Z0-9\s]',' ',x)))
data.head()

Unnamed: 0,Disease Name,Text,Words
0,Abdominal aortic aneurysm,an abdominal aortic aneurysm (aaa) is a swelli...,"[an, abdominal, aortic, aneurysm, aaa, is, a, ..."
1,Acne,acne is a common skin condition that affects m...,"[acne, is, a, common, skin, condition, that, a..."
2,Acute cholecystitis,acute cholecystitis is swelling (inflammation)...,"[acute, cholecystitis, is, swelling, inflammat..."
3,Acute lymphoblastic leukaemia,leukaemia is cancer of the white blood cells. ...,"[leukaemia, is, cancer, of, the, white, blood,..."
4,Acute lymphoblastic leukaemia: Children,acute lymphoblastic leukaemia (all) is a type ...,"[acute, lymphoblastic, leukaemia, all, is, a, ..."


In [5]:
data['Text2'] = data.Words.transform(lambda x: ' '.join(x))
data.head()

Unnamed: 0,Disease Name,Text,Words,Text2
0,Abdominal aortic aneurysm,an abdominal aortic aneurysm (aaa) is a swelli...,"[an, abdominal, aortic, aneurysm, aaa, is, a, ...",an abdominal aortic aneurysm aaa is a swelling...
1,Acne,acne is a common skin condition that affects m...,"[acne, is, a, common, skin, condition, that, a...",acne is a common skin condition that affects m...
2,Acute cholecystitis,acute cholecystitis is swelling (inflammation)...,"[acute, cholecystitis, is, swelling, inflammat...",acute cholecystitis is swelling inflammation o...
3,Acute lymphoblastic leukaemia,leukaemia is cancer of the white blood cells. ...,"[leukaemia, is, cancer, of, the, white, blood,...",leukaemia is cancer of the white blood cells a...
4,Acute lymphoblastic leukaemia: Children,acute lymphoblastic leukaemia (all) is a type ...,"[acute, lymphoblastic, leukaemia, all, is, a, ...",acute lymphoblastic leukaemia all is a type of...


# TfIdf

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Text2'])
vocabulary = tfidf_vectorizer.get_feature_names()
idf_scores = dict(zip(vocabulary, tfidf_vectorizer.idf_))

In [9]:
query_sentence = "what causes gallbladder cancer"

In [10]:
tfidf_matrix_query = tfidf_vectorizer.transform([query_sentence])

In [11]:
similarity_scores = cosine_similarity(tfidf_matrix_query, tfidf_matrix)
most_similar_index = np.argmax(similarity_scores)

In [12]:
data.iloc[most_similar_index]

Disease Name                                   Gallbladder cancer
Text            gallbladder cancer is very rare.there are a nu...
Words           [gallbladder, cancer, is, very, rare, there, a...
Text2           gallbladder cancer is very rare there are a nu...
Name: 129, dtype: object

In [13]:
def document_selection(query):
    tfidf_matrix_query = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(tfidf_matrix_query, tfidf_matrix)
    most_similar_index = np.argmax(similarity_scores)
    return most_similar_index

In [14]:
data.iloc[document_selection('what is heart attack')]

Disease Name                                         Heart attack
Text            you or someone else has symptoms like:a heart ...
Words           [you, or, someone, else, has, symptoms, like, ...
Text2           you or someone else has symptoms like a heart ...
Name: 149, dtype: object

### important sentences - tfidf

In [15]:
def textrank_extractive_summarization(query, paragraph, num_sentences=1):
    sentences = paragraph.split('.')
    sentences.append(query)
    if '' in sentences: sentences.remove('')
        
    vectorizer = CountVectorizer().fit_transform(sentences)
    sentence_embeddings = vectorizer.toarray()
    
    query = sentence_embeddings[-1]
    similarity = sentence_embeddings @ query
    
    ranked_sentences_idx = np.argsort(similarity)[::-1]
    summary_sentences = [sentences[i] for i in ranked_sentences_idx[:num_sentences]]
    
    return '. '.join(summary_sentences[1:])

query = "what are treatments of Abdominal aortic aneurysm?"
paragraph = data.Text.iloc[0]
important_sentence = textrank_extractive_summarization(query, paragraph, num_sentences=5)
print("Important sentence:", important_sentence)

Important sentence: an abdominal aortic aneurysm (aaa) occurs when part of the aorta wall becomes weakened and the large amount of blood that passes through it puts pressure on the weak spot, causing it to bulge outwards to form an aneurysm. an abdominal aortic aneurysm (aaa) is a swelling (aneurysm) of the aorta – the main blood vessel that leads away from the heart, down through the abdomen to the rest of the body. there are two surgical techniques used to treat a large aneurysm: endovascular surgery open surgeryalthough both techniques are equally effective at reducing the risk of the aneurysm bursting, each has its own advantages and disadvantages. if surgery is not advisable – or if you decide not to have it – there are a number of non-surgical treatments that can reduce the risk of an aneurysm rupturing


### important sentence - Glove

In [14]:
import spacy

In [457]:
nlp = spacy.load("en_core_web_md")

In [507]:
def embed_sentence(sentence):
    doc = nlp(sentence)
    return np.mean([token.vector for token in doc], axis=0)

def textrank_extractive_summarization2(query, paragraph, num_sentences=1):
    sentences = paragraph.split('.')
    if '' in sentences: sentences.remove('')
    
    sentence_embeddings = np.vstack([embed_sentence(sentence) for sentence in sentences])
    query = embed_sentence(query)
    similarity = sentence_embeddings @ query
    
    ranked_sentences_idx = np.argsort(similarity)[::-1]
    summary_sentences = [sentences[i] for i in ranked_sentences_idx[:num_sentences]]
    
    return '. '.join(summary_sentences)

query = "what are treatments of Abdominal aortic aneurysm?"
paragraph = data.Text.iloc[0]
important_sentence = textrank_extractive_summarization2(query, paragraph, num_sentences=5)
print("Important sentence:", important_sentence)

Important sentence: this is the safest of the two types of surgery available. this is because the risk of the aneurysm rupturing is usually greater than the risk of having it repaired. this type of graft is a tube made of a synthetic material. this reinforces the aorta, reducing the risk of it bursting. the bulging occurs when the wall of the aorta weakens


### important sentence using sentence Transformer

In [16]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [17]:
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

def textrank_extractive_summarization_with_bert(query, paragraph, num_sentences=1):
    sentences = paragraph.split('.')
    sentences.append(query)
    if '' in sentences: 
        sentences.remove('')
    
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    
    query_embedding = sentence_embeddings[-1]
    similarity = torch.cosine_similarity(sentence_embeddings, query_embedding, dim=1)
    
    ranked_sentences_idx = np.argsort(similarity.cpu().detach().numpy())[::-1]
    summary_sentences = [sentences[i] for i in ranked_sentences_idx[:num_sentences]]
    
    return '. '.join(summary_sentences[1:])

query = "what are treatments of Abdominal aortic aneurysm?"
paragraph = data.Text.iloc[0]
important_sentence = textrank_extractive_summarization_with_bert(query, paragraph, num_sentences=10)
print("Important sentence:", important_sentence)

Important sentence: treatment for an abdominal aortic aneurysm (aaa) depends on several factors, including the aneurysm’s size, your age and general health.  this will reduce the risk of developing an abdominal aortic aneurysm (aaa). see below for more information on treating small and medium aneurysms. an abdominal aortic aneurysm (aaa) usually causes no symptoms. the most common symptom of a ruptured aortic aneurysm is sudden and severe pain in the abdomen.  high blood pressure (hypertension) circulation foundation: abdominal aortic aneurysm. having a family history of aortic aneurysms also means that you have an increased risk of developing one yourself. in most cases, an unruptured abdominal aortic aneurysm (aaa) will cause no symptoms, unless it becomes particularly large. if your aortic aneurysm ruptures, you will feel a sudden and severe pain in the middle or side of your abdomen


# word2vec

In [296]:
from gensim.models import Word2Vec

In [297]:
model = Word2Vec(data.Words, vector_size=100, window=5, min_count=1, workers=4)
word_vectors = model.wv

In [369]:
def get_vector(model, words):
    vec = np.zeros(model.vector_size)
    count = 0
    for word in words:
        if word in word_vectors:
            vec += word_vectors[word]
            count += 1
    if count != 0:
        vec /= count
    return vec

In [370]:
document_vectors = [get_vector(model, doc) for doc in data.Words]

In [385]:
query = 'body ache'
query_vec = get_vector(model, word_tokenize(query))
similarities = [cosine_similarity([query_vec], [doc_vec])[0][0] for doc_vec in document_vectors]

In [386]:
data.iloc[np.argmax(similarities)]

Disease Name                                       Swollen glands
Text            swollen lymph glands are usually a sign of inf...
Words           [swollen, lymph, glands, are, usually, a, sign...
Text2           swollen lymph glands are usually a sign of inf...
Vectors         [-0.037451223, 0.097315915, 0.02943888, 0.0046...
Name: 294, dtype: object

# t5 model

In [18]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

In [19]:
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer_t5 = T5Tokenizer.from_pretrained("t5-small")

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]


KeyboardInterrupt



In [22]:
context = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity."
question = "who is Albert Einstein?"

In [23]:
generator_inputs = tokenizer_t5("generate answer: " + question + " context: " + context, return_tensors="pt")
generator_inputs

{'input_ids': tensor([[ 3806,  1525,    10,   113,    19, 11375,   890,  4008,    58,  2625,
            10, 11375,   890,  4008,    47,     3,     9,  2968,    18,  7473,
         13605,     3,  6941,     7,   447,   343,   113,  1597,     8,  4516,
            13, 13105,   485,     5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [24]:
tokenizer_t5.decode(generator_inputs['input_ids'][0])

'generate answer: who is Albert Einstein? context: Albert Einstein was a German-born theoretical physicist who developed the theory of relativity.</s>'

In [25]:
outputs = model_t5.generate(input_ids = generator_inputs["input_ids"],
                            attention_mask=generator_inputs["attention_mask"],
                            max_length=50)
outputs

tensor([[    0,     3,     9,  2968,    18,  7473, 13605,     3,  6941,     7,
           447,   343,     1]])

In [26]:
tokenizer_t5.decode(outputs[0])

'<pad> a German-born theoretical physicist</s>'

# roberta-qna

In [91]:
from transformers import pipeline
pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:01<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [93]:
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = pipe(QA_input)
res

{'score': 0.21171464025974274,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}

## ALL combined

In [20]:
query = "what causes yello fever"
doc_ind = document_selection(query)
summ = textrank_extractive_summarization(query, data.Text.iloc[doc_ind], 10)
data['Disease Name'].iloc[doc_ind]

'Yellow fever'

In [21]:
summ

'more about theyellow fever vaccinefind your nearest yellow fever vaccination centreif you have yellow fever, your clinical team will pass information about you on to the national congenital anomaly and rare diseases registration service (ncardrs). to help determine whether you have yellow fever, the doctor will want to know exactly where you have been travelling and what symptoms you have. the virus that causes yellow fever is passed to humans through the bites of infected mosquitoes. the yellow fever vaccine protects you from infection when you travel to a country where yellow fever virus occurs and prevents the disease from spreading between countries.  fitfortravel: yellow fever nathnac: find a yellow fever vaccination centre. between 1999 and 2018, there were 12 cases of yellow fever reported among european travellers. if you’re planning to visit places where yellow fever infection is found, you should seek travel health advice before you travel.  amap and list of countries where 

In [22]:
summ1 = textrank_extractive_summarization_with_bert(query, data.Text.iloc[doc_ind], 10)
summ1

'the symptoms of yellow fever occur in 2 stages. yellow fever is a serious viral infection that is spread by certain types of mosquito. yellow fever can’t be passed directly from person to person through close contact. the virus that causes yellow fever is passed to humans through the bites of infected mosquitoes. between 1999 and 2018, there were 12 cases of yellow fever reported among european travellers.  fitfortravel: yellow fever nathnac: find a yellow fever vaccination centre.  amap and list of countries where yellow fever is foundis available on the nhs fit for travel website. there is no specific treatment for yellow fever, but you will be monitored and the symptoms can be treated. the yellow fever vaccine protects you from infection when you travel to a country where yellow fever virus occurs and prevents the disease from spreading between countries'

In [129]:
generator_inputs = tokenizer_t5("question: "+query+" context: "+summ, return_tensors="pt")
outputs = model_t5.generate(input_ids = generator_inputs["input_ids"],
                            attention_mask=generator_inputs["attention_mask"],
                            max_length=50)
tokenizer_t5.decode(outputs[0])

'<pad> the virus that causes yellow fever is passed to humans through the bites of infected mosquitoes</s>'

In [130]:
QA_input = {
    'question': query,
    'context': summ
}
res = pipe(QA_input)
res

{'score': 7.3520065591026196e-09,
 'start': 460,
 'end': 488,
 'answer': 'bites of infected mosquitoes'}

## abstractive summarization

In [23]:
from transformers import pipeline

pipe = pipeline("summarization", model="Falconsai/medical_summarization")

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [25]:
pipe(summ1)

Your max_length is set to 200, but your input_length is only 180. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)


[{'summary_text': 'yellow fever is a serious viral infection that is spread by certain types of mosquito . yellow fever can’t be passed directly from person to person through close contact . the virus that causes yellow fever has been passed to humans through the bites of infected mosquitoes . between 1999 and 2018, there were 12 cases of yellow fever reported among european travellers . there is no specific treatment for yellow fever , but you will be monitored and the symptoms can be treated .'}]

In [27]:
query = "what are the treatments for brain tumours"
doc_ind = document_selection(query)
summ = textrank_extractive_summarization(query, data.Text.iloc[doc_ind], 10)
data['Disease Name'].iloc[doc_ind]

'Brain tumours'

In [28]:
summ1 = textrank_extractive_summarization_with_bert(query, data.Text.iloc[doc_ind], 10)
summ1

'the main treatment for most brain tumours is surgery, which aims to remove as much of the abnormal tissue as possible.  many others are diagnosed with secondary brain tumours. it’s not always possible to remove the entire tumour, so further treatment with radiotherapy and/orchemotherapymay be necessary to kill any abnormal cells left behind. if a tumour does come back, treatment will aim to relieve your symptoms and prolong life by controlling the growth of the tumour. a brain tumour is a growth of cells in the brain that multiplies in an abnormal, uncontrollable way. for most benign tumours, treatment is often successful and a full recovery is possible, although there’s sometimes a small chance the tumour could come back. more than 9,000 people are diagnosed with primary brain tumours in the uk each year, of which about half are benign and half are malignant. this page gives general information relevant to both types of brain tumour. speak to your gp if you have persistent symptoms o

In [29]:
query = "what are the treatments for brain tumours"
doc_ind = document_selection(query)
summ1 = textrank_extractive_summarization_with_bert(query, data.Text.iloc[doc_ind], 10)
pipe(summ1)

[{'summary_text': 'a brain tumour is a growth of cells in the brain that multiplies in an abnormal, uncontrollable way . for most benign tumours, treatment is often successful and a full recovery is possible , although there’s sometimes a small chance the tumour could come back . more than 9,000 people are diagnosed with primary brain tumours in the uk each year , of which about half are benign and half are malignant . this page gives general information relevant to both types of brain tumour .'}]