In [3]:
from TMfunctions import *
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
nltk.download('stopwords')
import re
import networkx
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from Bio import Entrez
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [5]:
from sklearn.model_selection import KFold

# Separe Titles from Abstracts

In [2]:
train = pd.read_csv('Data/No PreProcessing/train.csv', index_col = 0)
test = pd.read_csv('Data/No PreProcessing/test.csv', index_col = 0)
data = pd.concat([train, test])

In [3]:
def search(query):
    Entrez.email = 'd.giardini2@campus.unimib.it'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='1',
                            retmode='xml',
                            term=query,
                           field='title',
                           api_key='384b5a209a2bb99c452bd22b2f0b2add3f09')
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    ids = ','.join(id_list)
    Entrez.email = 'd.giardini2@campus.unimib.it'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [4]:
import pickle

In [19]:
errors = []
for index, row in tqdm(data.iterrows()):
    if index in list(diz.keys()):
        continue
    ids = search(row['medical_abstract'].split('.')[0])['IdList']
    if not ids:
        errors.append(index)
        continue
    result = fetch_details(ids)
    try:
        diz[index] = result['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
    except:
        errors.append(index)
        continue
    with open('Data/titles.pkl', 'wb') as f:
        pickle.dump(diz, f)
    time.sleep(0.2)

9445it [3:43:16,  1.42s/it]


In [49]:
with open('Data/errors.pkl', 'wb') as f:
        pickle.dump(errors, f)

In [48]:
len(errors)

71

## Build Dataset

In [50]:
train = pd.read_csv('Data/No PreProcessing/train.csv', index_col = 0)
test = pd.read_csv('Data/No PreProcessing/test.csv', index_col = 0)
data = pd.concat([train, test])

In [64]:
lst = []
for key,value in tqdm(titles.items()):
    diz = {'col_id' : key, 'title' : value}
    tit_abst = data.loc[key, 'medical_abstract']
    abst = tit_abst.split(value)[-1].strip()
    diz['medical_abstract'] = abst
    lst.append(diz)

100%|███████████████████████████████████████████████████████████████████████████| 9403/9403 [00:00<00:00, 48740.90it/s]


In [72]:
df = pd.DataFrame(lst)
df.index = df['col_id']
df = df[['title', 'medical_abstract']]
df.head()

Unnamed: 0_level_0,title,medical_abstract
col_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2839,Immunosuppressive acidic protein in malignant ...,The immunosuppressive acidic protein (IAP) has...
5670,Arterial mechanical properties in dilated card...,The effects of aging on arterial mechanical pr...
6062,Physiology of aging related to outcome in the ...,Thirty-nine patients with adult respiratory di...
5609,The long-distance effects of brain lesions: vi...,We describe several new possibilities for the ...
1924,Randomized phase II evaluation of carboplatin ...,A total of 83 patients with metastatic transit...


In [74]:
df.to_csv('Data/title&abstract.csv')

In [77]:
data = df

# Graph

In [75]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [79]:
stop_words = nltk.corpus.stopwords.words('english')
normalize_corpus = np.vectorize(normalize_document)

vectorizer = TfidfVectorizer(min_df=3, max_df=0.5)
vectorizer.fit(data['medical_abstract'])

TfidfVectorizer(max_df=0.5, min_df=3)

PageRank ranks each node based on a number of in-degree of that node, and inversely proportional to the out-degree of the other nodes that point to that node. In the proposed methodology, a network graph is built from Twitter where the user acts as a node and tweet-retweet relation as a directed edge. The user who retweet the tweet points to the original user who tweets. From the formed graph, each node’s PageRank is calculated as well as other node properties like centrality, degree, and followers, and average time retweeted. The result shows that the PageRank score of a node is directly proportional to closeness centrality and in-degree of the node. However, the ranking with PageRank, closeness centrality, and in-degree ranking yield different ranking results.

In [84]:
def summarize_doc(DOCUMENT, num_sentences=2):
    DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT)
    DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
    DOCUMENT = re.sub(r'\d+', ' ', DOCUMENT)
    pattern = r'\[[^\]]*\]'
    DOCUMENT = re.sub(pattern, ' ', DOCUMENT)
    DOCUMENT = DOCUMENT.strip()
    sentences = nltk.sent_tokenize(DOCUMENT)
    norm_sentences = normalize_corpus(sentences)
    norm_sentences[:3]
    dt_matrix = vectorizer.transform(norm_sentences)
    dt_matrix = dt_matrix.toarray()
    similarity_matrix = np.matmul(dt_matrix, dt_matrix.T)
    similarity_graph = networkx.from_numpy_array(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)
    ranked_sentences = sorted(((score, index) for index, score  in scores.items()), reverse=True)
    try:
        top_sentence_indices = [ranked_sentences[index][1] for index in range(num_sentences)]
    except:
        top_sentence_indices = ranked_sentences[0][1]
        return ' '.join(np.array(sentences)[top_sentence_indices])
    top_sentence_indices.sort()
    return ' '.join(np.array(sentences)[top_sentence_indices])

In [85]:
data['summary'] = data['medical_abstract'].apply(summarize_doc)

In [87]:
pd.set_option('display.max_colwidth', None)
data[['medical_abstract', 'summary', 'title']]

Unnamed: 0_level_0,medical_abstract,summary,title
col_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2839,"The immunosuppressive acidic protein (IAP) has been described as a tumor associated marker in some solid tumors and hematologic diseases. To define the clinical relevance the authors determined the serum IAP levels in 194 patients with malignancies before initiation of therapy, 14 patients with idiopathic thrombocytopenic purpura (ITP), 28 patients with bacterial pneumonia, and 23 healthy volunteers. Immunosuppressive acidic protein was measured by radial immunodiffusion. The mean value of our controls was 405 +/- 48 micrograms/ml. This is consistent with published data. The mean values for patients with malignancies varied from 554 micrograms/ml to 698 micrograms/ml. These are only marginally higher than those observed for the controls. In contrast patients with bacterial pneumonia demonstrated significantly elevated values (1038 +/- 261 micrograms/ml). The authors conclude that IAP cannot be used as a diagnostic marker for the malignant diseases examined in this study.",The immunosuppressive acidic protein (IAP) has been described as a tumor associated marker in some solid tumors and hematologic diseases. The mean values for patients with malignancies varied from micrograms/ml to micrograms/ml.,Immunosuppressive acidic protein in malignant diseases. Clinical relevance?
5670,"The effects of aging on arterial mechanical properties and the response to nitroprusside were examined in 25 patients with dilated cardiomyopathy. High-fidelity pressures were recorded with a multisensor catheter. Pulse wave velocity was determined between two sensors in the thoracic aorta. Arterial compliance was determined by an analysis of the diastolic waveform and cardiac output. At baseline, despite a similar systemic vascular resistance, the pulsatile load (e.g., arterial compliance) and wave transmission characteristics (e.g., pulse wave velocity) were altered with aging. Arterial compliance was reduced in older (greater than 50 yr, n = 8) versus younger (less than 35 yr, n = 8) patients (0.51 +/- 0.17 vs. 1.33 +/- 0.63 ml/mmHg, P less than 0.01) and intermediate in those 35-50 yr of age (n = 9, 0.72 +/- 0.40 ml/mmHg). There was a positive correlation between age and pulse wave velocity (r = +0.90). Nitroprusside infusion decreased resistance, increased arterial compliance, and lowered pulse wave velocity in all groups. Yet, advancing age was associated with a greater fall in wave velocity for a given fall in aortic pressure. The slope (K) of the relation between pulse wave velocity and aortic diastolic pressure progressively increased with age (0.01 +/- 0.03, 0.06 +/- 0.02, and 0.09 +/- 0.03 m/s-mmHg). Multiple linear regression analysis revealed a significant relation between K and age. These data demonstrate that in older patients with dilated cardiomyopathy the left ventricle is coupled to an arterial circulation that has a greater pulsatile load, despite a similar steady load. Furthermore, these age-related changes in the arterial system affect the hemodynamic response to pharmacologically-induced vasodilatation.","There was a positive correlation between age and pulse wave velocity (r = + . Nitroprusside infusion decreased resistance, increased arterial compliance, and lowered pulse wave velocity in all groups.",Arterial mechanical properties in dilated cardiomyopathy. Aging and the response to nitroprusside.
6062,"Thirty-nine patients with adult respiratory distress syndrome (ARDS) were enrolled in a study to identify potential age-related changes in organ system function that may help explain the apparent association between age and poor outcome in these patients. Criteria for enrollment included an arterial PO2-to-inspired O2 concentration ratio less than or equal to 200 in a clinical setting consistent with ARDS. Patients were excluded if they were less than 18 yr old, had clinical manifestations of congestive heart failure, were seropositive for the human immunodeficiency virus, or had stage II metastatic lung cancer. Patients were divided into two groups: those less than 60 yr old (mean 42 +/- 3 yr, n = 17) and those greater than or equal to 60 yr old (73 +/- 2 yr, n = 16). A group of six patients was analyzed as a separate subset based on a body temperature less than or equal to 97.5 degrees F at enrollment (hypothermic patients, 73 +/- 4 yr old). Sepsis was present in 67% of the nonhypothermic patients and in all the hypothermic patients. Mortality rates were 12% in the patients less than 60 yr and 69% in the nonhypothermic patients greater than or equal to 60 yr. All the hypothermic patients died. Sequential data obtained over 6 days were compared within and between groups. The following results were obtained. 1) The ratio of arterial PO2 to inspired O2 fraction was greater and the positive end-expiratory pressure used was significantly less in the patients greater than or equal to 60 yr old compared with the younger group.","Patients were divided into two groups: those less than yr old (mean +/- yr, n = ) and those greater than or equal to yr old ( +/- yr, n = ). Mortality rates were % in the patients less than yr and % in the nonhypothermic patients greater than or equal to yr. All the hypothermic patients died.",Physiology of aging related to outcome in the adult respiratory distress syndrome.
5609,"We describe several new possibilities for the study of degenerated myelinated tracts in the human central nervous system (CNS). The methods are based on the visualization of myelin breakdown products that show birefringence in polarized light and, when stained with Nile blue and benzpyrene-3,4, exhibit fluorescence. Even after lengthy formalin fixation, the methods permit the localization of anterogradely degenerated tracts in a variety of fiber systems in the brains of patients who died between five and 20 months after the onset of neurological symptoms. Particularly the polarizing technique, because of its simplicity, can be added to the usual neuropathological methods for demonstrating the long-distance effects of a brain lesion. As research tools, these methods would also aid in the study of the anatomical substrate of human neurological symptomatology.","Even after lengthy formalin fixation, the methods permit the localization of anterogradely degenerated tracts in a variety of fiber systems in the brains of patients who died between five and months after the onset of neurological symptoms. As research tools, these methods would also aid in the study of the anatomical substrate of human neurological symptomatology.",The long-distance effects of brain lesions: visualization of myelinated pathways in the human brain using polarizing and fluorescence microscopy.
1924,"A total of 83 patients with metastatic transitional cell carcinoma who had previously received no systemic therapy entered a randomized phase II evaluation of carboplatin and cis-dichloro-transdihydroxy-bis-isopropylamine platinum IV (CHIP), administered respectively at 400 and 270 mg./m.2 every 28 days. Among evaluable patients with measurable disease response rates were 3 of 22 (14%, 95% confidence interval 5 to 35%) for carboplatin and 4 of 25 (16%, 95% confidence interval 5 to 36%) for CHIP. Among 17 patients with evaluable but not measurable metastases (10 carboplatin and 7 CHIP recipients) there were no responses. Median survival for 64 evaluable patients was 4.8 months (5.0 months for carboplatin and 4.3 months for CHIP recipients). Independent factors prognostic for survival (p less than 0.01) were performance status (0 or 1 versus 2 or 3), liver metastases, prior radiation therapy and recent weight loss (p = 0.02). Multivariate analysis confirmed that a performance status of 2 or 3 and liver metastases were predictive of shorter survival. A total of 31% of the patients treated with carboplatin and 34% of those who received CHIP experienced severe or life-threatening myelosuppression. While the response rates with carboplatin and CHIP are modest, we believe that the characteristics of these agents indicate that they should be evaluated further.","Among evaluable patients with measurable disease response rates were of ( %, % confidence interval to %) for carboplatin and of ( %, % confidence interval to %) for CHIP. Among patients with evaluable but not measurable metastases ( carboplatin and CHIP recipients) there were no responses.",Randomized phase II evaluation of carboplatin and CHIP in advanced transitional cell carcinoma of the urothelium. The Eastern Cooperative Oncology Group.
...,...,...,...
3125,Postoperative sore throat: topical hydrocortisone Forty patients undergoing tracheal intubation and controlled ventilation of the lungs for elective surgical procedures were studied. They were allocated randomly into one of two groups. The tracheal tubes used for group A patients were lubricated before insertion with water-soluble 1% hydrocortisone cream. Those for group B patients were lubricated with KY jelly. The incidence of postoperative sore throat was found to be significantly greater in group A. Topical 1% hydrocortisone cream is therefore ineffective in the prevention of postoperative sore throat.,Postoperative sore throat: topical hydrocortisone Forty patients undergoing tracheal intubation and controlled ventilation of the lungs for elective surgical procedures were studied. The incidence of postoperative sore throat was found to be significantly greater in group A. Topical % hydrocortisone cream is therefore ineffective in the prevention of postoperative sore throat.,Postoperative sore throat: topical hydrocortisone.
6430,"Cancer mortality in workers exposed to 2,3,7,8-tetrachlorodibenzo-p-dioxin BACKGROUND. In both animal and epidemiologic studies, exposure to dioxin (2,3,7,8-tetrachlorodibenzo-p-dioxin, or TCDD) has been associated with an increased risk of cancer. METHODS. We conducted a retrospective cohort study of mortality among the 5172 workers at 12 plants in the United States that produced chemicals contaminated with TCDD. Occupational exposure was documented by reviewing job descriptions and by measuring TCDD in serum from a sample of 253 workers. Causes of death were taken from death certificates. RESULTS. Mortality from several cancers previously associated with TCDD (stomach, liver, and nasal cancers, Hodgkin's disease, and non-Hodgkin's lymphoma) was not significantly elevated in this cohort. Mortality from soft-tissue sarcoma was increased, but not significantly (4 deaths; standardized mortality ratio [SMR], 338; 95 percent confidence interval, 92 to 865). In the subcohort of 1520 workers with greater than or equal to 1 year of exposure and greater than or equal to 20 years of latency, however, mortality was significantly increased for soft-tissue sarcoma (3 deaths; SMR, 922; 95 percent confidence interval, 190 to 2695) and for cancers of the respiratory system (SMR, 142; 95 percent confidence interval, 103 to 192). Mortality from all cancers combined was slightly but significantly elevated in the overall cohort (SMR, 115; 95 percent confidence interval, 102 to 130) and was higher in the subcohort with greater than or equal to 1 year of exposure and greater than or equal to 20 years of latency (SMR, 146; 95 percent confidence interval, 121 to 176). CONCLUSIONS. This study of mortality among workers with occupational exposure to TCDD does not confirm the high relative risks reported for many cancers in previous studies. Conclusions about an increase in the risk of soft-tissue sarcoma are limited by small numbers and misclassification on death certificates. Excess mortality from all cancers combined, cancers of the respiratory tract, and soft-tissue sarcoma may result from exposure to TCDD, although we cannot exclude the possible contribution of factors such as smoking and occupational exposure to other chemicals.","In the subcohort of workers with greater than or equal to year of exposure and greater than or equal to years of latency, however, mortality was significantly increased for soft-tissue sarcoma ( deaths; SMR, ; percent confidence interval, to ) and for cancers of the respiratory system (SMR, ; percent confidence interval, to ). This study of mortality among workers with occupational exposure to TCDD does not confirm the high relative risks reported for many cancers in previous studies.",Mortality from bladder cancer in dyestuff workers exposed to aromatic amines: A 73-year follow-up.
4094,"Reconstruction after mastectomy. Advances in materials and techniques, especially those involving transposition of muscle and skin flaps, have made breast reconstruction possible for most women who undergo mastectomy for breast cancer. The availability of this option can alleviate the breast and chest wall deformity that results from virtually all local treatment of breast cancer. It is essential that the reconstruction surgeon be part of the breast cancer management team from the beginning of treatment planning and that this surgeon work closely with the general surgeon, medical oncologist, and radiation therapist as well as the adjunctive treatment team members. The patient's clinical status and the type of local treatment will be significant determinants of the reconstructive options. For women with stage I breast cancer, these decisions may be based largely on the oncologist's local and adjunctive therapy procedures and the woman's desire to proceed or delay. For women with systemic disease, all members of the breast management team may need to agree on the advisability and timing of reconstruction. Central to all of the numerous decisions described in this paper regarding the timing, type, and extent of breast reconstruction is the primary goal of the entire team: the best possible management of the breast cancer itself. The promise of attractive, symmetric, and natural appearing breasts, complete with a symmetric nipple-areolar complex, has eased somewhat the diminishment of self-esteem and the threat to femininity that can accompany the loss of a breast. By lowering fear, the widely recognized availability of breast reconstruction may encourage more women to monitor their breasts and seek diagnosis of changes and may influence selection of the type of local treatment if cancer is detected. Because of the psychological and cultural significance of the breast, the reconstructive surgeon must be particularly sensitive to the psychological and aesthetic expectations of the patient. Even in those patients with metastases and limited life expectancy, breast reconstruction can enhance the quality of life.","Advances in materials and techniques, especially those involving transposition of muscle and skin flaps, have made breast reconstruction possible for most women who undergo mastectomy for breast cancer. Central to all of the numerous decisions described in this paper regarding the timing, type, and extent of breast reconstruction is the primary goal of the entire team: the best possible management of the breast cancer itself.",Advances in breast reconstruction after mastectomy.
6792,"Headache: a marker of depression Patients who presented with a chief complaint of headache in the outpatient family practice setting were found to have a high prevalence of depression (63%) by the Zung Self-Rating Depression Scale (SDS) index. A statistically significant relationship was found between the frequency of headaches (P = .03) with level of depression. In fact, 74% of patients with headaches recurring almost every day had a clinically significant depression diagnosed as defined by the Zung SDS score. The Zung SDS score also correlated with the length of time that the problem of headache existed (P less than .05). Item analysis of the individual 20-item depression score revealed that four questions accounted for 93% of the variance. This analysis suggests that shorter, more abbreviated screening questions could be developed and refined in the future for use by the busy clinician. Headache is an important marker for depression in the primary care setting. It can be inferred from this study that the clinician may need to focus more on treating the entity of depression than on treating just the symptom of headache.",Headache: a marker of depression Patients who presented with a chief complaint of headache in the outpatient family practice setting were found to have a high prevalence of depression ( %) by the Zung Self-Rating Depression Scale (SDS) index. Headache is an important marker for depression in the primary care setting.,Headache: a marker of depression.


In [89]:
data.to_csv('Data/graphSumm.csv')

In [18]:
df = pd.read_csv('Data/graphSumm.csv')

In [20]:
df[df['summary'] == '']

Unnamed: 0,col_id,title,medical_abstract,summary


## Graph Evaluation

In [95]:
#from evaluate import load
# Load the ROUGE metric
import evaluate
rouge = evaluate.load('rouge')
candidates = list(data['summary'])

references = list(data['title'])
results = rouge.compute(predictions=candidates, references=references)
print(results)

{'rouge1': 0.21552359816270705, 'rouge2': 0.09005551600391185, 'rougeL': 0.17562077093867032, 'rougeLsum': 0.17561396384656563}


# LSA

In [5]:
data = pd.read_csv('Data/title&abstract.csv', index_col = 0)

In [26]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Apply preProcessing
def preprocess(text):
    # Remove punctuation and other non-alphanumeric characters
    text =  re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize the text into words
    words = word_tokenize(text.lower())
    # Remove stop words
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

for document in data['medical_abstract']:
    no_stop += [preprocess(x) for x in tokenizer.tokenize(document)]

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(no_stop)
X = tfidf_matrix.toarray()

In [30]:
X.shape

(72035, 31816)

In [31]:
Xt = np.transpose(X)

In [32]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(Xt)

len(svd_model.components_)

20

In [33]:
terms = tfidf.get_feature_names_out()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    string = "Topic "+str(i)+": "
    for t in sorted_terms:
        string += t[0].strip()
        string += ", "
    print(string)

Topic 0: hindquarters, cephamycins, stabilisations, nonmedical, lobes, impulse, henseleit, 
Topic 1: baroreceptors, strangulating, photosensitive, italy, angiotoxic, csmc, dermatographism, 
Topic 2: maltreated, pustules, restudied, met, ndga, hindquarters, ending, 
Topic 3: capsulothalamolenticular, meticulous, transposed, unspecified, flareups, strata, strategic, 
Topic 4: capsulothalamolenticular, meticulous, discusses, poliovirus, analgesics, slum, polyreactive, 
Topic 5: avenue, chylomicron, depiction, dissatisfied, engaged, evolving, graves, 
Topic 6: routine, overdrive, hms, csm, neovascularity, preferred, massive, 
Topic 7: doubles, repermeation, contamination, resorting, directors, persistently, granules, 
Topic 8: textural, italy, complainers, nebivolol, crescentic, rflps, turbid, 
Topic 9: ceramic, negotiate, posteriorly, adrenoreceptor, nephrogenesis, cryoglobulinemia, roseola, 
Topic 10: lacunae, leukomalacia, feeders, implied, stilbestrol, chaperone, thyronine, 
Topic 11: 

In [34]:
# Matrice V trasposta
Vt = svd_model.components_

In [35]:
Vt.shape

(20, 72035)

In [36]:
U = svd_model.fit_transform(Xt)

In [37]:
U.shape

(31816, 20)

In [38]:
sigma = np.diag(svd_model.singular_values_)

In [39]:
D = np.matmul(sigma, Vt)

In [40]:
D.shape

(20, 72035)

In [41]:
import pandas as pd

In [42]:
weights = np.power(pd.DataFrame(np.power(D,2)).sum(axis = 'rows'), 1/2)
weights

0        0.134329
1        0.242585
2        0.034525
3        0.277814
4        0.057175
           ...   
72030    0.101549
72031    0.157085
72032    0.182703
72033    0.040380
72034    0.047526
Length: 72035, dtype: float64

In [43]:
sentences = []
for document in data['medical_abstract']:
    sentences += [x for x in tokenizer.tokenize(document)]

In [45]:
df_weights = pd.DataFrame(weights)
df_weights.index = sentences

In [51]:
doc_sent = []
for document in data['medical_abstract']:
    doc_sent.append([x for x in tokenizer.tokenize(document)])

In [53]:
summaries = []
for document in doc_sent:
    best = list(df_weights.loc[document].sort_values(0, ascending = False).iloc[:2].index)
    summary = ' '.join([x for x in document if x in best])
    summaries.append(summary)

In [54]:
summary_df = data.copy()
summary_df['summary'] = summaries

In [55]:
pd.set_option('display.max_colwidth', None)
summary_df.head()

Unnamed: 0_level_0,title,medical_abstract,summary
col_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2839,Immunosuppressive acidic protein in malignant diseases. Clinical relevance?,"The immunosuppressive acidic protein (IAP) has been described as a tumor associated marker in some solid tumors and hematologic diseases. To define the clinical relevance the authors determined the serum IAP levels in 194 patients with malignancies before initiation of therapy, 14 patients with idiopathic thrombocytopenic purpura (ITP), 28 patients with bacterial pneumonia, and 23 healthy volunteers. Immunosuppressive acidic protein was measured by radial immunodiffusion. The mean value of our controls was 405 +/- 48 micrograms/ml. This is consistent with published data. The mean values for patients with malignancies varied from 554 micrograms/ml to 698 micrograms/ml. These are only marginally higher than those observed for the controls. In contrast patients with bacterial pneumonia demonstrated significantly elevated values (1038 +/- 261 micrograms/ml). The authors conclude that IAP cannot be used as a diagnostic marker for the malignant diseases examined in this study.","To define the clinical relevance the authors determined the serum IAP levels in 194 patients with malignancies before initiation of therapy, 14 patients with idiopathic thrombocytopenic purpura (ITP), 28 patients with bacterial pneumonia, and 23 healthy volunteers. The mean value of our controls was 405 +/- 48 micrograms/ml."
5670,Arterial mechanical properties in dilated cardiomyopathy. Aging and the response to nitroprusside.,"The effects of aging on arterial mechanical properties and the response to nitroprusside were examined in 25 patients with dilated cardiomyopathy. High-fidelity pressures were recorded with a multisensor catheter. Pulse wave velocity was determined between two sensors in the thoracic aorta. Arterial compliance was determined by an analysis of the diastolic waveform and cardiac output. At baseline, despite a similar systemic vascular resistance, the pulsatile load (e.g., arterial compliance) and wave transmission characteristics (e.g., pulse wave velocity) were altered with aging. Arterial compliance was reduced in older (greater than 50 yr, n = 8) versus younger (less than 35 yr, n = 8) patients (0.51 +/- 0.17 vs. 1.33 +/- 0.63 ml/mmHg, P less than 0.01) and intermediate in those 35-50 yr of age (n = 9, 0.72 +/- 0.40 ml/mmHg). There was a positive correlation between age and pulse wave velocity (r = +0.90). Nitroprusside infusion decreased resistance, increased arterial compliance, and lowered pulse wave velocity in all groups. Yet, advancing age was associated with a greater fall in wave velocity for a given fall in aortic pressure. The slope (K) of the relation between pulse wave velocity and aortic diastolic pressure progressively increased with age (0.01 +/- 0.03, 0.06 +/- 0.02, and 0.09 +/- 0.03 m/s-mmHg). Multiple linear regression analysis revealed a significant relation between K and age. These data demonstrate that in older patients with dilated cardiomyopathy the left ventricle is coupled to an arterial circulation that has a greater pulsatile load, despite a similar steady load. Furthermore, these age-related changes in the arterial system affect the hemodynamic response to pharmacologically-induced vasodilatation.","Arterial compliance was reduced in older (greater than 50 yr, n = 8) versus younger (less than 35 yr, n = 8) patients (0.51 +/- 0.17 vs. 1.33 +/- 0.63 ml/mmHg, P less than 0.01) and intermediate in those 35-50 yr of age (n = 9, 0.72 +/- 0.40 ml/mmHg). Multiple linear regression analysis revealed a significant relation between K and age."
6062,Physiology of aging related to outcome in the adult respiratory distress syndrome.,"Thirty-nine patients with adult respiratory distress syndrome (ARDS) were enrolled in a study to identify potential age-related changes in organ system function that may help explain the apparent association between age and poor outcome in these patients. Criteria for enrollment included an arterial PO2-to-inspired O2 concentration ratio less than or equal to 200 in a clinical setting consistent with ARDS. Patients were excluded if they were less than 18 yr old, had clinical manifestations of congestive heart failure, were seropositive for the human immunodeficiency virus, or had stage II metastatic lung cancer. Patients were divided into two groups: those less than 60 yr old (mean 42 +/- 3 yr, n = 17) and those greater than or equal to 60 yr old (73 +/- 2 yr, n = 16). A group of six patients was analyzed as a separate subset based on a body temperature less than or equal to 97.5 degrees F at enrollment (hypothermic patients, 73 +/- 4 yr old). Sepsis was present in 67% of the nonhypothermic patients and in all the hypothermic patients. Mortality rates were 12% in the patients less than 60 yr and 69% in the nonhypothermic patients greater than or equal to 60 yr. All the hypothermic patients died. Sequential data obtained over 6 days were compared within and between groups. The following results were obtained. 1) The ratio of arterial PO2 to inspired O2 fraction was greater and the positive end-expiratory pressure used was significantly less in the patients greater than or equal to 60 yr old compared with the younger group.",Mortality rates were 12% in the patients less than 60 yr and 69% in the nonhypothermic patients greater than or equal to 60 yr. All the hypothermic patients died. The following results were obtained.
5609,The long-distance effects of brain lesions: visualization of myelinated pathways in the human brain using polarizing and fluorescence microscopy.,"We describe several new possibilities for the study of degenerated myelinated tracts in the human central nervous system (CNS). The methods are based on the visualization of myelin breakdown products that show birefringence in polarized light and, when stained with Nile blue and benzpyrene-3,4, exhibit fluorescence. Even after lengthy formalin fixation, the methods permit the localization of anterogradely degenerated tracts in a variety of fiber systems in the brains of patients who died between five and 20 months after the onset of neurological symptoms. Particularly the polarizing technique, because of its simplicity, can be added to the usual neuropathological methods for demonstrating the long-distance effects of a brain lesion. As research tools, these methods would also aid in the study of the anatomical substrate of human neurological symptomatology.","Even after lengthy formalin fixation, the methods permit the localization of anterogradely degenerated tracts in a variety of fiber systems in the brains of patients who died between five and 20 months after the onset of neurological symptoms. As research tools, these methods would also aid in the study of the anatomical substrate of human neurological symptomatology."
1924,Randomized phase II evaluation of carboplatin and CHIP in advanced transitional cell carcinoma of the urothelium. The Eastern Cooperative Oncology Group.,"A total of 83 patients with metastatic transitional cell carcinoma who had previously received no systemic therapy entered a randomized phase II evaluation of carboplatin and cis-dichloro-transdihydroxy-bis-isopropylamine platinum IV (CHIP), administered respectively at 400 and 270 mg./m.2 every 28 days. Among evaluable patients with measurable disease response rates were 3 of 22 (14%, 95% confidence interval 5 to 35%) for carboplatin and 4 of 25 (16%, 95% confidence interval 5 to 36%) for CHIP. Among 17 patients with evaluable but not measurable metastases (10 carboplatin and 7 CHIP recipients) there were no responses. Median survival for 64 evaluable patients was 4.8 months (5.0 months for carboplatin and 4.3 months for CHIP recipients). Independent factors prognostic for survival (p less than 0.01) were performance status (0 or 1 versus 2 or 3), liver metastases, prior radiation therapy and recent weight loss (p = 0.02). Multivariate analysis confirmed that a performance status of 2 or 3 and liver metastases were predictive of shorter survival. A total of 31% of the patients treated with carboplatin and 34% of those who received CHIP experienced severe or life-threatening myelosuppression. While the response rates with carboplatin and CHIP are modest, we believe that the characteristics of these agents indicate that they should be evaluated further.","Median survival for 64 evaluable patients was 4.8 months (5.0 months for carboplatin and 4.3 months for CHIP recipients). Independent factors prognostic for survival (p less than 0.01) were performance status (0 or 1 versus 2 or 3), liver metastases, prior radiation therapy and recent weight loss (p = 0.02)."


In [56]:
summary_df.to_csv('Data/LSASumm.csv')

## LSA Evaluation

In [97]:
data = pd.read_csv('Data/LSASumm.csv')

In [98]:
#from evaluate import load
# Load the ROUGE metric
import evaluate
rouge = evaluate.load('rouge')
candidates = list(data['summary'])

references = list(data['title'])
results = rouge.compute(predictions=candidates, references=references)
print(results)

{'rouge1': 0.17958395781861622, 'rouge2': 0.06491358027816224, 'rougeL': 0.14147401955939062, 'rougeLsum': 0.14156975537796002}


# Casual Dataset

In [22]:
data = pd.read_csv('Data/title&abstract.csv', index_col = 0)

In [23]:
import random

In [24]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

rnd = []
for document in data['medical_abstract']:
    sentences = tokenizer.tokenize(document)
    if len(sentences) > 2:
        rnd.append(' '.join(random.sample(sentences, 2)))
    else:
        rnd.append(' '.join(sentences))

In [25]:
data['random_summary'] = rnd

## Random Evaluation

In [27]:
import evaluate
rouge = evaluate.load('rouge')
candidates = list(data['random_summary'])

references = list(data['title'])
results = rouge.compute(predictions=candidates, references=references)
print(results)

{'rouge1': 0.19889511104216673, 'rouge2': 0.07535874210351032, 'rougeL': 0.15696185120724238, 'rougeLsum': 0.15697829987334586}


In [28]:
data

Unnamed: 0_level_0,title,medical_abstract,random_summary
col_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2839,Immunosuppressive acidic protein in malignant ...,The immunosuppressive acidic protein (IAP) has...,These are only marginally higher than those ob...
5670,Arterial mechanical properties in dilated card...,The effects of aging on arterial mechanical pr...,"Yet, advancing age was associated with a great..."
6062,Physiology of aging related to outcome in the ...,Thirty-nine patients with adult respiratory di...,Criteria for enrollment included an arterial P...
5609,The long-distance effects of brain lesions: vi...,We describe several new possibilities for the ...,The methods are based on the visualization of ...
1924,Randomized phase II evaluation of carboplatin ...,A total of 83 patients with metastatic transit...,Independent factors prognostic for survival (p...
...,...,...,...
3125,Postoperative sore throat: topical hydrocortis...,Postoperative sore throat: topical hydrocortis...,They were allocated randomly into one of two g...
6430,Mortality from bladder cancer in dyestuff work...,"Cancer mortality in workers exposed to 2,3,7,8...",In the subcohort of 1520 workers with greater ...
4094,Advances in breast reconstruction after mastec...,Reconstruction after mastectomy. Advances in m...,Because of the psychological and cultural sign...
6792,Headache: a marker of depression.,Headache: a marker of depression Patients who ...,It can be inferred from this study that the cl...


In [30]:
graph_df = pd.read_csv('Data/GraphSumm.csv', index_col = 0)

In [33]:
data = data.join(graph_df['summary'], how='left', on=None, sort=False)

In [36]:
data = data.rename(columns={"summary": "graph_summary"})

In [37]:
lsa_df = pd.read_csv('Data/LSASumm.csv', index_col = 0)
data = data.join(lsa_df['summary'], how='left', on=None, sort=False)
data = data.rename(columns={"summary": "lsa_summary"})

In [38]:
data.to_csv('Data/summaries.csv')

# Evaluation with Classification

In [5]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('Pretrained Models/PubMed-and-PMC-w2v.bin', binary=True)

  "class": algorithms.Blowfish,


In [6]:
df_summaries = pd.read_csv('Data/summaries.csv', index_col = 0)

In [14]:
train = pd.read_csv('Data/No PreProcessing/train.csv', index_col = 0)
test = pd.read_csv('Data/No PreProcessing/test.csv', index_col = 0)
data = pd.concat([train,test])

In [17]:
data = pd.merge(data[['1','2','3','4','5']], df_summaries, left_index=True, right_index=True)
#data = df_summaries.join(data[['1','2','3','4','5']], how='left', on=None, sort=False)
data.head()

Unnamed: 0,1,2,3,4,5,title,medical_abstract,random_summary,graph_summary,lsa_summary
2839,1,0,0,0,0,Immunosuppressive acidic protein in malignant ...,The immunosuppressive acidic protein (IAP) has...,These are only marginally higher than those ob...,The immunosuppressive acidic protein (IAP) has...,To define the clinical relevance the authors d...
5670,0,0,0,1,0,Arterial mechanical properties in dilated card...,The effects of aging on arterial mechanical pr...,"Yet, advancing age was associated with a great...",There was a positive correlation between age a...,Arterial compliance was reduced in older (grea...
6062,0,0,0,0,1,Physiology of aging related to outcome in the ...,Thirty-nine patients with adult respiratory di...,Criteria for enrollment included an arterial P...,Patients were divided into two groups: those l...,Mortality rates were 12% in the patients less ...
5609,0,0,1,0,1,The long-distance effects of brain lesions: vi...,We describe several new possibilities for the ...,The methods are based on the visualization of ...,"Even after lengthy formalin fixation, the meth...","Even after lengthy formalin fixation, the meth..."
1924,1,0,0,0,0,Randomized phase II evaluation of carboplatin ...,A total of 83 patients with metastatic transit...,Independent factors prognostic for survival (p...,Among evaluable patients with measurable disea...,Median survival for 64 evaluable patients was ...


In [19]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [model[word] for word in words if word in model]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [20]:
# Preprocess summary columns
def preprocess(text):
    # Remove punctuation and other non-alphanumeric characters
    text =  re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize the text into words
    words = word_tokenize(text.lower())
    # Remove stop words
    words = [word for word in words if word not in stopwords.words('english')]
    # Join the words back into a string
    return ' '.join(words)

data['random_summary'] = data['random_summary'].apply(lambda x: preprocess(x))
data['graph_summary'] = data['graph_summary'].apply(lambda x: preprocess(x))
data['lsa_summary'] = data['lsa_summary'].apply(lambda x: preprocess(x))

In [22]:
data = data.drop(data[data['random_summary'] == ''].index)

In [23]:
data.to_csv('Data/summariesPreprocessed.csv')

In [24]:
kf = KFold(n_splits=5)

fold = 0
Kresults = []

for train_index, test_index in kf.split(data):
    fold += 1
    train = data.iloc[train_index]
    test = data.iloc[test_index]
    
    FE_train = pd.DataFrame(np.array([vectorize(doc) for doc in train['random_summary']]))
    FE_test = pd.DataFrame(np.array([vectorize(doc) for doc in test['random_summary']]))
    
    Kresult = micro_f1SVM(build_resultsSVM(FE_train, FE_test, train, test))
    Kresults.append(Kresult)
    print("fold " + str(fold) + ":")
    print(Kresult)
    print(" ")
    
    DT, RF, NB, SVM = 0, 0, 0, 0
    for result in Kresults:
        DT += result['DT']
        RF += result['RF']
        NB += result['NB']
        SVM += result['SVM']
    result = {'DT': DT/5, 'RF': RF/5, 'NB': NB/5, 'SVM': SVM/5}
result

fold 1:
{'DT': 0.451, 'RF': 0.516, 'NB': 0.626, 'SVM': 0.638}
 
fold 2:
{'DT': 0.476, 'RF': 0.517, 'NB': 0.604, 'SVM': 0.621}
 
fold 3:
{'DT': 0.478, 'RF': 0.511, 'NB': 0.614, 'SVM': 0.635}
 
fold 4:
{'DT': 0.472, 'RF': 0.515, 'NB': 0.629, 'SVM': 0.638}
 
fold 5:
{'DT': 0.472, 'RF': 0.505, 'NB': 0.625, 'SVM': 0.632}
 


{'DT': 0.46980000000000005,
 'RF': 0.5128,
 'NB': 0.6195999999999999,
 'SVM': 0.6328}

In [27]:
kf = KFold(n_splits=5)

fold = 0
Kresults = []

for train_index, test_index in kf.split(data):
    fold += 1
    train = data.iloc[train_index]
    test = data.iloc[test_index]
    
    FE_train = pd.DataFrame(np.array([vectorize(doc) for doc in train['graph_summary']]))
    FE_test = pd.DataFrame(np.array([vectorize(doc) for doc in test['graph_summary']]))
    
    Kresult = micro_f1SVM(build_resultsSVM(FE_train, FE_test, train, test))
    Kresults.append(Kresult)
    print("fold " + str(fold) + ":")
    print(Kresult)
    print(" ")
    
    DT, RF, NB, SVM = 0, 0, 0, 0
    for result in Kresults:
        DT += result['DT']
        RF += result['RF']
        NB += result['NB']
        SVM += result['SVM']
    result = {'DT': DT/5, 'RF': RF/5, 'NB': NB/5, 'SVM': SVM/5}
result

fold 1:
{'DT': 0.495, 'RF': 0.528, 'NB': 0.615, 'SVM': 0.651}
 
fold 2:
{'DT': 0.484, 'RF': 0.523, 'NB': 0.609, 'SVM': 0.647}
 
fold 3:
{'DT': 0.499, 'RF': 0.538, 'NB': 0.621, 'SVM': 0.653}
 
fold 4:
{'DT': 0.491, 'RF': 0.524, 'NB': 0.617, 'SVM': 0.639}
 
fold 5:
{'DT': 0.483, 'RF': 0.523, 'NB': 0.637, 'SVM': 0.64}
 


{'DT': 0.4904,
 'RF': 0.5272000000000001,
 'NB': 0.6197999999999999,
 'SVM': 0.646}

In [28]:
kf = KFold(n_splits=5)

fold = 0
Kresults = []

for train_index, test_index in kf.split(data):
    fold += 1
    train = data.iloc[train_index]
    test = data.iloc[test_index]
    
    FE_train = pd.DataFrame(np.array([vectorize(doc) for doc in train['lsa_summary']]))
    FE_test = pd.DataFrame(np.array([vectorize(doc) for doc in test['lsa_summary']]))
    
    Kresult = micro_f1SVM(build_resultsSVM(FE_train, FE_test, train, test))
    Kresults.append(Kresult)
    print("fold " + str(fold) + ":")
    print(Kresult)
    print(" ")
    
    DT, RF, NB, SVM = 0, 0, 0, 0
    for result in Kresults:
        DT += result['DT']
        RF += result['RF']
        NB += result['NB']
        SVM += result['SVM']
    result = {'DT': DT/5, 'RF': RF/5, 'NB': NB/5, 'SVM': SVM/5}
result

fold 1:
{'DT': 0.449, 'RF': 0.515, 'NB': 0.596, 'SVM': 0.626}
 
fold 2:
{'DT': 0.485, 'RF': 0.505, 'NB': 0.586, 'SVM': 0.622}
 
fold 3:
{'DT': 0.473, 'RF': 0.519, 'NB': 0.597, 'SVM': 0.643}
 
fold 4:
{'DT': 0.482, 'RF': 0.511, 'NB': 0.588, 'SVM': 0.63}
 
fold 5:
{'DT': 0.467, 'RF': 0.507, 'NB': 0.604, 'SVM': 0.62}
 


{'DT': 0.47119999999999995,
 'RF': 0.5114000000000001,
 'NB': 0.5942000000000001,
 'SVM': 0.6282}