## <b> Data Collection

In [None]:
import requests

r = requests.get('https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=ListRecords&from=2020-03-22&until=2020-06-12&set=bmj&metadataPrefix=pmc')
print(r.text)
fh = open('response.xml', 'w')
fh.write(str(r.text))
fh.close()

<?xml version="1.0" encoding="UTF-8"?><OAI-PMH
	xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2023-05-18T10:56:31Z</responseDate><request verb="ListRecords" from="2020-03-22" metadataPrefix="pmc" set="bmj" until="2020-06-12">https:/www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi</request><ListRecords><record><header><identifier>oai:pubmedcentral.nih.gov:7114954</identifier><datestamp>2020-04-08</datestamp><setSpec>bmj</setSpec><setSpec>pmc-open</setSpec></header><metadata><article xmlns="https://jats.nlm.nih.gov/ns/archiving/1.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="https://jats.nlm.nih.gov/ns/archiving/1.3/ https://jats.nlm.nih.gov/archiving/1.3/xsd/JA

In [1]:
import csv
from lxml import etree as ET
import time

parser = ET.XMLParser()
tree = ET.parse('response.xml', parser)
root = tree.getroot()
# print(root.tag)
namespaces = {'rns': 'http://www.openarchives.org/OAI/2.0/', 'ans':'https://jats.nlm.nih.gov/ns/archiving/1.3/'}
sent_list = []
records = root.findall("rns:ListRecords/rns:record", namespaces)
for record in records:
    sent = " "
    sections = record.findall("rns:metadata/ans:article/ans:body/ans:sec", namespaces)
    para = record.findall("rns:metadata/ans:article/ans:body/ans:p/ans:italic", namespaces)
    for p in para:
        sent += p.text
    for sec in sections:
        if sec.find('ans:sec', namespaces) is True:
            sections2 = sec.findall('ans:sec', namespaces)
            for sec2 in sections2:
                par = sec2.findall('ans:p', namespaces)
                for ps in par:
                    sent += p.text 
        else:
            par = sec.findall('ans:p', namespaces)
            for ps in par:
                sent += ps.text
    sent_list.append(sent)

In [2]:
with open('Articles.csv', 'w') as file:
    writer = csv.writer(file)
    writer.writerow(['Article Content', 'Count in words'])
    for i in range(len(sent_list)):
        writer.writerow([sent_list[i], len(sent_list[i].strip().split(" "))])

In [4]:
import pandas as pd
df = pd.read_csv('Articles.csv')
df.head(10)

Unnamed: 0,Article Content,Count in words
0,China has undergone a rapid epidemiological t...,426
1,"Nicola, 44, goes to her general practitioner ...",437
2,Several groups may be particularly vulnerable...,693
3,Diabetes is a metabolic disorder caused by ge...,408
4,Women with a pregnancy at term are generally ...,645
5,Type 2 diabetes is the leading cause of kidne...,191
6,"Ambient air pollution, especially fine partic...",160
7,Coronaviruses are important pathogens of huma...,297
8,The selective or incomplete reporting of outc...,746
9,News reporting of suicide has increased subst...,348


## <b> TEXT SUMMARIZATION MODEL

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster import cosine_distance
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
# import re

def read_article(filename): #To read the input file and extract keywords whilst performing basic tokenization
    with open(filename, 'r') as file:
        filedata = file.read()
    # print("Article: \n----------------------------------------------------------------------\n"+ filedata)
    articles = filedata.split('. ')
    sentences = []
    for sentence in articles:
        sentences.append(sentence.replace("[^a-zA-Z]", ' ').split(' ')) #Tokenization of string 
    return sentences

def sent_similarity(sent1, sent2, stopwords = None): #Calculating similarity scores between two sentences using the cosine similarity function 
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    allwords = list(set(sent1+sent2))

    vector1 = [0] * len(allwords)
    vector2 = [0] * len(allwords)

    for w in sent1:
        if w in stopwords:
            continue
        vector1[allwords.index(w)] += 1
        
    for w in sent1:
        if w in stopwords:
            continue
        vector2[allwords.index(w)] += 1
    
    return 1-cosine_distance(vector1, vector2)

def gen_sim_matrix(sentences, stop_words): #Matrix of similarity scores between different sentences in the input file 
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            similarity_matrix[i][j] = sent_similarity(sentences[i], sentences[j])
    return similarity_matrix

def generate_summary(filename, top_n):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = read_article(filename)
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix) #Converts the matrix into a graph for perfo
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s)for i,s in enumerate(sentences)), reverse=True)
    # print(ranked_sentences)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentences[i][1]))
    summarize_text = '. '.join(summarize_text)
    return summarize_text
    # print(f"\nSummary\n--------------------------------\n{'. '.join(summarize_text)}")

In [25]:
summary_text = generate_summary('Sample.txt', 4)
for sent in summary_text.split(". "):
    print(sent)

We chose to focus exclusively on non-cardiac surgery because it is more common
The principal purpose of the review is to inform clinicians wishing to improve their evidence based perioperative care pathways and, by highlighting deficiencies in our evidence base, to facilitate researchers and funders in focusing on areas of greatest need.Our study identified 11 categories of perioperative care interventions that have been tested in randomised trials with the aim of reducing PPCs
RCTs designed to reduce the incidence of PPC generally consider non-cardiac surgery separately from cardiac surgery
Our main finding is that despite a huge literature and the clinical prevalence and importance of the outcome, the existing evidence is of generally poor quality and does not give definitive answers
