# Analysis of the COVID-19 information published by the NEJM

The New England Journal of Medicine (NEJM), is a prestigious medical publication printed by the Massachusetts Medical Association. The journal has made freely available all its content related to the Covid-19 pandemic. This jupyter notebook explains how to:
1. Extract the pdfs of the articles on Covid-19 published by NEJM using the BeautifulSoup library and save them in a local folder;
2. Generate a dataframe with relevant metadata about the articles and save this information in a csv file;
3. Extract the text from each pdf and save it in a file in a local folder;



In [22]:
#imports
import requests
import urllib.request
import os
from os import getcwd, path
import time
import re
from bs4 import BeautifulSoup
import PyPDF2
from PyPDF2 import PdfFileReader
import pandas as pd
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
import glob

In [4]:
url = 'https://www.nejm.org/coronavirus' #url of the NEJM webpage on Covid-19

In [5]:
response = requests.get(url)

In [6]:
response #response 200 indicates a successful request

<Response [200]>

In [7]:
soup = BeautifulSoup(response.text, "html.parser")

In [8]:
links = soup.find_all('a') #I identify all the anchors in the webpage to identify the artciles' pdfs
print("Total Links Found:",links.__len__())

Total Links Found: 1243


In [9]:
#Only a few of the anchors in the webpage correspond to the articles related to Covid-19 by the NEJM

def get_art_num(links):
    art_num_list = []
    for link in links:
        if ('pdf' in link.get('href')):
            my_link = link.get('href')
            art_num = re.search("10.1056/(.+$)",my_link)
            art_num_list.append(art_num.group(0))
    return art_num_list
            

my_articles_doi = get_art_num(links)

In [10]:
my_articles_doi = get_art_num(links)

def delete_doi(my_articles_doi):
    art_list = []
    for art in my_articles_doi:
        art = art.split("/")[1]
        art_list.append(art)
    return art_list
        
my_articles_nodoi = delete_doi(my_articles_doi)
print(len(my_articles_nodoi))

164


In [11]:
url_a = 'https://www.nejm.org/doi/pdf/10.1056/'

def get_pdfs(my_articles_nodoi):
    url_a = 'https://www.nejm.org/doi/pdf/10.1056/'
    arg= ''
    base_dir= [getcwd(), arg][path.isdir(arg)]
   
    for item in my_articles_nodoi:
        url = url_a + item
        r = requests.get(url, stream=True)
        with open (os.path.join(base_dir, "pdfs", item+'.pdf'), 'wb') as pdf:
                    pdf.write(r.content)
          
get_pdfs(my_articles_nodoi)

In [12]:
def get_info(item):
    arg = ""
    base_dir= [getcwd(), arg][path.isdir(arg)]
    with open (os.path.join(base_dir, "pdfs", item+'.pdf'), 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
    return info


def get_nested_dict(my_articles_nodoi):
    nested_dict = {}
    for art in my_articles_nodoi:
        art_info = get_info(art)
        art_dict = {art:art_info}
        nested_dict.update(art_dict)
    return nested_dict



my_info_dict =get_nested_dict(my_articles_nodoi)

print(len(my_info_dict))

    

164


In [14]:
dfObj = pd.DataFrame(my_info_dict).transpose()
df = dfObj.drop(['/Creator', '/ModDate','/Trapped', '/Producer','/Subject'], axis = 1).reset_index()
df1 = df.rename(columns={'index': 'ArtNum',"/CreationDate":"CreationDate", "/Author":"Author","/Title":"Title"})
url_list = ['https://www.nejm.org/doi/pdf/10.1056/'+x for x in my_articles_nodoi]
df1['urls'] = url_list
print(df1)

           ArtNum             CreationDate  \
0    NEJMp2031046  D:20201007172125-04'00'   
1    NEJMp2025955  D:20201029102042-04'00'   
2    NEJMp2027447  D:20201014084636-04'00'   
3    NEJMp2025395  D:20201008123038-04'00'   
4    NEJMp2024834  D:20201013111851-04'00'   
..            ...                      ...   
159  NEJMra032498  D:20031205104806-05'00'   
160  NEJMoa030781        D:20030501202759Z   
161  NEJMoa030747        D:20030506175543Z   
162  NEJMoa030666        D:20030501202837Z   
163  NEJMoa030685        D:20030501202841Z   

                                                Author  \
0                                        Thomas H. Lee   
1    Camila Strassle, E. Jardas, Jorge Ochoa, Benja...   
2                                    Richard E. Leiter   
3                                      Amrapali Maitra   
4                      Jo Shapiro, Timothy B. McDonald   
..                                                 ...   
159  Peiris Joseph S.M., Yuen Kwok Y., Os

In [15]:
art_num = df1['ArtNum'].to_list()


def get_art_type(art_num):
    art_type = []
    for art in art_num:
        if re.match("NEJMp", art):
            art_type.append("Perspective")
        elif re.match("NEJMcibr", art):
            art_type.append("Clinical implications of basic research")
        elif re.match("NEJMcpc", art):
            art_type.append("Case records") 
        elif re.match("NEJMc", art):
            art_type.append("Correspondence")
        elif re.match("NEJMe", art):
            art_type.append("Editorial")
        elif re.match("NEJMsb", art):
            art_type.append("Sounding board")
        elif re.match("NEJMms", art):
            art_type.append("Medicine and society")
        elif re.match("NEJMoa", art):
            art_type.append("Original article")
        elif re.match("NEJMra", art):
            art_type.append("Current concepts")

    return art_type

art_type = get_art_type(art_num)
df1["ArtType"] = art_type

metadata = df1[['ArtNum', 'ArtType', 'CreationDate', 'Author', "Title", "urls"]]

metadata.to_csv(r'metadata.csv')
print(metadata)
        

           ArtNum           ArtType             CreationDate  \
0    NEJMp2031046       Perspective  D:20201007172125-04'00'   
1    NEJMp2025955       Perspective  D:20201029102042-04'00'   
2    NEJMp2027447       Perspective  D:20201014084636-04'00'   
3    NEJMp2025395       Perspective  D:20201008123038-04'00'   
4    NEJMp2024834       Perspective  D:20201013111851-04'00'   
..            ...               ...                      ...   
159  NEJMra032498  Current concepts  D:20031205104806-05'00'   
160  NEJMoa030781  Original article        D:20030501202759Z   
161  NEJMoa030747  Original article        D:20030506175543Z   
162  NEJMoa030666  Original article        D:20030501202837Z   
163  NEJMoa030685  Original article        D:20030501202841Z   

                                                Author  \
0                                        Thomas H. Lee   
1    Camila Strassle, E. Jardas, Jorge Ochoa, Benja...   
2                                    Richard E. Leiter   

In [16]:
! pip install pdfminer.six



In [17]:
my_articles_nodoi = delete_doi(my_articles_doi)

def get_txt(item):
    output_string = StringIO()
    arg = ""
    base_dir= [getcwd(), arg][path.isdir(arg)]
    with open (os.path.join(base_dir, "pdfs", item+".pdf"), 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return output_string.getvalue()

def txt_file(item, txt):
    arg = ""
    base_dir= [getcwd(), arg][path.isdir(arg)]
    with open (os.path.join(base_dir, "txts", item+".txt"), 'w+') as f:
        f.write(txt)

for art in my_articles_nodoi:
    my_txt = get_txt(art)
    txt_file(art, my_txt)


In [19]:
txt_files = glob.glob("txts/*.txt")


for item in txt_files:
    filename = item.replace(item[:5], '')
    arg = ""
    base_dir= [getcwd(), arg][path.isdir(arg)]
    with open(os.path.join(base_dir, "txts",filename), 'r+') as f:
        text = f.read()
        text = re.sub(r'-\n(\w+ *)', r'\1\n', text)
        text = re.sub('ENGLA ND', 'ENGLAND', text)
        text = re.sub('New England Journal of Medicine', '', text)
        text = re.sub('Downloaded from nejm.org on', '', text)
        text = re.sub('For personal use only.', '', text)
        text = re.sub('No other uses without permission.', '', text)
        text = re.sub('Copyright © 2020 Massachusetts Medical Society.', '', text)
        text = re.sub('All rights reserved.', '', text)
        text = re.sub(r'^n engl j med.*\n?', '', text, flags=re.MULTILINE)
    with open(os.path.join(base_dir, "txts_cleaned",filename), 'w+') as f:
        f.write(text)


for item in txt_files:        
    filename = item.replace(item[:5], '')
    with open(os.path.join(base_dir, "txts_cleaned",filename), "r+") as f:
        lines = f.readlines()

    with open(os.path.join(base_dir, "txts_cleaned",filename), 'w+') as f:
        lines = filter(lambda x: x.strip(), lines)
        f.writelines(lines)   
        



In [20]:
! pip install --upgrade gensim

Requirement already up-to-date: gensim in /Users/giudittaparolini/anaconda3/envs/nejmenv/lib/python3.9/site-packages (3.8.3)


In [21]:
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [28]:
def read_input():
    txt_files = glob.glob("txts_cleaned/*.txt")
    arg = ""
    base_dir= [getcwd(), arg][path.isdir(arg)]
    
    for item in txt_files:
        filename = item.replace(item[:13], '')
        logging.info("reading file {0}...".format(filename))
        with open(os.path.join(base_dir, "txts_cleaned",filename), 'r+') as f:
            text = f.read()
            #logging.info ("read {0} file".format (filename))
            #yield gensim.utils.simple_preprocess(text)
            yield text

documents = list(read_input())
with open(os.path.join(base_dir, "documents.txt"), 'w+') as f:
            f.writelines(str(documents))

print(len(documents))

#logging.info ("Done reading data file")

2020-11-13 15:46:35,476 : INFO : reading file NEJMp2015556.txt...
2020-11-13 15:46:35,477 : INFO : reading file NEJMp2020076.txt...
2020-11-13 15:46:35,479 : INFO : reading file NEJMp2010758.txt...
2020-11-13 15:46:35,480 : INFO : reading file NEJMp2027447.txt...
2020-11-13 15:46:35,482 : INFO : reading file NEJMc2001737.txt...
2020-11-13 15:46:35,483 : INFO : reading file NEJMoa2001191.txt...
2020-11-13 15:46:35,484 : INFO : reading file NEJMp2008300.txt...
2020-11-13 15:46:35,485 : INFO : reading file NEJMp2005630.txt...
2020-11-13 15:46:35,486 : INFO : reading file NEJMp2018846.txt...
2020-11-13 15:46:35,488 : INFO : reading file NEJMcpc059003.txt...
2020-11-13 15:46:35,489 : INFO : reading file NEJMc2013656.txt...
2020-11-13 15:46:35,490 : INFO : reading file NEJMc2017424.txt...
2020-11-13 15:46:35,491 : INFO : reading file NEJMp2005234.txt...
2020-11-13 15:46:35,492 : INFO : reading file NEJMc2018688.txt...
2020-11-13 15:46:35,493 : INFO : reading file NEJMc2009020.txt...
2020-11-

2020-11-13 15:46:35,670 : INFO : reading file NEJMc2014816.txt...
2020-11-13 15:46:35,672 : INFO : reading file NEJMp2005689.txt...
2020-11-13 15:46:35,674 : INFO : reading file NEJMp2000929.txt...
2020-11-13 15:46:35,676 : INFO : reading file NEJMp2002125.txt...
2020-11-13 15:46:35,677 : INFO : reading file NEJMp2003762.txt...
2020-11-13 15:46:35,679 : INFO : reading file NEJMp2025955.txt...
2020-11-13 15:46:35,680 : INFO : reading file NEJMc2001573.txt...
2020-11-13 15:46:35,681 : INFO : reading file NEJMc2007575.txt...
2020-11-13 15:46:35,682 : INFO : reading file NEJMc2019373.txt...
2020-11-13 15:46:35,683 : INFO : reading file NEJMc2001272.txt...
2020-11-13 15:46:35,684 : INFO : reading file NEJMc2010419.txt...
2020-11-13 15:46:35,685 : INFO : reading file NEJMp2005638.txt...
2020-11-13 15:46:35,687 : INFO : reading file NEJMp2012147.txt...
2020-11-13 15:46:35,689 : INFO : reading file NEJMc2010418.txt...
2020-11-13 15:46:35,690 : INFO : reading file NEJMp2022641.txt...
2020-11-13

164


In [25]:
model = gensim.models.Word2Vec (documents, size=300, window=5, min_count=1, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2020-11-13 15:34:47,980 : INFO : collecting all words and their counts
2020-11-13 15:34:47,981 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-13 15:34:48,055 : INFO : collected 20945 word types from a corpus of 329770 raw words and 164 sentences
2020-11-13 15:34:48,055 : INFO : Loading a fresh vocabulary
2020-11-13 15:34:48,142 : INFO : effective_min_count=1 retains 20945 unique words (100% of original 20945, drops 0)
2020-11-13 15:34:48,143 : INFO : effective_min_count=1 leaves 329770 word corpus (100% of original 329770, drops 0)
2020-11-13 15:34:48,193 : INFO : deleting the raw counts dictionary of 20945 items
2020-11-13 15:34:48,194 : INFO : sample=0.001 downsamples 33 most-common words
2020-11-13 15:34:48,194 : INFO : downsampling leaves estimated 269226 word corpus (81.6% of prior 329770)
2020-11-13 15:34:48,237 : INFO : estimated required memory for 20945 words and 300 dimensions: 60740500 bytes
2020-11-13 15:34:48,237 : INFO : resetting layer

2020-11-13 15:34:55,318 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-11-13 15:34:55,325 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-13 15:34:55,333 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-13 15:34:55,334 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-13 15:34:55,334 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-13 15:34:55,335 : INFO : EPOCH - 2 : training on 329770 raw words (269372 effective words) took 0.4s, 721854 effective words/s
2020-11-13 15:34:55,592 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-11-13 15:34:55,601 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-11-13 15:34:55,614 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-11-13 15:34:55,619 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-11-13 15:34:55,621 : INFO : worker thread

2020-11-13 15:34:58,138 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-11-13 15:34:58,144 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-11-13 15:34:58,145 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-13 15:34:58,152 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-13 15:34:58,154 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-13 15:34:58,167 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-13 15:34:58,168 : INFO : EPOCH - 10 : training on 329770 raw words (269076 effective words) took 0.4s, 721994 effective words/s
2020-11-13 15:34:58,169 : INFO : training on a 3297700 raw words (2691948 effective words) took 3.6s, 746602 effective words/s


(2691948, 3297700)

In [31]:
w1 = "Covid-19"
model.wv.most_similar (positive=w1)
#model.wv.similarity(w1="coronavirus",w2="sars")

KeyError: "word 'Covid-19' not in vocabulary"