# Information Retrieval

In [None]:
# Install the necessary libraries
!pip3 install biopython

In [None]:
# Import the necessary libraries
from Bio import Entrez
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import shutil

In [None]:
# Create the articles directory
!mkdir -p /content/articles

In [None]:
# Utilised functions
def search(query, max_articles, start_date, end_date, email):
  """Search PubMed and return the parsed ESearch result.

  Args:
    query: Search term sent to PubMed.
    max_articles: Maximum number of records to request (sent as `retmax`).
    start_date: Lower bound of the date range (sent as `mindate`).
    end_date: Upper bound of the date range (sent as `maxdate`).
    email: Contact address assigned to `Entrez.email` before the request.

  Returns:
    The object produced by `Entrez.read` for the ESearch response, with
    results requested in relevance order.
  """
  Entrez.email = email

  # Get the PubMed ID of articles according to the query
  handle = Entrez.esearch(
    db='pubmed',
    sort='relevance',
    retmax=max_articles,
    retmode='xml',
    term=query,
    mindate=start_date,
    maxdate=end_date,
  )
  try:
    return Entrez.read(handle)
  finally:
    # Release the underlying network connection even if parsing fails.
    handle.close()

def fetch_details(id, email):
  """Fetch and parse the PubMed record for one article.

  Args:
    id: PubMed ID passed to EFetch.
    email: Contact address assigned to `Entrez.email` before the request.

  Returns:
    The object produced by `Entrez.read` for the EFetch XML response.
  """
  Entrez.email = email

  # Get the PubMed details of the article
  handle = Entrez.efetch(db='pubmed', retmode='xml', id=id)
  try:
    return Entrez.read(handle)
  finally:
    # Release the underlying network connection even if parsing fails.
    handle.close()

def get_text(id):
  """Scrape the paragraph text of an article from its PubMed Central page.

  Args:
    id: PubMed ID of the article; interpolated into the PMC URL.

  Returns:
    The text of every matching paragraph, with paragraphs separated by a
    blank line, or None when the request fails, the server answers with an
    error status, or the page contains no matching paragraph.
  """
  # Check if article can be accessed
  url = f'http://www.ncbi.nlm.nih.gov/pmc/articles/pmid/{id}'
  try:
    # A timeout keeps one unresponsive request from stalling the whole crawl.
    response = requests.get(url, timeout=30)
  except requests.RequestException:
    return None
  if not response.ok:
    # Error pages (4xx/5xx) do not carry an article body.
    return None

  soup = BeautifulSoup(response.content, features='html.parser')

  # Check if the main contents can be scraped: keep <p> tags whose id
  # attribute matches the pattern (case-insensitive).
  paragraphs = soup.find_all('p', {'id': re.compile('.*p.*', re.IGNORECASE)})
  if not paragraphs:
    return None

  # Scrape the main contents of the article. Every pair of consecutive
  # paragraphs is separated by a blank line, with no trailing separator.
  return '\n\n'.join(
    ' '.join(string.strip() for string in tag.strings)
    for tag in paragraphs
  )

## ID Retrieval

In [None]:
# Contact address sent to NCBI with every Entrez request
EMAIL = 'abc@gmail.com'

# Get the PubMed IDs of articles based on the query
results = search('chronic airways disease', max_articles=2000, start_date='2010/01/01', end_date='2020/01/01', email=EMAIL)
id_list = results['IdList']

# Get the title of the PubMed articles
titles = {}
for pmid in id_list:
  paper = fetch_details(pmid, email=EMAIL)
  records = paper['PubmedArticle']
  if records:
    titles[pmid] = records[0]['MedlineCitation']['Article']['ArticleTitle']
  else:
    # Some IDs come back without a 'PubmedArticle' entry; keep a placeholder
    # so every ID in id_list still has a key in titles.
    titles[pmid] = '(title unavailable)'

## Text Scraping

In [None]:
# Download the body text of every article; keep only the ones that could be
# scraped, with `success` holding their PubMed IDs in the same order.
articles, success = [], []

for pmid in tqdm(id_list):
  body = get_text(pmid)
  if body is None:
    continue
  articles.append(body)
  success.append(pmid)

# Report how many articles were retrieved, then list their IDs and titles
print(f'\nNumber of articles scrapped: {len(articles)}\n')
for pmid in success:
  print(f'[{pmid}]: {titles[pmid]}')

In [None]:
# Preview the first scraped article (the list is empty when nothing could be scraped)
if articles:
  print(articles[0])
else:
  print('No articles were scraped.')

In [None]:
# Save each PubMed article as a text file named after its PubMed ID
for pmid, article in zip(success, articles):
  # Explicit UTF-8 so non-ASCII characters in the scraped text are written
  # correctly regardless of the platform's default encoding.
  with open(f'/content/articles/article_{pmid}.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(article)

In [None]:
# Bundle the contents of /content/articles into a zip archive named "articles"
shutil.make_archive(base_name='articles', format='zip', root_dir='/content/articles')