In [1]:
# Imports and so on ...

from google.colab import drive
drive.mount('/content/drive')
!pip install metapub
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) # so it prints all the dataframe columns ...
from metapub import PubMedFetcher

Mounted at /content/drive
Collecting metapub
  Downloading metapub-0.5.12-py2.py3-none-any.whl.metadata (16 kB)
Collecting lxml-html-clean (from metapub)
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Collecting eutils (from metapub)
  Downloading eutils-0.6.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting habanero (from metapub)
  Downloading habanero-2.2.0-py3-none-any.whl.metadata (8.0 kB)
Collecting cssselect (from metapub)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting unidecode (from metapub)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting docopt (from metapub)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting coloredlogs (from metapub)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-Levenshtein (from metapub)
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting huma



In [2]:
# HERE YOU CAN GIVE ANY ARRAY OF PMIDS

# Read the "PMID" column of the review's CSV export (first CSV column is the index)
# and keep its values as a plain Python list.
pmid_column = pd.read_csv('/content/drive/MyDrive/changed_CD012768.csv', index_col=0)["PMID"]
pmids = list(pmid_column.to_numpy())
pmids

[22815718,
 27286562,
 22381459,
 21396219,
 24167451,
 27366339,
 26311855,
 26071438,
 25960097]

In [3]:
# Putting all the features that PubMed returns into a pandas dataframe ...

from metapub import PubMedFetcher
import json
fetch = PubMedFetcher()

def hasmethod(obj, name):
    """Tell whether attribute `name` of `obj` looks like a method.

    Returns False when `obj` has no such attribute; otherwise returns True
    exactly when the text "method" occurs in the string form of the
    attribute's type. Callers use this to keep callable members apart from
    plain data attributes.
    """
    if not hasattr(obj, name):
        return False
    attribute_type = type(getattr(obj, name))
    return "method" in str(attribute_type)

# Fetch every article and record all of its non-method attributes.
# The attribute names of the FIRST article define the dataframe columns
# (column_names); the names of its methods are kept aside in function_names.
all_article_data = []
column_names = []
function_names = []
for position, pmid in enumerate(pmids):
  article = fetch.article_by_pmid(pmid)
  is_first_article = (position == 0)
  article_data = {}
  for attr in dir(article):
    attr_is_method = hasmethod(article, str(attr))
    if is_first_article:
      target_list = function_names if attr_is_method else column_names
      target_list.append(attr)
    if not attr_is_method:
      article_data[attr] = getattr(article, attr)
  all_article_data.append(article_data)

df = pd.DataFrame(all_article_data, columns=column_names)

In [4]:
# Dropping all columns that only have NaN or fully identical values ...

count_row = df.shape[0]  # Gives number of rows
summary_df = df.describe()  # computed once; both rows below come from the same summary
freq_df = summary_df.loc[['freq']]
count_df = summary_df.loc[['count']]

columns_to_drop = []
for column in freq_df:
    top_freq = freq_df[column].iloc[0]
    non_null_count = count_df[column].iloc[0]
    if pd.isna(top_freq):
        # No most-frequent value at all: the column holds only missing values.
        columns_to_drop.append(column)
    elif count_row > 1 and top_freq == non_null_count:
        # Every non-missing value is the same one, so the column carries no information.
        # The row-count guard matters: with a single article freq == count holds for
        # EVERY column, and dropping them all would leave an empty dataframe.
        columns_to_drop.append(column)

df = df.drop(columns=columns_to_drop)

In [5]:
# This whole cell was an XML-to-JSON conversion that barely helped at all ... it is better to just parse the XML directly ...

# import xml.etree.ElementTree as ET
# all_json_xml_tags=[]

# def extract_row_xml_tags(row):
#   xml_tree = ET.ElementTree(ET.fromstring(row["xml"]))
#   all_xml_tags = []
#   for elem in xml_tree.iter():
#       all_xml_tags.append(elem.tag)
#   all_xml_tags = list(set(all_xml_tags))
#   # print(all_xml_tags)

#   xml_tag_json={}
#   for elem in all_xml_tags:
#     final_arr=[]
#     arr = xml_tree.findall(".//"+elem)
#     for i in arr:
#       # print( ET.tostring( i, encoding='unicode' ) )
#       final_arr.append( ET.tostring(i, encoding='unicode') )
#     xml_tag_json[ elem ] = final_arr

#   json_string = json.dumps( xml_tag_json )
#   return json_string

# df["json_xml"] = df.apply(extract_row_xml_tags, axis=1)

# import json
# print( json.dumps(df.iloc[0]["json_xml"], indent=4) )

In [6]:
# This reveals that the content feature is contained in the xml feature (they are both xml representations of the same thing)

# from lxml import etree
# content=df.iloc[0]["content"]
# etree.tostring(content, pretty_print=True)

In [7]:
# Removing more useless columns and adding some new ones to better describe the data ...

remove_columns = [
  "__dict__", "pii", "author1_lastfm", "author_list", "authors_str", "author1_last_fm",
  "content", "citation_html", "pages", "first_page", "last_page",
]
# pmc / doi are redundant because they also appear in the urls ... they are kept for now anyway ...
# pii ("Publisher Item Identifier") contains the issn and other things ... it cannot be used to retrieve the article, unless you go to the actual publisher's website ...

# Keep the first author under a clearer name before the original column is removed below.
if "author1_last_fm" in df.columns:
  df["author_first"] = df["author1_last_fm"]

# if "pmc" in df.columns: # so as to parse the webpage like with the pubmed site ...
#   df['pmc_url']="https://pmc.ncbi.nlm.nih.gov/articles/PMC"+str(df['pmc'])+"/"




# Uncomment this to keep the doi in an url ... yet, each doi url points to a different 2nd domain, which is NOT the same for all articles, meaning that it is not easy to get data from it (website structures differ) ...

# if "doi" in df.columns: # so as to parse the webpage like with the pubmed site ...
#   df["doi_url"]="https://doi.org/"+df["doi"]


# Drop, in one call, whichever of the unwanted columns are actually present.
present_unwanted = [name for name in remove_columns if name in df.columns]
df = df.drop(columns=present_unwanted)
df.describe()


# Choose one of the following methods for formatting xml in a visible way ... Quite sure the first one has no bugs ...

# import xml.dom.minidom
# def restyle_xml(row):
#   return xml.dom.minidom.parseString(row["xml"]).toprettyxml()
# df['xml'] = df.apply(restyle_xml, axis=1)
# print( df.iloc[0]["xml"] )

# from lxml import etree
# def restyle_xml(row):
#   x = etree.fromstring(row["xml"])
#   return etree.tostring(x, pretty_print=True)
# df['xml'] = df.apply(restyle_xml, axis=1)

Unnamed: 0,abstract,authors,chemicals,citation,doi,history,issn,issue,journal,keywords,mesh,pmc,pmid,publication_types,title,url,volume,volume_issue,xml,year,author_first
count,9,9,9,9,9,9,9,7,9,9,9,6,9,9,9,9,9,9,9,9,9
unique,9,9,5,9,9,9,9,5,9,2,9,6,9,9,9,9,9,9,9,6,9
top,BACKGROUND: Hospitals in sub-Saharan Africa ar...,"[Peter JG, Theron G, Muchinga TE, Govender U, ...",{},"Peter JG, et al. The diagnostic accuracy of ur...",10.1371/journal.pone.0039966,"{'received': 2012-03-08 00:00:00, 'accepted': ...",1932-6203,11,PLoS One,[],{'D017088': {'descriptor_name': 'AIDS-Related ...,3392260,22815718,"{'D016430': 'Clinical Trial', 'D016428': 'Jour...",The diagnostic accuracy of urine-based Xpert M...,https://ncbi.nlm.nih.gov/pubmed/22815718,7,7(7),"b'<?xml version=""1.0"" ?>\n<!DOCTYPE PubmedArti...",2012,Peter JG
freq,1,1,5,1,1,1,1,2,1,8,1,1,1,1,1,1,1,1,1,2,1


In [8]:
# Getting references with ids from the "xml" property that PubMed returned ...

import xml.etree.ElementTree as ET

def get_references(row):
  """Extract the reference list of one article as a JSON string.

  Parameters:
    row: mapping (e.g. a dataframe row) whose "xml" entry holds the article's
         PubMed XML, as str or bytes.

  Returns:
    A JSON array (str). Each item has "article_ids", an object mapping every
    ArticleId's IdType to its text, and, when the reference has a <Citation>,
    "citation" with that text. The array is empty when the XML contains no
    <ReferenceList>.

  Raises:
    xml.etree.ElementTree.ParseError if the XML is malformed; KeyError if an
    <ArticleId> has no IdType attribute.
  """
  # Expected layout of one reference:
  # <Reference>
  # 	<Citation>REFERENCE ARTICLE TITLE</Citation>
  # 	<ArticleIdList>
  # 		<ArticleId IdType="pmc">PMC ID</ArticleId>
  # 		<ArticleId IdType="pubmed">PUBMED ID</ArticleId>
  # 	</ArticleIdList>
  # </Reference>
  root = ET.fromstring(row["xml"])

  # A single search both detects and returns the first <ReferenceList> in document
  # order. iter() also considers the root element itself, so detection and lookup
  # can no longer disagree.
  ref_list = next(root.iter("ReferenceList"), None)

  all_citations = []
  if ref_list is not None:
    for ref in ref_list.findall(".//Reference"):
      citation_json = {}
      # If a reference holds several <Citation> elements, the last one wins.
      for citation in ref.findall(".//Citation"):
        citation_json["citation"] = citation.text

      article_id_json = {}
      article_id_list = ref.find(".//ArticleIdList")
      if article_id_list is not None:
        for article_id in article_id_list.findall(".//ArticleId"):
          article_id_json[article_id.attrib['IdType']] = article_id.text

      citation_json["article_ids"] = article_id_json
      all_citations.append(citation_json)

  return json.dumps(all_citations)

# Store each article's reference list (the JSON string built by get_references) in a new "references" column.
df['references'] = df.apply(get_references, axis=1)

In [9]:
# Here, since most pmids do NOT have any keywords, some are generated for each of them ...

# This is done by:
#   0. removing all the common words from the title (done using lists of common words from wiki)
#   1. going through all the keywords that were found (from the articles that had them), to make a list of non-generated keywords/phrases
#   2. adding non-generated keywords/phrases to an article without keywords if they appear in its title
#   3. adding the final non-common title words to the keywords (the ones that remained = were not common and were not in non-generated keywords/phrases)
# (for everything to function properly everything has to be lowercase)

# All the common words are saved to a csv file, for easy access ...

from pathlib import Path
csv_file = Path("/content/drive/MyDrive/stop_words.csv")

if csv_file.is_file():
  print( "Stop words retrieved from \"stop_words.csv\"!" )
  # keep_default_na=False / dtype=str: without them pandas parses words such as
  # "null", "nan" or "n/a" as float NaN, silently removing them from the list.
  stop_words = list(pd.read_csv(str(csv_file.resolve()), keep_default_na=False, dtype=str)["word"].to_numpy())
else:
  print( "Stop words were re-generated and saved to \"stop_words.csv\"!" )
  import urllib.request
  import nltk
  from bs4 import BeautifulSoup
  from nltk.corpus import stopwords # was never imported, so this branch raised NameError
  nltk.download('stopwords')
  stop_words = list(stopwords.words("english"))

  print( "length before wiki: " + str( len(stop_words) ) )

  def _frequency_list_soup(list_name):
    # Download one Wiktionary frequency-list page and parse it; the connection is closed even if reading fails.
    with urllib.request.urlopen("https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/" + list_name) as fp:
      html_page = fp.read().decode("utf8")
    return BeautifulSoup(html_page)

  def _word_paragraphs(soup):
    # Only paragraphs containing more than 100 tags are treated as word lists.
    return [ p for p in soup.find_all("p") if len( p.find_all() ) > 100 ]

  # Source 1: the text of every tag inside the word-list paragraphs.
  for p in _word_paragraphs( _frequency_list_soup("English/Wikipedia_(2016)") ):
    for tag in p.find_all():
      stop_words.append( tag.text.lower() )

  print( "length after wiki 1: " + str( len(stop_words) ) )
  stop_words = list(dict.fromkeys(stop_words)) # removing duplicate words
  print( "length after wiki 1 (duplicates removed): " + str( len(stop_words) ) )

  # Source 2: the "title" attribute of every link inside the word-list paragraphs.
  for p in _word_paragraphs( _frequency_list_soup("PG/2005/10/1-10000") ):
    for a in p.find_all("a"):
      stop_words.append( a["title"].lower() )

  print( "length after wiki 2: " + str( len(stop_words) ) )
  stop_words = list(dict.fromkeys(stop_words)) # removing duplicate words
  print( "length after wiki 2 (duplicates removed): " + str( len(stop_words) ) )

  def add_more_stop_words (page):
    """Append the words of one "TV/2006/<page>" frequency-list page to the global stop_words.

    The word is the "title" of the first link in each table cell that has a link;
    duplicates are removed afterwards (first occurrence kept).
    """
    global stop_words
    soup = _frequency_list_soup("TV/2006/" + page)
    for td in soup.find_all("td"):
      first_link = td.find("a")
      if first_link is not None:
        stop_words.append( first_link["title"].lower() )
    stop_words = list(dict.fromkeys(stop_words)) # removing duplicate words

  # Source 3: pages 1-1000 ... 9001-10000, then 10001-12000 ... 18001-20000, then the last page.
  # NOTE(review): no page between 20001 and 40000 is requested - confirm whether that gap is intentional.
  for page_iter in range( 0, 10000, 1000 ):
    add_more_stop_words( str( page_iter + 1 ) + "-" + str( page_iter + 1000 ) )
  for page_iter in range( 10000, 20000, 2000 ):
    add_more_stop_words( str( page_iter + 1 ) + "-" + str( page_iter + 2000 ) )
  add_more_stop_words( "40001-41284" )


  print( "length after wiki 3: " + str( len(stop_words) ) )
  stop_words = list(dict.fromkeys(stop_words)) # removing duplicate words
  print( "length after wiki 3 (duplicates removed): " + str( len(stop_words) ) )

  # Cache the list at the same path the check above looks for.
  stop_words_df = pd.DataFrame( stop_words, columns=["word"] )
  stop_words_df.to_csv(csv_file, encoding='utf-8', index=False)



def get_keywords(sentence):
  """Reduce a sentence to its uncommon words.

  The sentence is lower-cased and split on whitespace, every word found in the
  global `stop_words` is removed, the remaining words are re-joined with single
  spaces, and finally every character outside the allowed set is deleted.

  Parameters:
    sentence: the text to reduce (str), e.g. an article title.

  Returns:
    The filtered, lower-case string (possibly empty).
  """
  words = sentence.lower().split()
  filtered_words = [word for word in words if word not in stop_words]
  uncommon_words = ' '.join(filtered_words)

  # Letters, digits, backslash, hyphen, space, slash and underscore are kept.
  # The backslash is written "\\": the previous "\-" was an invalid escape sequence
  # (same characters at runtime, but a SyntaxWarning on recent Python versions).
  valid_characters = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\\- /_0123456789')
  # as such: !"#$%&'()*+,.:;<=>?@[]^`{|}~ are NOT allowed in keyword names

  final_string = ''.join(c for c in uncommon_words if c in valid_characters)
  return final_string



# Counters reported after both passes, and the pool of author-provided keywords.
count_empty = 0
count_non_empty = 0
non_generated_keywords = []
def determine_keywords(row):
  """First pass: collect the keywords of articles that already have some.

  For a row with a non-empty "keywords" list, the list is lower-cased (and
  written back to `row`), `count_non_empty` is incremented and the keywords
  are added to the global `non_generated_keywords`. On every call that pool
  is then ordered longest first and de-duplicated. Returns None.
  """
  global non_generated_keywords, count_non_empty
  has_keywords = row["keywords"] != []
  if has_keywords:
    row["keywords"] = [keyword.lower() for keyword in row["keywords"]]
    count_non_empty += 1
    non_generated_keywords.extend(row["keywords"])
  longest_first = sorted(non_generated_keywords, key=len, reverse=True)
  # dict.fromkeys keeps the first occurrence of each keyword and preserves order
  non_generated_keywords = list(dict.fromkeys(longest_first))



def generate_keywords(row):
  """Second pass: return the keywords of one article as the string form of a list.

  Rows that already have keywords are returned unchanged (as str). For a row
  with an empty "keywords" list, `count_empty` is incremented and keywords are
  generated from the title:
    1. the title is reduced with get_keywords();
    2. every entry of the global `non_generated_keywords` that occurs in it as
       a whole word/phrase is taken over as a keyword and cut out of the title;
    3. the words that remain become keywords too.
  Each keyword is lower-cased, limited to the allowed characters and stripped
  of leading/trailing separators (backslash, hyphen, space, slash, underscore);
  keywords that end up empty are discarded.

  Parameters:
    row: mapping (e.g. a dataframe row) with "keywords" (list) and "title" (str).

  Returns:
    str(list_of_keywords).
  """
  import re

  if row["keywords"] != []:
    return str( row["keywords"] )

  global count_empty
  count_empty = count_empty + 1

  remaining_title = get_keywords( row["title"] )
  keywords = []
  for known_keyword in non_generated_keywords:
    if not known_keyword:
      continue
    # Match whole words/phrases only: a plain substring test would find "tb"
    # inside "outbreak" and leave broken word fragments behind.
    pattern = r"(?<![a-z0-9])" + re.escape( known_keyword ) + r"(?![a-z0-9])"
    remaining_title, hits = re.subn( pattern, "", remaining_title )
    if hits:
      keywords.append( known_keyword )
  keywords.extend( remaining_title.split( ' ' ) )

  valid_characters = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\\- /_0123456789')
  # as such: !"#$%&'()*+,.:;<=>?@[]^`{|}~ are NOT allowed in keyword names
  separators = "\\- /_"

  # Build a new list instead of editing `keywords` while looping over it; the
  # previous in-place remove/append skipped items, could raise ValueError on
  # tokens such as "--", and trimmed the wrong end for trailing separators.
  cleaned_keywords = []
  for keyword in keywords:
    keyword = ''.join(c for c in keyword.lower() if c in valid_characters)
    keyword = keyword.strip( separators )
    if keyword:
      cleaned_keywords.append( keyword )

  return str( cleaned_keywords )


# Pass 1 gathers the existing keywords; pass 2 fills in keywords for articles without any.
df.apply(determine_keywords, axis=1)
df["keywords"] = df.apply(generate_keywords, axis=1)
total_papers = df.shape[0]
print( f"Papers with NO keywords: {count_empty}/{total_papers}" ) # most were empty
print( f"Papers with keywords: {count_non_empty}/{total_papers}" )

Stop words retrieved from "stop_words.csv"!
Papers with NO keywords: 8/9
Papers with keywords: 1/9


In [10]:
# Getting the article language and institution (only way, for the language at least) ...

import urllib.request
from bs4 import BeautifulSoup

def get_page(url):
  """Download `url` and return its HTML parsed as a BeautifulSoup document.

  The body is decoded as UTF-8. The connection is opened in a `with` block so
  it is closed even when reading or decoding fails. No parser is named, so
  BeautifulSoup chooses one itself.

  Raises whatever urllib raises for unreachable URLs / HTTP errors, and
  UnicodeDecodeError if the body is not valid UTF-8.
  """
  with urllib.request.urlopen(url) as fp:
    html_page = fp.read().decode("utf8")
  soup = BeautifulSoup(html_page)
  return soup

def get_institution(row):
  """Return the first author institution listed on the article's web page.

  The page at row["url"] is fetched with get_page() and the "content" of its
  first <meta name="citation_author_institution"> tag is returned. Returns None
  when the page has no such tag (or the tag has no content), instead of
  raising TypeError and aborting the whole dataframe apply.
  """
  soup = get_page( row[ "url" ] )
  # Here extracting paragraphs from the abstract ... yet maybe not all of them have paragraphs

  # for i in soup.find("div", {"id": "abstract"}).find_all("strong", {"class": "sub-title"}):
  #   p = i.parent
  #   i.extract()
  #   print( "\"" + str( p.text ).strip() + "\"" )
  institution_tag = soup.find("meta", {"name": "citation_author_institution"})
  if institution_tag is None:
    return None
  return institution_tag.get("content")

def get_language(row):
  """Return the article language announced on the article's web page.

  The page at row["url"] is fetched with get_page() and the "content" of its
  <meta name="citation_language"> tag is returned. Returns None when the page
  has no such tag (or the tag has no content), instead of raising TypeError
  and aborting the whole dataframe apply.
  """
  soup = get_page( row[ "url" ] )
  language_tag = soup.find("meta", {"name": "citation_language"})
  if language_tag is None:
    return None
  return language_tag.get("content")

# Add one column per scraped value. Both helpers call get_page(row["url"]) themselves,
# so every article page is downloaded twice by these two lines.
df['institution'] = df.apply(get_institution, axis=1)
df['language'] = df.apply(get_language, axis=1)

In [11]:
# Getting all the journal data that is available through the ISSN number ...

def get_issn_data ( row ):
  """Scrape the journal record of row["issn"] from portal.issn.org as a JSON string.

  Inside the element with id "tab0", every <span> of the block with class
  "item-result-content-text" is read as a field label; the label is removed from
  its parent and the parent's remaining text is the field value. Labels (colons
  removed) and values are stripped and lower-cased. A label seen more than once
  maps to the list of its values, in page order.

  Parameters:
    row: mapping (e.g. a dataframe row) with an "issn" entry.

  Returns:
    A JSON object (str); "{}" when the row has no usable ISSN or the page does
    not contain the expected blocks.
  """
  issn = row["issn"]
  if not isinstance(issn, str) or not issn.strip():
    # Missing ISSN (None/NaN/blank): nothing to look up, and str + None would raise.
    return json.dumps({})

  soup = get_page( "https://portal.issn.org/resource/ISSN/" + issn )
  container = soup.find(attrs={"id" : "tab0"})
  info_container = None
  if container is not None:
    info_container = container.find(attrs={"class" : "item-result-content-text"})
  if info_container is None:
    return json.dumps({})

  journal_data={}
  for span in info_container.find_all('span'):
    attr = span.text
    parent = span.parent
    span.extract() # take the label out so that parent.text is only the value

    attr = attr.replace(":", "").strip().lower()
    value = parent.text.strip().lower()
    if attr not in journal_data:
      journal_data[ attr ] = value
    elif isinstance( journal_data[ attr ], list ):
      journal_data[ attr ].append( value )
    else:
      # second occurrence of this label: switch from a single value to a list
      journal_data[ attr ] = [ journal_data[ attr ], value ]
  return json.dumps(journal_data)

# Store the journal record scraped for each row's ISSN (a JSON string) in a new "issn_data" column.
df["issn_data"]=df.apply(get_issn_data, axis=1)

In [12]:
# This should function too, yet I don't know if it's ok to bypass HTTP 403: "Forbidden" by changing the 'User-Agent' ... kinda the only way to get the full text/html of the article ...

# def get_pmc_data (row):
#   if ( row["pmc"] == None ):
#     return None

#   from urllib.request import Request, urlopen
#   site = "https://pmc.ncbi.nlm.nih.gov/articles/PMC"+row["pmc"]+"/"
#   hdr = {'User-Agent': 'Mozilla/5.0'} # Either this, or it detects that it is not a regular browser and restricts access ... Is this a problem?
#   req = Request(site,headers=hdr)
#   page = urlopen(req)
#   soup = BeautifulSoup(page, 'html.parser')

#   # container = soup.find(attrs={"class" : "body main-article-body"}) # the article without the title and so on ...
#   article = soup.find("article")
#   return article

# def get_pmc_data_html (row):
#   article = get_pmc_data(row)
#   if ( article != None ):
#     return str(article)
#   else:
#     return None

# def get_pmc_data_text (row):
#   article = get_pmc_data(row)
#   if ( article != None ):
#     return article.get_text()
#   else:
#     return None

# df["full_article_html"]=df.apply(get_pmc_data_html, axis=1)
# df["full_article_text"]=df.apply(get_pmc_data_text, axis=1)

In [13]:
# Remove duplicated rows (not sure if necessary), keeping the first copy of each.
# Everything is converted to str first because list/dict cells cannot be hashed for the comparison.
# The previous keep=False discarded EVERY copy of a duplicated row, so an article whose
# PMID was listed twice disappeared from the data entirely.

df = df.astype(str).drop_duplicates()

In [14]:
# Saving only PMID/citation data to a CSV file. This file is meant for creating a citation network ...
# (two columns: "pmid" and the JSON "references" string; the dataframe index is not written)
df[["pmid", "references"]].to_csv("/content/drive/MyDrive/citations.csv", encoding='utf-8', index=False)

# Saving the whole dataframe data to a CSV too ... (UTF-8, without the index column)
df.to_csv("/content/drive/MyDrive/extracted_features.csv", encoding='utf-8', index=False)

In [15]:
# Printing the full article data (disable if you have tons of articles) ...

def print_row ( iter ):
  """Print row number `iter` (positional) of the global `df` as a two-column table.

  A header and a 41-character rule come first, then one "column name / value"
  line per dataframe column (both left-aligned in 20-character fields), then
  blank lines as a separator.
  """
  line_template = "{:<20} {:<20}"
  print(line_template.format("Attribute", "Value"))
  print("-" * 41)
  record = df.iloc[iter]
  for column_name, cell in zip(df.columns, record):
    print(line_template.format(str(column_name), str(cell)))
  print("\n\n")

# Print every row of df, one table per article, in row order.
for k in range(0, df.shape[0] ):
  print_row(k)

Attribute            Value               
-----------------------------------------
abstract             BACKGROUND: Hospitals in sub-Saharan Africa are inundated with HIV-infected patients and tuberculosis (TB) is the commonest opportunistic infection in this sub-group. Up to one third of TB-HIV co-infected patients fail to produce a sputum sample (sputum scarce) and diagnosis is thus often delayed or missed. We investigated the sensitivity of urine-based methods (Xpert MTB/RIF, LAM strip test and LAM ELISA) in such patients.
METHODOLOGY/PRINCIPAL FINDINGS: 281 HIV-infected hospitalised patients with clinically suspected TB provided a spot urine sample. The reference standard was culture positivity for Mycobacterium tuberculosis on ≥1 sputum or extra-pulmonary sample. MTB/RIF was performed using 1 ml of both unprocessed and, when possible, concentrated urine. Each unconcentrated urine sample was also tested using the Clearview LAM ELISA and Alere LAM strip test. 42% (116/242) of patie

In [16]:
# So, those are the features that were kept so far ...

print(df.columns.values)

['abstract' 'authors' 'chemicals' 'citation' 'doi' 'history' 'issn'
 'issue' 'journal' 'keywords' 'mesh' 'pmc' 'pmid' 'publication_types'
 'title' 'url' 'volume' 'volume_issue' 'xml' 'year' 'author_first'
 'references' 'institution' 'language' 'issn_data']


In [17]:
# I did not code this and it seemed interesting ... why is there a try/catch? are not all reviews public? might some have been deleted?

# num = 0
# while num < CD012768.shape[0]:
#   pmid = pmids[num]
#   try:
#     article = fetch.article_by_pmid(pmid)
#     titles[pmid] = article.title
#     abstracts[pmid] = article.abstract
#   except:
#     pass
#   else:
#     print(num)
#     num += 1