In [2]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
import urllib.request, urllib.parse, urllib.error
from urllib3.util.retry import Retry
import re
import ssl
import json
import numpy as np
import pandas as pd
import time
import math
from collections import Counter
import string
import codecs
from itertools import chain
from google.colab import files
# import Levenshtein as lev

In [12]:
# Takes a parsed PubMed record (soup) and extracts the bibliographic fields used by the rest of the notebook.
def get_bibliography(soup):
    """Extract bibliographic fields from a parsed PubMed efetch record.

    Parameters
    ----------
    soup : object exposing ``find`` / ``find_all`` in the BeautifulSoup style.
        Tag names are looked up in lower case (``'authorlist'``,
        ``'articleidlist'``, ...).

    Returns
    -------
    dict with the keys ``ArticleTitle``, ``journal_title``, ``pubmed``, ``doi``,
    ``country``, ``authors``, ``references``, ``keywords`` and ``year``.
    Scalar fields that are absent from the record are returned as ``''``,
    list fields as ``[]``; the function never raises for a missing tag.
    """

    def _first_text(node, tag):
        # Text of the first <tag> below node, or '' when node or tag is missing.
        found = node.find(tag) if node else None
        return found.text if found else ''

    def _article_id(id_list, idtype):
        # Text of the <articleid idtype=...> entry inside id_list, or ''.
        found = id_list.find('articleid', {'idtype': idtype}) if id_list else None
        return found.text if found else ''

    article = soup.find('article')

    ##################### country #########################
    country = _first_text(soup, 'country')

    ###################### authors ########################
    authors = []
    authorlist = soup.find('authorlist')
    if authorlist:
        # Iterate over the tags directly instead of re-running find_all() per index.
        for author_tag in authorlist.find_all('author'):
            authors.append({
                'lastname': _first_text(author_tag, 'lastname'),
                'ForeName': _first_text(author_tag, 'forename'),
                'initial': _first_text(author_tag, 'initials'),
                'Affiliation': [aff.text for aff in author_tag.find_all('affiliation')],
            })

    ##################### keywords #########################
    keywords = []
    keywordlist = soup.find('keywordlist')
    if keywordlist:
        keywords = [str(keyword_tag.text) for keyword_tag in keywordlist.find_all('keyword')]

    ###################### references ########################
    references = []
    referencelist = soup.find('referencelist')
    if referencelist:
        for id_list in referencelist.find_all('articleidlist'):
            references.append({
                "pubmed": _article_id(id_list, 'pubmed'),
                "doi": _article_id(id_list, 'doi'),
            })

    #################### ArticleTitle ##########################
    # An <article> without <articletitle> yields '' instead of raising AttributeError.
    ArticleTitle = _first_text(article, 'articletitle')

    ###################### journal_title ########################
    journal_title = _first_text(soup, 'title')

    ###################### year ########################
    # Prefer <pubdate>; fall back to <pubmedpubdate> when <pubdate> is missing
    # OR carries no <year> child (previously the fallback was skipped in that case).
    year = ''
    for date_tag in ('pubdate', 'pubmedpubdate'):
        year = _first_text(soup.find(date_tag), 'year')
        if year:
            break

    ##################### pubmed & doi #########################
    # Only the id list that sits directly under <pubmeddata> describes the record
    # itself; id lists nested elsewhere (e.g. in the reference list) are skipped.
    own_id_list = None
    for id_list in soup.find_all('articleidlist'):
        if id_list.parent is not None and id_list.parent.name == 'pubmeddata':
            own_id_list = id_list
            break
    pubmed = _article_id(own_id_list, 'pubmed')
    doi = _article_id(own_id_list, 'doi')

    ##############################################
    result = {
        'ArticleTitle': ArticleTitle,
        'journal_title': journal_title,
        'pubmed': pubmed,
        'doi': doi,
        'country': country,
        'authors': authors,
        'references': references,
        'keywords': keywords,
        'year': year,
    }
    return result





def fetchData(id, max_attempts=None, timeout=30):
    """Download one PubMed record through E-utilities efetch and parse it.

    Parameters
    ----------
    id : str or int
        PubMed identifier; converted with ``str()`` before being sent.
    max_attempts : int or None, optional
        Maximum number of request attempts. ``None`` (default) keeps retrying
        until a response is received, as the function always did.
    timeout : float, optional
        Seconds to wait for the server before an attempt counts as failed, so a
        stalled connection is retried instead of blocking forever.

    Returns
    -------
    dict
        The result of ``get_bibliography`` applied to the response body.

    Raises
    ------
    requests.exceptions.RequestException
        Only when ``max_attempts`` is set and every attempt failed.

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any other
    body and simply yields empty fields.
    This function no longer replaces ``ssl._create_default_https_context``:
    ``requests`` does not consult that hook, so the override only disabled
    certificate verification for unrelated urllib callers.
    """
    # Fixed pause before every request to space out consecutive calls.
    time.sleep(0.5)

    # Same scheme as the esearch URL used elsewhere in this notebook.
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "retmode": "xml", "id": str(id)}

    # Transport-level retries for connection failures; the loop below handles the rest.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    attempt = 0
    # One session for all attempts, closed deterministically on exit.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        while True:
            attempt += 1
            try:
                response = session.get(url, params=params, timeout=timeout)
                break
            except requests.exceptions.RequestException as e:
                print('fetching error:: ', str(e))
                if max_attempts is not None and attempt >= max_attempts:
                    raise
                print("Connection refused by the server..")
                print("sleep for 2 seconds")
                time.sleep(2)
                print("continue...")

    page_xml = response.content
    soup = BeautifulSoup(page_xml, "html.parser")

    articleData = get_bibliography(soup)

    return articleData

In [None]:
if __name__ == "__main__":
    # Search PubMed (esearch) for a user-supplied term, fetch every hit with
    # fetchData, and export the parsed records as two JSON files (one per half
    # of the requested result count), each offered as a Colab download.
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=NUM&sort=relevance&term=KEYWORD"

    # Maximum number of fetchData calls per id before giving up on a PubMed id.
    MAX_FETCH_ATTEMPTS = 5

    # ask the user to provide the keyword and number of results and subsequently replace these elements in the url string
    keyword = str(input('Please enter the keyword '))
    num = int(input('Please enter the number of results '))
    url = url.replace('NUM', str(num))
    # Percent-encode the term: urlopen rejects URLs containing raw spaces.
    # '+' is left untouched so terms already typed as "a+b" produce the same URL as before.
    url = url.replace('KEYWORD', urllib.parse.quote(keyword, safe='+'))

    # NOTE(review): this disables HTTPS certificate verification for every
    # urllib request in the process. Kept because urlopen below relies on it in
    # environments without a CA bundle; remove once that is confirmed unnecessary.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # Legacy Python that doesn't verify HTTPS certificates by default
        pass
    else:
        # Handle target environment that doesn't support HTTPS verification
        ssl._create_default_https_context = _create_unverified_https_context

    webpage = urllib.request.urlopen(url).read()
    dict_page = json.loads(webpage)
    idlist = dict_page["esearchresult"]["idlist"]

    # Results are exported in two files of `fold` records each.
    fold = math.ceil(num/2)
    for i in range(2):
        selected_idlist = idlist[fold*i:fold*(i+1)]
        articles_list = []

        for offset, pmid in enumerate(selected_idlist):
            # Position of this id in the full result list (enumerate instead of
            # list.index, which is O(n) and wrong for repeated ids).
            position = offset + fold*i

            # Re-fetch while the parsed record has no PubMed id, at most
            # MAX_FETCH_ATTEMPTS times. The previous `counter == counter+1`
            # compared instead of incrementing, so the limit never applied.
            article = {"pubmed": ""}
            attempts = 0
            while article["pubmed"] == "" and attempts < MAX_FETCH_ATTEMPTS:
                article = fetchData(pmid)
                attempts += 1

            print(position, " : ", pmid)
            article['id'] = str(position)
            articles_list.append(article)

        # Serialise this half and hand it to the browser.
        file_name = str(fold*i)+"to"+ str(fold*(i+1)) + '.json'
        with open(file_name, "w") as outfile:
            json.dump(articles_list, outfile, indent=4)

        files.download(file_name)
      

# pre processing the results


In [None]:
# Mount Google Drive at /content/drive; the cells below read their input files from that path.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Load the previously exported article records (JSON) from the mounted Drive folder.
data = []
with open('/content/drive/My Drive/Colab Notebooks/data_4262/covid19_ml_4262.json') as records_file:
    data = json.load(records_file)

In [None]:
# Keep an article when at least two of the search terms occur (as substrings,
# case-insensitively) in its title, or at least two occur in its joined keyword list.
kw = 'covid machine learning'
kwsplit = kw.split(' ')  # loop-invariant, computed once
MIN_TERM_HITS = 2


def _dump_and_download(records, file_name):
    """Write `records` to `file_name` as indented JSON and trigger a Colab download."""
    with open(file_name, "w") as outfile:
        json.dump(records, outfile, indent=4)
    files.download(file_name)


filterByKwData = []
not_included = []
for item in data:
    title = item['ArticleTitle'].lower()
    itemKwList = ' '.join(item['keywords']).lower()
    titlecounter = sum(1 for kwitem in kwsplit if kwitem in title)
    kwcounter = sum(1 for kwitem in kwsplit if kwitem in itemKwList)
    # Partition in a single pass instead of a quadratic `x not in filterByKwData` scan afterwards.
    if titlecounter >= MIN_TERM_HITS or kwcounter >= MIN_TERM_HITS:
        filterByKwData.append(item)
    else:
        not_included.append(item)


print('included: ', len(filterByKwData))
included_file_name = 'included_'+str(len(filterByKwData))+'.json'
_dump_and_download(filterByKwData, included_file_name)


# sublist which does not match the search terms
print('not included: ', len(not_included))
not_included_file_name = 'not_included_'+str(len(not_included))+'.json'
_dump_and_download(not_included, not_included_file_name)


# included articles with an empty keyword list, exported for further review
included_null_keywords = [x for x in filterByKwData if x['keywords'] == []]
print('include null keyword: ', len(included_null_keywords))
# Distinct prefix: this file used to be named 'not_included_<n>.json' and could
# overwrite the not-included export when the two counts were equal.
included_null_keywords_file_name = 'included_null_keywords_'+str(len(included_null_keywords))+'.json'
_dump_and_download(included_null_keywords, included_null_keywords_file_name)

In [None]:
# Count how many of the included articles have each field empty.
allitems = len(filterByKwData)

nullArticleTitle = [item for item in filterByKwData if item['ArticleTitle'] == '']
nulljournal_title = [item for item in filterByKwData if item['journal_title'] == '']
nullpubmed = [item for item in filterByKwData if item['pubmed'] == '']
nullcountry = [item for item in filterByKwData if item['country'] == '']
# This list was declared and printed but never filled, so it always reported 0.
# NOTE(review): get_bibliography stores the DOI under 'doi'; records lacking
# that key are counted as empty here - confirm against the schema of the loaded file.
nulldoi_pii_str = [item for item in filterByKwData if item.get('doi', '') == '']
nullauthors = [item for item in filterByKwData if len(item['authors']) == 0]
nullreferences = [item for item in filterByKwData if len(item['references']) == 0]
nullkeywords = [item for item in filterByKwData if len(item['keywords']) == 0]
# Articles that have neither keywords nor references.
nullrfkw = [item for item in filterByKwData
            if len(item['keywords']) == 0 and len(item['references']) == 0]
nullyear = [item for item in filterByKwData if item['year'] == '']


print('all : ', allitems)
print('nullArticleTitle : ', len(nullArticleTitle))
print('nulljournal_title : ', len(nulljournal_title))
print('nullpubmed : ', len(nullpubmed))
print('nullcountry : ', len(nullcountry))
print('nulldoi_pii_str : ', len(nulldoi_pii_str))
print('nullauthors : ', len(nullauthors))
print('nullreferences : ', len(nullreferences))
print('nullkeywords : ', len(nullkeywords))
print('nullyear : ', len(nullyear))
# Previously computed but never reported.
print('nullrfkw : ', len(nullrfkw))

In [None]:
def _distribution_frame(distribution):
    """Return a ('Key', 'Value') DataFrame with one row per counted value, in counter order."""
    # Built in one call; growing a frame row by row with .loc is quadratic.
    return pd.DataFrame(list(distribution.items()), columns=['Key', 'Value'])


def _save_csv_and_download(frame, file_name):
    """Write `frame` to `file_name` as CSV and trigger a Colab download."""
    frame.to_csv(file_name)
    files.download(file_name)


# extract list of unique countries with their frequency distribution
all_countries = [item['country'] for item in filterByKwData]
countries_distribution = Counter(all_countries)
all_unique_countries = np.unique(all_countries)
countries_distribution_num = len(countries_distribution)
countries_distribution_df = _distribution_frame(countries_distribution)
_save_csv_and_download(countries_distribution_df, 'countries_distribution.csv')


# extract list of unique journals with their frequency distribution
all_journals = [item['journal_title'] for item in filterByKwData]
journals_distribution = Counter(all_journals)
all_unique_journals = np.unique(all_journals)
journals_distribution_num = len(journals_distribution)
journals_distribution_df = _distribution_frame(journals_distribution)
_save_csv_and_download(journals_distribution_df, 'journals_distribution.csv')


# extract list of unique years with their frequency distribution
all_years = [item['year'] for item in filterByKwData]
years_distribution = Counter(all_years)
years_distribution_num = len(years_distribution)
years_distribution_df = _distribution_frame(years_distribution)
_save_csv_and_download(years_distribution_df, 'years_distribution.csv')


# extract list of unique keywords with their frequency distribution
keywordslist = []
keywordslist_with_articleid = []
for item in filterByKwData:
    for kw_item in item['keywords']:
        # Dedicated name: assigning to `kw` here overwrote the search string set by the filter cell.
        lowered_keyword = kw_item.lower()
        keywordslist.append(lowered_keyword)
        keywordslist_with_articleid.append((item['pubmed'], lowered_keyword))

keyword_distribution = Counter(keywordslist)
# Keywords ordered from most to least frequent.
unique_keywordslist = [keyword for keyword, _ in keyword_distribution.most_common()]

keyword_distribution_num = len(keyword_distribution)
keyword_distribution_df = _distribution_frame(keyword_distribution)
_save_csv_and_download(keyword_distribution_df, 'keyword_distribution.csv')


# extract all keywords in data with their article pubmed id for further preprocessing,
# e.g. detecting semantically similar keywords
# NOTE(review): a later cell reads 'keywordlist_with_articleid.csv' and expects a
# 'pubmed' column, while this export is named 'keywordslist_with_articleid.csv' with
# an 'articlePubmedId' column - confirm how the file on Drive was produced.
keywordslist_with_articleid_df = pd.DataFrame(keywordslist_with_articleid, columns=('articlePubmedId', 'keyword'))
_save_csv_and_download(keywordslist_with_articleid_df, 'keywordslist_with_articleid.csv')


print(countries_distribution)
print(journals_distribution)
print(years_distribution)
print(keyword_distribution)
print(len(unique_keywordslist))

In [25]:
# Extract the last two comma-separated parts of every author affiliation (the last
# one is mostly the country) in order to build a co-country network.
def _affiliation_tail(affiliation):
    """Return (last, second_to_last) comma-separated parts of `affiliation`, normalised.

    Both parts are stripped and lower-cased; dots are additionally removed from
    the last part. When the affiliation contains no comma there is no
    second-to-last part and '' is returned for it (the previous index
    arithmetic wrapped around and repeated the last part).
    """
    parts = affiliation.split(',')
    last_part = parts[-1].replace('.', '').strip().lower()
    second_to_last = parts[-2].strip().lower() if len(parts) > 1 else ''
    return last_part, second_to_last


countrylist = []
countrylist_with_articleid = []
for item in filterByKwData:
    for author in item['authors']:
        for affiliation in author['Affiliation']:
            first_affil, second_affil = _affiliation_tail(affiliation)
            countrylist_with_articleid.append({
                "pubmed": item["pubmed"],
                "firstAffil": first_affil,
                "secondAffil": second_affil,
            })

countrylist_with_articleid_df = pd.DataFrame(countrylist_with_articleid)
countrylist_with_articleid_df.to_csv('countrylist_with_articleid.csv')
files.download('countrylist_with_articleid.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# Load the (article id, keyword) table from Drive; it is the source for the
# co-keyword source/target extraction in the following cells.
keywordslist_with_articleid = []
keyword_csv_path = '/content/drive/My Drive/Colab Notebooks/data_4262/keywordlist_with_articleid.csv'
with open(keyword_csv_path) as keyword_csv:
    keywordslist_with_articleid = pd.read_csv(keyword_csv)
keywordslist_with_articleid.head()

In [None]:
# Build every keyword pair that co-occurs within one article.
kwdf_pubmed = list(keywordslist_with_articleid['pubmed'].unique())
kwdf_keyword = list(keywordslist_with_articleid['keyword'].unique())
kw_lst = list(keywordslist_with_articleid)  # column labels of the frame

# Rows whose keyword was parsed as NaN are dropped; calling .strip() on them raised.
# Group once (in first-appearance order) instead of re-filtering the whole frame per article.
keywords_by_article = (
    keywordslist_with_articleid
    .dropna(subset=['keyword'])
    .groupby('pubmed', sort=False)['keyword']
)

co_keywords = []
for _, article_keywords in keywords_by_article:
    id_kws = [str(article_kw).strip() for article_kw in article_keywords]
    # Pairs keep the order in which the keywords are listed for the article,
    # so (a, b) and (b, a) are distinct entries.
    for index1, kw1 in enumerate(id_kws):
        for kw2 in id_kws[index1+1:]:
            co_keywords.append((kw1, kw2))

print(len(co_keywords))

In [None]:
# Turn the co-occurring pairs into a weighted edge list (source, target, weight),
# ordered from the most to the least frequent pair, and export it as CSV.
ranked_pairs = Counter(co_keywords).most_common()
print(ranked_pairs)

kw_df1 = pd.DataFrame(ranked_pairs, columns=["tuple", "weight"])
edge_endpoints = kw_df1['tuple'].tolist()
final_kw_df = pd.DataFrame(edge_endpoints, columns=["source", "target"])
final_kw_df["weight"] = kw_df1["weight"]

final_kw_df.to_csv('co_keyword.csv')
files.download('co_keyword.csv')

In [None]:
# Build weighted co-keyword pairs directly from the JSON records.
# The earlier version iterated over `keywordslist_with_articleid` (not a list of
# keywords) and rescanned every article for every pair of entries, which is why
# it took so long; here each article contributes its own pairs exactly once.

# Distinct keywords, stripped and lower-cased, in alphabetical order.
lst = sorted({kw_text.strip().lower() for item in filterByKwData for kw_text in item["keywords"]})

pair_weights = Counter()
for item in filterByKwData:
    # De-duplicate within the article so that weight counts articles, and sort so
    # that each pair is always stored as (earlier, later) in alphabetical order.
    itemKwList = sorted({kw_text.strip().lower() for kw_text in item["keywords"]})
    for index1, kw1 in enumerate(itemKwList):
        for kw2 in itemKwList[index1+1:]:
            pair_weights[(kw1, kw2)] += 1

# (kw1, kw2, weight) triples, ordered by kw1 then kw2; only pairs that co-occur are present.
co_keywors_list = [(kw1, kw2, weight) for (kw1, kw2), weight in sorted(pair_weights.items())]

# kw_df = pd.DataFrame(co_keywors_list, columns=('kw1', 'kw2', 'weight'))
# kw_df.to_csv('co_keywords.csv') 
# files.download('co_keywords.csv')


In [None]:
# try the Levenshtein (edit distance) algorithm to find near-duplicate keywords
def clean_text(txt):
    """Trim surrounding whitespace, delete ASCII punctuation, and lower-case `txt`."""
    # Translation table mapping every punctuation code point to "delete".
    punctuation_remover = {ord(symbol): None for symbol in string.punctuation}
    return txt.strip().translate(punctuation_remover).lower()


def _edit_distance(a, b):
    """Levenshtein distance between strings `a` and `b` (pure Python, two-row DP)."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            current.append(min(
                previous[j] + 1,                       # deletion
                current[j - 1] + 1,                    # insertion
                previous[j - 1] + (char_a != char_b),  # substitution
            ))
        previous = current
    return previous[-1]


# Only the most frequent keywords are compared; pairs at most this many edits
# apart (after clean_text) are treated as the same keyword.
TOP_N_KEYWORDS = 100
MAX_EDIT_DISTANCE = 5

# Snapshot of the candidates: unique_keywordslist is modified in place below.
candidates = unique_keywordslist[:TOP_N_KEYWORDS]
cleaned = [clean_text(candidate) for candidate in candidates]
removed = set()

for index1, kw1 in enumerate(cleaned):
    if index1 in removed:
        # Already merged into an earlier keyword; must not delete others in turn.
        continue
    # Compare with later entries only, so each pair is visited once and the
    # earlier (more frequent) keyword is the one that survives.
    for index2 in range(index1 + 1, len(cleaned)):
        if index2 in removed:
            continue
        kw2 = cleaned[index2]
        distance = _edit_distance(kw1, kw2)
        # Distance 0 between two different entries means they differ only by
        # punctuation or outer whitespace, so they are duplicates as well.
        if distance <= MAX_EDIT_DISTANCE:
            print(kw1," vs ", kw2 ," : ", distance)
            print("delete: ", kw2)
            # Remove the original entry, not its cleaned form, which may not be in the list.
            unique_keywordslist.remove(candidates[index2])
            removed.add(index2)

print(len(unique_keywordslist))
