In [8]:
import bs4, requests, re
from bibtexparser.bparser import BibTexParser

In [19]:
# Query settings for the ACM Digital Library article being scraped.
mode_layout = 'flat'    # sent as the `preflayout` query parameter
article_id = '24216'    # id of the article to look up
expformat = 'bibtex'    # export format requested from downformats.cfm

domain = 'https://dl.acm.org/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0',
}

# Citation page: <domain>citation.cfm?id=<article_id>&preflayout=<mode_layout>
article = 'citation.cfm?id={0}&'.format(article_id)
parametrs = 'preflayout={0}'.format(mode_layout)
url = ''.join((domain, article, parametrs))

# Export endpoint, e.g.
# https://dl.acm.org/downformats.cfm?id=2492591&parent_id=&expformat=bibtex
export_path = 'downformats.cfm?id={0}&parent_id=&expformat={1}'.format(article_id, expformat)
url_download_bibtex = domain + export_path

# Show both URLs, one per line.
print(url)
print(url_download_bibtex)

https://dl.acm.org/citation.cfm?id=24216&preflayout=flat
https://dl.acm.org/downformats.cfm?id=24216&parent_id=&expformat=bibtex


In [10]:
# Download the citation page and parse its HTML; later cells query `soup`.
response = requests.get(url=url, headers=headers)
response.raise_for_status()  # stop here rather than parse an HTTP error page
soup = bs4.BeautifulSoup(response.text, 'lxml')

# Download the BibTeX export and keep a copy on disk as bibtex.bib.
bibtex_response = requests.get(url=url_download_bibtex, headers=headers)
bibtex_response.raise_for_status()
# An explicit encoding keeps the write/read round trip independent of the
# platform default; `with` guarantees the handle is flushed and closed.
with open('bibtex.bib', 'w', encoding='utf-8') as bibtex_file:
    bibtex_file.write(bibtex_response.text)

# Parse the saved file into a list of entry dicts (one dict per BibTeX entry).
with open('bibtex.bib', 'r', encoding='utf-8') as bibtex_file:
    bibtex_dict = BibTexParser(interpolate_strings=False).parse_file(bibtex_file).entries

In [11]:
# Display the entries parsed from bibtex.bib.
bibtex_dict

[{'ENTRYTYPE': 'inproceedings',
  'ID': 'Parsai:2017:DMS:3121245.3121249',
  'acmid': '3121249',
  'address': 'New York, NY, USA',
  'author': 'Parsai, Ali and Demeyer, Serge',
  'booktitle': 'Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
  'doi': '10.1145/3121245.3121249',
  'isbn': '978-1-4503-5155-3',
  'keywords': 'Dynamic Mutant Subsumption, Mutant Subsumption, Mutation Testing, Software Testing',
  'location': 'Paderborn, Germany',
  'numpages': '4',
  'pages': '1--4',
  'publisher': 'ACM',
  'series': 'A-TEST 2017',
  'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
  'url': 'http://doi.acm.org/10.1145/3121245.3121249',
  'year': '2017'}]

In [12]:
# Start the article record from the lookup id/URL plus the fields of the
# first parsed BibTeX entry; a field absent from the entry is stored as None.
first_entry = bibtex_dict[0]

article_data = {
    'id': article_id,
    'url': url,
    'title': first_entry.get('title'),
    'doi': first_entry.get('doi'),
    'year': first_entry.get('year'),
}

# Show the record next to the raw entries for comparison.
article_data, bibtex_dict

({'doi': '10.1145/3121245.3121249',
  'id': '3121249',
  'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
  'url': 'https://dl.acm.org/citation.cfm?id=3121249&preflayout=flat',
  'year': '2017'},
 [{'ENTRYTYPE': 'inproceedings',
   'ID': 'Parsai:2017:DMS:3121245.3121249',
   'acmid': '3121249',
   'address': 'New York, NY, USA',
   'author': 'Parsai, Ali and Demeyer, Serge',
   'booktitle': 'Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
   'doi': '10.1145/3121245.3121249',
   'isbn': '978-1-4503-5155-3',
   'keywords': 'Dynamic Mutant Subsumption, Mutant Subsumption, Mutation Testing, Software Testing',
   'location': 'Paderborn, Germany',
   'numpages': '4',
   'pages': '1--4',
   'publisher': 'ACM',
   'series': 'A-TEST 2017',
   'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
   'url': 'http://doi.acm.org/10.1145/3121245.3121249',
   'year': '2017'}])

In [13]:
# Collect every author on the citation page as a {'name', 'url'} record.
divmain = soup.find('div', id='divmain')

# Author cells are selected by this exact combination of <td> attributes.
author_cell_attrs = {'style': 'padding-right:3px;', 'valign': 'top', 'nowrap': 'nowrap'}
authors_tags = divmain.find_all('td', author_cell_attrs)

# The profile link inside each cell is site-relative, so prefix the domain.
authors = [
    {'name': cell.text.strip(), 'url': domain + cell.find('a')['href']}
    for cell in authors_tags
]

article_data['authors'] = authors
article_data

{'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&preflayout=flat',
 'year': '2017'}

In [14]:
# The abstract is read from the first 'flatbody' div nested in the 'layout' div.
layout = soup.find('div', class_='layout')
flatbody = layout.find('div', class_='flatbody')

# Keep the div's text with surrounding whitespace removed.
abstract = flatbody.get_text().strip()
article_data['abstract'] = abstract

article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&preflayout=flat',


In [17]:
# Find the table cell labelled "Conference"; the venue details are read from
# the node two siblings after it (presumably skipping a whitespace text node
# between the two cells; confirm against the page markup).
td = soup.find('td', string='Conference')

if td:
    td = td.next_sibling.next_sibling
    venue = td.strong.text.strip()
    url_conference = td.a['href']
else:
    # No conference row on this page: record the venue as unknown.
    venue = None
    url_conference = None

# Build an absolute URL only when a link was found. Concatenating a
# placeholder string would store the bogus address 'https://dl.acm.org/None'.
article_data['venue'] = {'name': venue,
                        'url': domain + url_conference if url_conference else None}

article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'cited_by': [],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&p

In [16]:
# Position of the 'flatbody' div whose links are recorded as citing articles.
CITED_BY_SECTION_INDEX = 3

flatbody = soup.find_all('div', {'class': 'flatbody'})

# A page with fewer 'flatbody' divs has no such section, so the list stays
# empty instead of raising IndexError.
if len(flatbody) > CITED_BY_SECTION_INDEX:
    # href=True skips anchors without an href attribute. The stored value is
    # the text after the last '=' in the link (presumably the article id;
    # confirm against the page markup).
    cited_by = [
        link['href'].split('=')[-1]
        for link in flatbody[CITED_BY_SECTION_INDEX].find_all('a', href=True)
    ]
else:
    cited_by = []

article_data['cited_by'] = cited_by

article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'cited_by': [],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&p