In [1]:
import bs4, requests, re
from bibtexparser.bparser import BibTexParser

In [2]:
# Request configuration: which article to scrape and in which form.
mode_layout = 'flat'     # value sent as the `preflayout` query parameter
article_id = '1882308'   # article identifier used in both URLs below
expformat = 'bibtex'     # value sent as the `expformat` query parameter

domain = 'https://dl.acm.org/'

# Citation page URL = domain + article part + layout parameter.
article = 'citation.cfm?id={0}&'.format(article_id)
parametrs = 'preflayout={0}'.format(mode_layout)
url = ''.join((domain, article, parametrs))

# Export endpoint, shaped like:
# https://dl.acm.org/downformats.cfm?id=2492591&parent_id=&expformat=bibtex
download_query = 'downformats.cfm?id={0}&parent_id=&expformat={1}'.format(article_id, expformat)
url_download_bibtex = domain + download_query

# Browser-like User-Agent sent with every request made by this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0',
}

# Show both URLs, one per line.
print(url)
print(url_download_bibtex)

https://dl.acm.org/citation.cfm?id=1882308&preflayout=flat
https://dl.acm.org/downformats.cfm?id=1882308&parent_id=&expformat=bibtex


In [3]:
# Fetch the citation page and parse it; `soup` is what the scraping cells query.
# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail here instead of being parsed as if it were the article page.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'lxml')

# Download the BibTeX export and keep a copy on disk as 'bibtex.bib'.
bibtex_response = requests.get(url=url_download_bibtex, headers=headers, timeout=30)
bibtex_response.raise_for_status()

# `with` guarantees the handle is flushed and closed before the file is read
# back; the explicit encoding keeps write and read symmetric on every platform.
with open('bibtex.bib', 'w', encoding='utf-8') as bibtex_file:
    bibtex_file.write(bibtex_response.text)

# Parse the saved file; `.entries` is a list with one dict per BibTeX entry.
with open('bibtex.bib', 'r', encoding='utf-8') as bibtex_file:
    bibtex_dict = BibTexParser(interpolate_strings=False).parse_file(bibtex_file).entries

In [4]:
# Display the parsed BibTeX entries (a list of dicts) for inspection.
bibtex_dict

[{'ENTRYTYPE': 'inproceedings',
  'ID': 'Bachmann:2010:MLB:1882291.1882308',
  'acmid': '1882308',
  'address': 'New York, NY, USA',
  'author': 'Bachmann, Adrian and Bird, Christian and Rahman, Foyzur and Devanbu, Premkumar and Bernstein, Abraham',
  'booktitle': 'Proceedings of the Eighteenth ACM SIGSOFT International Symposium on Foundations of Software Engineering',
  'doi': '10.1145/1882291.1882308',
  'isbn': '978-1-60558-791-2',
  'keywords': 'apache, bias, case study, manual annotation, tool',
  'location': 'Santa Fe, New Mexico, USA',
  'numpages': '10',
  'pages': '97--106',
  'publisher': 'ACM',
  'series': "FSE '10",
  'title': 'The Missing Links: Bugs and Bug-fix Commits',
  'url': 'http://doi.acm.org/10.1145/1882291.1882308',
  'year': '2010'}]

In [5]:
# Start the article record from the request parameters and the BibTeX export.
# If the export parsed to zero entries, indexing bibtex_dict[0] would raise
# IndexError; fall back to an empty entry so the BibTeX-derived fields become
# None, which is how a missing individual field is already represented.
bibtex_entry = bibtex_dict[0] if bibtex_dict else {}

article_data = {
    'id': article_id,
    'url': url,
    'title': bibtex_entry.get('title'),
    'doi': bibtex_entry.get('doi'),
    'year': bibtex_entry.get('year'),
}

# Show the record next to the raw entries it was built from.
article_data, bibtex_dict

({'doi': '10.1145/1882291.1882308',
  'id': '1882308',
  'title': 'The Missing Links: Bugs and Bug-fix Commits',
  'url': 'https://dl.acm.org/citation.cfm?id=1882308&preflayout=flat',
  'year': '2010'},
 [{'ENTRYTYPE': 'inproceedings',
   'ID': 'Bachmann:2010:MLB:1882291.1882308',
   'acmid': '1882308',
   'address': 'New York, NY, USA',
   'author': 'Bachmann, Adrian and Bird, Christian and Rahman, Foyzur and Devanbu, Premkumar and Bernstein, Abraham',
   'booktitle': 'Proceedings of the Eighteenth ACM SIGSOFT International Symposium on Foundations of Software Engineering',
   'doi': '10.1145/1882291.1882308',
   'isbn': '978-1-60558-791-2',
   'keywords': 'apache, bias, case study, manual annotation, tool',
   'location': 'Santa Fe, New Mexico, USA',
   'numpages': '10',
   'pages': '97--106',
   'publisher': 'ACM',
   'series': "FSE '10",
   'title': 'The Missing Links: Bugs and Bug-fix Commits',
   'url': 'http://doi.acm.org/10.1145/1882291.1882308',
   'year': '2010'}])

In [19]:
# Collect every author link in the main container together with the
# institution link paired with it.
divmain = soup.find('div', id='divmain')

# Raw-string patterns: the dot and question mark are escaped so they match
# literally, and the stray `*` after `=` (which made the `=` optional) is gone.
authors_tags = divmain.find_all('a', href=re.compile(r'author_page\.cfm\?id='))
authors_and_affiliations = []

affiliation_tags = divmain.find_all('a', href=re.compile(r'inst_page\.cfm\?id='))
affiliations = []  # not used in this cell; kept in case other cells rely on the name

# NOTE(review): zip pairs the two link lists by position and stops at the
# shorter one, so an author without an institution link would shift or drop
# later pairs -- confirm against the page markup.
for author, affiliation in zip(authors_tags, affiliation_tags):
    # Use the stripped link text directly. The previous repr()/re.sub round
    # trip removed apostrophes inside names and could leave quote characters
    # or escape sequences in the stored value.
    author_name = author.text.strip()
    affiliation_name = affiliation.text.strip()
    authors_and_affiliations.append({'name': author_name,
                                     'url': domain + author['href'],
                                     'affiliation': {'name': affiliation_name,
                                                     'url': domain + affiliation['href']}})

authors_and_affiliations

[{'affiliation': {'name': 'University of Zurich, Zurich, Switzerland',
   'url': 'https://dl.acm.org/inst_page.cfm?id=60012614'},
  'name': 'Adrian Bachmann',
  'url': 'https://dl.acm.org/author_page.cfm?id=81442607702&coll=DL&dl=ACM&trk=0'},
 {'affiliation': {'name': 'University of California, Davis, CA, USA',
   'url': 'https://dl.acm.org/inst_page.cfm?id=60014439'},
  'name': 'Christian Bird',
  'url': 'https://dl.acm.org/author_page.cfm?id=81450592307&coll=DL&dl=ACM&trk=0'},
 {'affiliation': {'name': 'University of California, Davis, CA, USA',
   'url': 'https://dl.acm.org/inst_page.cfm?id=60014439'},
  'name': 'Foyzur Rahman',
  'url': 'https://dl.acm.org/author_page.cfm?id=81472650950&coll=DL&dl=ACM&trk=0'},
 {'affiliation': {'name': 'University of California, Davis, CA, USA',
   'url': 'https://dl.acm.org/inst_page.cfm?id=60014439'},
  'name': 'Premkumar Devanbu',
  'url': 'https://dl.acm.org/author_page.cfm?id=81452608115&coll=DL&dl=ACM&trk=0'},
 {'affiliation': {'name': 'Unive

In [14]:
# Extract the abstract: take the first div with class "flatbody" found inside
# the first div with class "layout", and use its stripped text.
layout = soup.find('div', class_='layout')
flatbody = layout.find('div', class_='flatbody')

abstract = flatbody.get_text().strip()
article_data['abstract'] = abstract

# Display the record with the abstract added.
article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&preflayout=flat',


In [17]:
# Find the table cell whose text is exactly "Conference"; the venue details
# are read from a sibling of that cell.
td = soup.find('td', string='Conference')

if td is not None:
    # Two sibling hops from the label cell.
    # NOTE(review): presumably the first hop is a whitespace text node between
    # the two cells -- confirm against the page markup.
    td = td.next_sibling.next_sibling
    venue = td.strong.text.strip()
    url_conference = td.a['href']
else:
    venue = None
    # A real None, not the string 'None', so no bogus '<domain>None' URL is built.
    url_conference = None

# Only prefix the domain when a conference link was actually found.
article_data['venue'] = {'name': venue,
                         'url': domain + url_conference if url_conference is not None else None}

article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'cited_by': [],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&p

In [16]:
# Collect the identifiers of the articles linked from the fourth "flatbody"
# section of the page.
flatbody = soup.find_all('div', {'class': 'flatbody'})

# NOTE(review): index 3 is presumably the "Cited By" section of the flat
# layout -- confirm. Pages with fewer sections yield an empty list instead of
# an IndexError.
cited_by_section = flatbody[3] if len(flatbody) > 3 else None

cited_by = []
if cited_by_section is not None:
    # href=True skips anchors that have no href attribute (which would raise
    # KeyError); the identifier is whatever follows the last '=' in the link.
    cited_by = [a['href'].split('=')[-1]
                for a in cited_by_section.find_all('a', href=True)]

article_data['cited_by'] = cited_by

article_data

{'abstract': 'Many academic studies in the field of software testing rely on mutation testing to use as their comparison criteria. However, recent studies have shown that redundant mutants have a significant effect on the accuracy of their results. One solution to this problem is to use mutant subsumption to detect redundant mutants. Therefore, in order to facilitate research in this field, a mutation testing tool that is capable of detecting redundant mutants is needed. In this paper, we describe how we improved our tool, LittleDarwin, to fulfill this requirement.',
 'authors': [{'name': 'Ali Parsai',
   'url': 'https://dl.acm.org/author_page.cfm?id=99658715317&coll=DL&dl=ACM&trk=0'},
  {'name': 'Serge Demeyer',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100645445&coll=DL&dl=ACM&trk=0'}],
 'cited_by': [],
 'doi': '10.1145/3121245.3121249',
 'id': '3121249',
 'title': 'Dynamic Mutant Subsumption Analysis Using LittleDarwin',
 'url': 'https://dl.acm.org/citation.cfm?id=3121249&p