In [None]:
import bs4, requests, re
from bibtexparser.bparser import BibTexParser

In [37]:
# --- Request parameters -------------------------------------------------
article_id = '3121245'    # ACM Digital Library identifier of the record to scrape
mode_layout = 'flat'      # value sent as the `preflayout` query parameter
expformat = 'bibtex'      # value sent as the `expformat` query parameter

# Browser-like User-Agent sent with every request made by this notebook.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'}

# --- URL assembly -------------------------------------------------------
domain = 'https://dl.acm.org/'

# Citation page: <domain>citation.cfm?id=<article_id>&preflayout=<mode_layout>
article = 'citation.cfm?id={0}&'.format(article_id)
parametrs = 'preflayout={0}'.format(mode_layout)
url = ''.join((domain, article, parametrs))

# Citation export endpoint, e.g.
# https://dl.acm.org/downformats.cfm?id=2492591&parent_id=&expformat=bibtex
download_path = 'downformats.cfm?id={0}&parent_id=&expformat={1}'.format(article_id, expformat)
url_download_bibtex = domain + download_path

# Show both URLs, one per line.
print('\n'.join((url, url_download_bibtex)))

https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat
https://dl.acm.org/downformats.cfm?id=3121245&parent_id=&expformat=bibtex


In [38]:
# Fetch the citation page and parse it into a BeautifulSoup tree.
response = requests.get(url=url, headers=headers)
# Fail loudly on HTTP errors instead of scraping an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'lxml')

# Download the BibTeX export for the same record.
bibtex_response = requests.get(url=url_download_bibtex, headers=headers)
bibtex_response.raise_for_status()

# Write the export to disk. The context manager guarantees the data is flushed
# and the handle closed before the file is reopened for reading below; an
# explicit encoding keeps the write/read round trip platform-independent.
with open('bibtex.bib', 'w', encoding='utf-8') as bibtex_file:
    bibtex_file.write(bibtex_response.text)

# Parse the saved file; `.entries` holds the parsed BibTeX records.
with open('bibtex.bib', 'r', encoding='utf-8') as bibtex_file:
    bibtex_dict = BibTexParser(interpolate_strings=False).parse_file(bibtex_file).entries

In [39]:
# Display the parsed BibTeX entries for inspection.
bibtex_dict

[{'ENTRYTYPE': 'proceedings',
  'ID': 'Vos:2017:3121245',
  'address': 'New York, NY, USA',
  'isbn': '978-1-4503-5155-3',
  'location': 'Paderborn, Germany',
  'publisher': 'ACM',
  'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
  'year': '2017'}]

In [40]:
# Start the record with the identifiers already known from the configuration.
article_data = dict(id=article_id, url=url)

# Copy the bibliographic fields from the first BibTeX entry; a field that is
# absent from the entry is stored as None.
article_data.update(
    (field, bibtex_dict[0].get(field)) for field in ('title', 'doi', 'year')
)

# Show the record next to the raw BibTeX entries it was built from.
article_data, bibtex_dict

({'doi': None,
  'id': '3121245',
  'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
  'url': 'https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat',
  'year': '2017'},
 [{'ENTRYTYPE': 'proceedings',
   'ID': 'Vos:2017:3121245',
   'address': 'New York, NY, USA',
   'isbn': '978-1-4503-5155-3',
   'location': 'Paderborn, Germany',
   'publisher': 'ACM',
   'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
   'year': '2017'}])

In [41]:
# All author cells live inside the main content container.
divmain = soup.find('div', id='divmain')

# Author cells are matched by their exact combination of presentation attributes.
authors_tags = divmain.find_all('td', {'style': 'padding-right:3px;', 'valign': 'top',  'nowrap': 'nowrap'})

# One record per author: the visible name plus the profile link, which is
# stored relative to the site root in the page.
authors = [
    {'name': cell.get_text().strip(),
     'url': domain + cell.a['href']}
    for cell in authors_tags
]

article_data['authors'] = authors
article_data

{'authors': [{'name': 'Tanja Vos',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100095172&coll=DL&dl=ACM&trk=0'},
  {'name': 'Sigrid Eldh',
   'url': 'https://dl.acm.org/author_page.cfm?id=81317493250&coll=DL&dl=ACM&trk=0'},
  {'name': 'Wishnu Prasetya',
   'url': 'https://dl.acm.org/author_page.cfm?id=81385600565&coll=DL&dl=ACM&trk=0'}],
 'doi': None,
 'id': '3121245',
 'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
 'url': 'https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat',
 'year': '2017'}

In [42]:
# The abstract text is taken from the first 'flatbody' block inside the
# page's 'layout' container.
layout = soup.find('div', class_='layout')
flatbody = layout.find('div', class_='flatbody')

# Store the block's text with surrounding whitespace removed.
abstract = flatbody.get_text().strip()
article_data['abstract'] = abstract

article_data

{'abstract': 'An abstract is not available.',
 'authors': [{'name': 'Tanja Vos',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100095172&coll=DL&dl=ACM&trk=0'},
  {'name': 'Sigrid Eldh',
   'url': 'https://dl.acm.org/author_page.cfm?id=81317493250&coll=DL&dl=ACM&trk=0'},
  {'name': 'Wishnu Prasetya',
   'url': 'https://dl.acm.org/author_page.cfm?id=81385600565&coll=DL&dl=ACM&trk=0'}],
 'doi': None,
 'id': '3121245',
 'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
 'url': 'https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat',
 'year': '2017'}

In [43]:
# Locate the table cell labelled "Conference"; the venue details are read from
# the <td> that follows it in the same row.
td = soup.find('td', string='Conference')
if td is not None:
    # find_next_sibling skips any whitespace text nodes between the two cells,
    # so the lookup does not depend on the exact markup spacing.
    td = td.find_next_sibling('td')

# Defaults used when the page has no conference information.
venue = None
url_conference = None

if td is not None:
    # The short venue name is the text of the <strong> element, the link is the
    # href of the first <a>; either may be missing.
    if td.strong is not None:
        venue = td.strong.text.strip()
    if td.a is not None:
        url_conference = td.a.get('href')

# Only build an absolute URL when a link was actually found; otherwise store
# None rather than a bogus '<domain>None' address.
article_data['venue'] = {'name': venue,
                         'url': (domain + url_conference) if url_conference else None}

article_data

{'abstract': 'An abstract is not available.',
 'authors': [{'name': 'Tanja Vos',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100095172&coll=DL&dl=ACM&trk=0'},
  {'name': 'Sigrid Eldh',
   'url': 'https://dl.acm.org/author_page.cfm?id=81317493250&coll=DL&dl=ACM&trk=0'},
  {'name': 'Wishnu Prasetya',
   'url': 'https://dl.acm.org/author_page.cfm?id=81385600565&coll=DL&dl=ACM&trk=0'}],
 'doi': None,
 'id': '3121245',
 'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
 'url': 'https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat',
 'venue': {'name': 'FSE', 'url': 'https://dl.acm.org/event.cfm?id=RE201'},
 'year': '2017'}

In [45]:
# Position of the section whose links are collected as citing articles.
# NOTE(review): presumably the "Cited By" section of the flat layout — confirm
# against the current page structure.
cited_by_index = 3

flatbody = soup.find_all('div', {'class': 'flatbody'})

# Collect the value after the last '=' of every link in that section (the
# trailing query-parameter value of the href). Pages with fewer sections yield
# an empty list instead of an IndexError, and anchors without an href are skipped.
cited_by = []
if len(flatbody) > cited_by_index:
    cited_by = [a['href'].split('=')[-1]
                for a in flatbody[cited_by_index].find_all('a', href=True)]

article_data['cited_by'] = cited_by

article_data

{'abstract': 'An abstract is not available.',
 'authors': [{'name': 'Tanja Vos',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100095172&coll=DL&dl=ACM&trk=0'},
  {'name': 'Sigrid Eldh',
   'url': 'https://dl.acm.org/author_page.cfm?id=81317493250&coll=DL&dl=ACM&trk=0'},
  {'name': 'Wishnu Prasetya',
   'url': 'https://dl.acm.org/author_page.cfm?id=81385600565&coll=DL&dl=ACM&trk=0'}],
 'cited_by': [],
 'doi': None,
 'id': '3121245',
 'title': 'A-TEST 2017: Proceedings of the 8th ACM SIGSOFT International Workshop on Automated Software Testing',
 'url': 'https://dl.acm.org/citation.cfm?id=3121245&preflayout=flat',
 'venue': {'name': 'FSE', 'url': 'https://dl.acm.org/event.cfm?id=RE201'},
 'year': '2017'}