In [1]:
import bs4, requests, re
from bibtexparser.bparser import BibTexParser

In [2]:
# Request configuration: which article to fetch and which query-string
# values to send with it.
article_id = '2786848'
mode_layout = 'flat'   # sent as the `preflayout` query parameter
expformat = 'bibtex'   # sent as the `expformat` query parameter

domain = 'https://dl.acm.org/'

# Citation page URL, assembled as: domain + "citation.cfm?id=<id>&" + "preflayout=<layout>"
article = 'citation.cfm?id={0}&'.format(article_id)
parametrs = 'preflayout={0}'.format(mode_layout)
url = ''.join((domain, article, parametrs))

# Export endpoint; example of the resulting shape:
# https://dl.acm.org/downformats.cfm?id=2492591&parent_id=&expformat=bibtex
bibtex_path = 'downformats.cfm?id={0}&parent_id=&expformat={1}'.format(article_id, expformat)
url_download_bibtex = domain + bibtex_path

# Firefox User-Agent string passed as a header on every request made with `headers`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0',
}

# Show both URLs, one per line.
print(url)
print(url_download_bibtex)

https://dl.acm.org/citation.cfm?id=2786848&preflayout=flat
https://dl.acm.org/downformats.cfm?id=2786848&parent_id=&expformat=bibtex


In [3]:
# Seconds to wait on each HTTP request, so a stalled connection cannot hang
# the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Fetch the citation page and parse its HTML; `soup` is what the scraping
# cells query.
response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # stop here instead of parsing an HTTP error page
soup = bs4.BeautifulSoup(response.text, 'lxml')

# Download the BibTeX export and save it to bibtex.bib. The `with` blocks
# close the file handles, and the explicit encoding keeps non-ASCII text
# from depending on the platform's default locale.
bibtex_response = requests.get(url=url_download_bibtex, headers=headers, timeout=REQUEST_TIMEOUT)
bibtex_response.raise_for_status()
with open('bibtex.bib', 'w', encoding='utf-8') as bibtex_file:
    bibtex_file.write(bibtex_response.text)

# Parse the saved file; `.entries` is a list with one dict per BibTeX entry.
with open('bibtex.bib', 'r', encoding='utf-8') as bibtex_file:
    bibtex_dict = BibTexParser(interpolate_strings=False).parse_file(bibtex_file).entries

In [4]:
# Display the parsed BibTeX entries (a list of dicts) as the cell output.
bibtex_dict

[{'ENTRYTYPE': 'inproceedings',
  'ID': 'Pereira:2010:RFA:1808920.1808928',
  'acmid': '1808928',
  'address': 'New York, NY, USA',
  'author': "Pereira, Tha\\'{\\i}s Alves Burity and dos Santos, Vinicius Souza and Ribeiro, Bruno Luna and Elias, Gl\\^{e}dson",
  'booktitle': 'Proceedings of the 2Nd International Workshop on Recommendation Systems for Software Engineering',
  'doi': '10.1145/1808920.1808928',
  'isbn': '978-1-60558-974-9',
  'keywords': 'global software development, global software teams, recommendation systems, software product line',
  'location': 'Cape Town, South Africa',
  'numpages': '5',
  'pages': '36--40',
  'publisher': 'ACM',
  'series': "RSSE '10",
  'title': 'A Recommendation Framework for Allocating Global Software Teams in Software Product Line Projects',
  'url': 'http://doi.acm.org/10.1145/1808920.1808928',
  'year': '2010'}]

In [5]:
# Basic metadata is read from the first parsed BibTeX entry. When nothing was
# parsed, fall back to an empty dict so every BibTeX-derived field becomes
# None instead of the cell failing with IndexError.
bibtex_entry = bibtex_dict[0] if bibtex_dict else {}

article_data = {
    'id': article_id,
    'url': url,
    # dict.get already returns None for a missing key.
    'title': bibtex_entry.get('title'),
    'doi': bibtex_entry.get('doi'),
    'year': bibtex_entry.get('year'),
}

# Display the record so far next to the raw BibTeX entries.
article_data, bibtex_dict

({'doi': '10.1145/1808920.1808928',
  'id': '1808928',
  'title': 'A Recommendation Framework for Allocating Global Software Teams in Software Product Line Projects',
  'url': 'https://dl.acm.org/citation.cfm?id=1808928&preflayout=flat',
  'year': '2010'},
 [{'ENTRYTYPE': 'inproceedings',
   'ID': 'Pereira:2010:RFA:1808920.1808928',
   'acmid': '1808928',
   'address': 'New York, NY, USA',
   'author': "Pereira, Tha\\'{\\i}s Alves Burity and dos Santos, Vinicius Souza and Ribeiro, Bruno Luna and Elias, Gl\\^{e}dson",
   'booktitle': 'Proceedings of the 2Nd International Workshop on Recommendation Systems for Software Engineering',
   'doi': '10.1145/1808920.1808928',
   'isbn': '978-1-60558-974-9',
   'keywords': 'global software development, global software teams, recommendation systems, software product line',
   'location': 'Cape Town, South Africa',
   'numpages': '5',
   'pages': '36--40',
   'publisher': 'ACM',
   'series': "RSSE '10",
   'title': 'A Recommendation Framework for 

In [6]:
divmain = soup.find('div', id='divmain')

# Links to author pages and to institution pages inside the main container.
# The patterns are raw strings with the dot escaped, so `\?` is a regex escape
# rather than an invalid string escape, and `id=` must match literally.
authors_tags = divmain.find_all('a', href=re.compile(r'author_page\.cfm\?id='))
affiliation_tags = divmain.find_all('a', href=re.compile(r'inst_page\.cfm\?id='))
affiliations = []  # never filled in this cell; kept in case other cells reference the name

# Authors and affiliations are paired purely by position, and zip() stops at
# the shorter list.
# NOTE(review): if some author has no institution link, the pairs after that
# author would be shifted -- confirm against the page markup.
authors_and_affiliations = []
for author, affiliation in zip(authors_tags, affiliation_tags):
    authors_and_affiliations.append({
        # Use the link text directly: the earlier repr()/quote-stripping
        # round-trip removed apostrophes from names and escaped some characters.
        'name': author.text.strip(),
        'url': domain + author['href'],
        'affiliation': {
            'name': affiliation.text.strip(),
            'url': domain + affiliation['href'],
        },
    })

article_data['authors_and_affiliations'] = authors_and_affiliations
article_data

{'authors_and_affiliations': [{'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Thaís Alves Burity Pereira',
   'url': 'https://dl.acm.org/author_page.cfm?id=81440619424&coll=DL&dl=ACM&trk=0'},
  {'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Vinicius Souza dos Santos',
   'url': 'https://dl.acm.org/author_page.cfm?id=81464645284&coll=DL&dl=ACM&trk=0'},
  {'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Bruno Luna Ribeiro',
   'url': 'https://dl.acm.org/author_page.cfm?id=81319500397&coll=DL&dl=ACM&trk=0'},
  {'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Glêdson Elias',
   'url': 'https://dl.acm.org/author_page.cfm?id=81100533001&co

In [7]:
# The abstract is read from the first `flatbody` div found inside the
# `layout` div.
layout = soup.find('div', attrs={'class': 'layout'})
flatbody = layout.find('div', attrs={'class': 'flatbody'})

# Keep the text with surrounding whitespace removed, both as `abstract` and
# on the article record.
abstract = flatbody.get_text().strip()
article_data['abstract'] = abstract

article_data

{'abstract': 'In order to improve software quality and reduce costs and deadlines, many companies are adopting Software Product Line approaches. As a consequence of globalization, another common practice is the adoption of Global Software Development approaches, which seek to find more qualified workforce and more attractive costs in companies distributed around the world. Taking into account the benefits of both approaches, the ramework proposed in this paper has the goal of aiding the management of global software teams involved in the implementation phase of an SPL project, providing recommendations on how to allocate the teams to the set of software components, which are initially specified in the SPL architecture and must be subsequently implemented.',
 'authors_and_affiliations': [{'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Thaís Alves Burity Pereira',
   'url': 'https://dl.acm.org/autho

In [8]:
# Locate the table cell whose text is exactly "Conference".
td = soup.find('td', string='Conference')

if td:
    # Two sibling hops from the label cell to the cell holding the value.
    # NOTE(review): presumably the first hop lands on a whitespace text node
    # between the two cells -- confirm against the page markup.
    td = td.next_sibling.next_sibling
    venue = td.strong.text.strip()
    url_conference = td.a['href']
else:
    venue = None
    # A real None (not the string 'None') so no bogus ".../None" URL is built.
    url_conference = None

article_data['venue'] = {
    'name': venue,
    'url': (domain + url_conference) if url_conference else None,
}

article_data

{'abstract': 'In order to improve software quality and reduce costs and deadlines, many companies are adopting Software Product Line approaches. As a consequence of globalization, another common practice is the adoption of Global Software Development approaches, which seek to find more qualified workforce and more attractive costs in companies distributed around the world. Taking into account the benefits of both approaches, the ramework proposed in this paper has the goal of aiding the management of global software teams involved in the implementation phase of an SPL project, providing recommendations on how to allocate the teams to the set of software components, which are initially specified in the SPL architecture and must be subsequently implemented.',
 'authors_and_affiliations': [{'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Thaís Alves Burity Pereira',
   'url': 'https://dl.acm.org/autho

In [9]:
# Every `flatbody` div on the page; the links are read from the one at index 3.
# NOTE(review): presumably that div is the "cited by" section -- confirm
# against the page markup.
flatbody = soup.find_all('div', attrs={'class': 'flatbody'})

# For each link, keep what follows the last '=' in its href.
cited_by = [link['href'].rsplit('=', 1)[-1] for link in flatbody[3].find_all('a')]

article_data['cited_by'] = cited_by

article_data

{'abstract': 'In order to improve software quality and reduce costs and deadlines, many companies are adopting Software Product Line approaches. As a consequence of globalization, another common practice is the adoption of Global Software Development approaches, which seek to find more qualified workforce and more attractive costs in companies distributed around the world. Taking into account the benefits of both approaches, the ramework proposed in this paper has the goal of aiding the management of global software teams involved in the implementation phase of an SPL project, providing recommendations on how to allocate the teams to the set of software components, which are initially specified in the SPL architecture and must be subsequently implemented.',
 'authors_and_affiliations': [{'affiliation': {'name': 'Federal University of Paraíba, Brazil',
    'url': 'https://dl.acm.org/inst_page.cfm?id=60011324'},
   'name': 'Thaís Alves Burity Pereira',
   'url': 'https://dl.acm.org/autho