Skip to content

Commit

Permalink
Merge pull request #795 from sharkykh/fix-lxml
Browse files Browse the repository at this point in the history
Fix lxml parsing for Addic7ed
  • Loading branch information
fernandog committed Nov 18, 2017
2 parents 53ddf7f + 5720309 commit ccc0013
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion subliminal/providers/addic7ed.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')

#: Series cell matching regex: non-greedily matches each ``<td class="version">...</td>``
#: cell in a bytes document; DOTALL lets a single cell span multiple lines
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)

#: Series header parsing regex: a series name, optionally followed by a
#: four-digit year in parentheses (captured as the ``series`` and ``year`` groups)
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')

Expand Down Expand Up @@ -137,7 +140,16 @@ def _get_show_ids(self):
logger.info('Getting show ids')
r = self.session.get(self.server_url + 'shows.php', timeout=10)
r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

# The lxml parser seems to fail when parsing Addic7ed.com's HTML markup.
# The last lxml version known to work properly is 3.6.4 (the next version, 3.7.0, fails).
# Assuming the site's markup is malformed, strip the page down to only the
# show cells that are needed before handing it to the parser.
show_cells = re.findall(show_cells_re, r.content)
if show_cells:
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
else:
# If the regex finds no show cells, fall back to the original r.content and parse it with 'html.parser' only
soup = ParserBeautifulSoup(r.content, ['html.parser'])

# populate the show ids
show_ids = {}
Expand Down

0 comments on commit ccc0013

Please sign in to comment.