Merge pull request #795 from sharkykh/fix-lxml

Fix lxml parsing for Addic7ed
Diaoul · Nov 18, 2017 · ccc0013 · ccc0013
2 parents 53ddf7f + 5720309
commit ccc0013
Showing 1 changed file with 13 additions and 1 deletion.
diff --git a/subliminal/providers/addic7ed.py b/subliminal/providers/addic7ed.py
@@ -19,6 +19,9 @@
 
 language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
 
+#: Regex matching each `<td class="version">...</td>` show cell in the shows page markup
+show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)
+
 #: Series header parsing regex
 series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')
 
@@ -137,7 +140,16 @@ def _get_show_ids(self):
         logger.info('Getting show ids')
         r = self.session.get(self.server_url + 'shows.php', timeout=10)
         r.raise_for_status()
-        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
+
+        # The lxml parser seems to fail when parsing Addic7ed.com's HTML markup.
+        # The last lxml version known to work properly is 3.6.4 (the next version, 3.7.0, fails).
+        # Assuming the site's markup is bad, strip it down to only the cells that are needed.
+        show_cells = re.findall(show_cells_re, r.content)
+        if show_cells:
+            soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
+        else:
+            # If the regex finds no cells, fall back to the original r.content and use only 'html.parser'
+            soup = ParserBeautifulSoup(r.content, ['html.parser'])
 
         # populate the show ids
         show_ids = {}