LinkedIn: Support download without login (ytdl-org#21860) and subtitles (ytdl-org#21879)
ErezVolk · Dec 24, 2019 · 04a93ac · 04a93ac
1 parent 2dbc096
commit 04a93ac
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 5 deletions.
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
@@ -28,6 +28,7 @@
     RTVEALaCartaIE,
     FunnyOrDieIE,
     DemocracynowIE,
+    LinkedInLearningIE,
 )
 
 
@@ -219,6 +220,18 @@ def test_allsubtitles(self):
         self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7')
 
 
+class TestLinkedInSubtitles(BaseTestSubtitles):  # subtitle test for LinkedInLearningIE against one fixed LinkedIn Learning video URL
+    url = 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true'
+    IE = LinkedInLearningIE
+
+    def test_allsubtitles(self):  # expects exactly one subtitle language, 'en', whose content matches a pinned MD5
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(set(subtitles.keys()), set(['en']))
+        self.assertEqual(md5(subtitles['en']), 'b329730e94e7fbdbac0307b3cad1221a')  # hash of subtitles['en']: any change to the generated subtitle output requires updating it
+
+
 class TestNPOSubtitles(BaseTestSubtitles):
     url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860'
     IE = NPOIE

diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py
@@ -8,6 +8,7 @@
     ExtractorError,
     float_or_none,
     int_or_none,
+    srt_subtitles_timecode,
     urlencode_postdata,
     urljoin,
 )
@@ -31,10 +32,16 @@ def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
             })
             sub = ' %dp' % resolution
         api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
+        cookies = self._get_cookies(api_url)
+
+        headers = {}
+        if 'JSESSIONID' in cookies:
+            headers['Csrf-Token'] = cookies['JSESSIONID'].value
+
         return self._download_json(
-            api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
-                'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
-            }, query=query)['elements'][0]
+            api_url, video_slug, 'Downloading%s JSON metadata' % sub,
+            headers=headers,
+            query=query)['elements'][0]
 
     def _get_urn_id(self, video_data):
         urn = video_data.get('urn')
@@ -47,12 +54,14 @@ def _get_video_id(self, video_data, course_slug, video_slug):
         return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
 
     def _real_initialize(self):
+        # We need the JSESSIONID from the login page, even if we're not logging in
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
         email, password = self._get_login_info()
         if email is None:
             return
 
-        login_page = self._download_webpage(
-            self._LOGIN_URL, None, 'Downloading login page')
         action_url = urljoin(self._LOGIN_URL, self._search_regex(
             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
             default='https://www.linkedin.com/uas/login-submit', group='url'))
@@ -126,15 +135,49 @@ def _real_extract(self, url):
 
         self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
 
+        subtitles = self.extract_subtitles(video_data)
+
         return {
             'id': self._get_video_id(video_data, course_slug, video_slug),
             'title': title,
             'formats': formats,
             'thumbnail': video_data.get('defaultThumbnail'),
             'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
             'duration': int_or_none(video_data.get('durationInSeconds')),
+            'subtitles': subtitles,
         }
 
+    def _get_subtitles(self, video_data):  # returns {'en': [{'ext': 'srt', 'data': ...}]} built from video_data['transcript']['lines'], or {} when nothing usable is present
+        transcript = video_data.get('transcript')
+        if not transcript:  # key missing or value empty
+            return {}
+        lines = transcript.get('lines')
+        if not lines:  # no transcript lines to convert
+            return {}
+        fixed_subs = self._fix_subtitles(lines)
+        if fixed_subs:  # _fix_subtitles returns '' when it emitted no cue
+            return {'en': [{'ext': 'srt', 'data': fixed_subs}]}  # language code is hard-coded to 'en'
+        return {}
+
+    def _fix_subtitles(self, lines):  # builds one SRT string from transcript entries, reading each entry's 'transcriptStartAt' and 'caption'
+        srt = ''
+        seq_counter = 0  # cue number written to the output; advanced only for non-empty captions
+        for pos in range(0, len(lines) - 1):  # NOTE(review): stops one short of the end, so the last entry only supplies an end time and its own caption is never written - confirm intended
+            seq_current = lines[pos]
+            seq_next = lines[pos + 1]
+
+            appear_time = self._timecode(seq_current['transcriptStartAt'])  # direct indexing, no .get(): a missing key raises rather than being skipped
+            disappear_time = self._timecode(seq_next['transcriptStartAt'])  # a cue ends where the following entry starts
+            text = seq_current['caption'].strip()
+
+            if text:  # captions that are empty after strip() produce no cue
+                seq_counter += 1
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)  # cue number, time range, text, blank separator; CRLF line endings
+        return srt
+
+    def _timecode(self, ms):  # formats a time offset via srt_subtitles_timecode; presumably ms is in milliseconds (it is divided by 1000.0 before the call) - confirm against the API payload
+        return srt_subtitles_timecode(ms / 1000.0)
+
 
 class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
     IE_NAME = 'linkedin:learning:course'