Skip to content

Commit

Permalink
LinkedIn: Support download without login (ytdl-org#21860)) and subtit…
Browse files Browse the repository at this point in the history
  • Loading branch information
Erez Volk committed Dec 24, 2019
1 parent 2dbc096 commit 04a93ac
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 5 deletions.
13 changes: 13 additions & 0 deletions test/test_subtitles.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
RTVEALaCartaIE,
FunnyOrDieIE,
DemocracynowIE,
LinkedInLearningIE,
)


Expand Down Expand Up @@ -219,6 +220,18 @@ def test_allsubtitles(self):
self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7')


class TestLinkedInSubtitles(BaseTestSubtitles):
    # Subtitle extraction check for a LinkedIn Learning video: with all
    # subtitles requested, exactly one English track with known content
    # is expected.
    url = 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true'
    IE = LinkedInLearningIE

    def test_allsubtitles(self):
        # Ask the downloader to write every available subtitle track.
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        found = self.getSubtitles()
        # Only an English track should come back ...
        self.assertEqual(set(found.keys()), set(['en']))
        # ... and its content must hash to the recorded checksum.
        self.assertEqual(md5(found['en']), 'b329730e94e7fbdbac0307b3cad1221a')


class TestNPOSubtitles(BaseTestSubtitles):
url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860'
IE = NPOIE
Expand Down
53 changes: 48 additions & 5 deletions youtube_dl/extractor/linkedin.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ExtractorError,
float_or_none,
int_or_none,
srt_subtitles_timecode,
urlencode_postdata,
urljoin,
)
Expand All @@ -31,10 +32,16 @@ def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
})
sub = ' %dp' % resolution
api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
cookies = self._get_cookies(api_url)

headers = {}
if 'JSESSIONID' in cookies:
headers['Csrf-Token'] = cookies['JSESSIONID'].value

return self._download_json(
api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
}, query=query)['elements'][0]
api_url, video_slug, 'Downloading%s JSON metadata' % sub,
headers=headers,
query=query)['elements'][0]

def _get_urn_id(self, video_data):
urn = video_data.get('urn')
Expand All @@ -47,12 +54,14 @@ def _get_video_id(self, video_data, course_slug, video_slug):
return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)

def _real_initialize(self):
# We need the JSESSIONID from the login page, even if we're not logging in
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')

email, password = self._get_login_info()
if email is None:
return

login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
action_url = urljoin(self._LOGIN_URL, self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
default='https://www.linkedin.com/uas/login-submit', group='url'))
Expand Down Expand Up @@ -126,15 +135,49 @@ def _real_extract(self, url):

self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))

subtitles = self.extract_subtitles(video_data)

return {
'id': self._get_video_id(video_data, course_slug, video_slug),
'title': title,
'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'),
'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
'duration': int_or_none(video_data.get('durationInSeconds')),
'subtitles': subtitles,
}

def _get_subtitles(self, video_data):
transcript = video_data.get('transcript')
if not transcript:
return {}
lines = transcript.get('lines')
if not lines:
return {}
fixed_subs = self._fix_subtitles(lines)
if fixed_subs:
return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
return {}

def _fix_subtitles(self, lines):
srt = ''
seq_counter = 0
for pos in range(0, len(lines) - 1):
seq_current = lines[pos]
seq_next = lines[pos + 1]

appear_time = self._timecode(seq_current['transcriptStartAt'])
disappear_time = self._timecode(seq_next['transcriptStartAt'])
text = seq_current['caption'].strip()

if text:
seq_counter += 1
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
return srt

def _timecode(self, ms):
    """Format ``ms`` as an SRT timecode.

    The value is divided by 1000.0 (float division) and handed to
    ``srt_subtitles_timecode``, whose result is returned unchanged.
    """
    seconds = ms / 1000.0
    return srt_subtitles_timecode(seconds)


class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
IE_NAME = 'linkedin:learning:course'
Expand Down

0 comments on commit 04a93ac

Please sign in to comment.