In [1]:
from datetime import datetime as dt
from youtube_transcript_api import YouTubeTranscriptApi as youtube

from khan_helpers.constants import RAW_DIR

Experiment & Participant classes, helper functions, and variables used across multiple notebooks can be found in `/mnt/code/khan_helpers/khan_helpers`, or on GitHub, [here](https://github.com/contextlab/efficient-learning-khan/tree/master/code/khan_helpers).<br />You can also view source code directly from the notebook with:<br /><pre>    from khan_helpers.functions import show_source<br />    show_source(foo)</pre>

In [2]:
def _format_timestamp(seconds):
    """Format a non-negative offset in seconds as "MM:SS.f" (total minutes)."""
    # Pure arithmetic instead of `datetime.fromtimestamp`, which converts
    # to *local* time (shifting the minutes field in time zones with a
    # non-whole-hour UTC offset) and wraps the minutes field at one hour.
    # Microsecond resolution matches what "%f" produced previously.
    total_us = round(seconds * 10**6)
    total_secs, micros = divmod(total_us, 10**6)
    mins, secs = divmod(total_secs, 60)
    ts_str = '{:02d}:{:02d}.{:06d}'.format(mins, secs, micros).rstrip('0')
    # whole-second values would otherwise end in a bare "." (e.g. "00:05.")
    if ts_str.endswith('.'):
        ts_str += '0'
    return ts_str


def transcript_to_str(transcript_data):
    """
    Convert timestamped transcript chunks to a newline-delimited string.

    Each chunk contributes two lines: its start time, then its text with
    embedded newlines replaced by spaces. One final line holds the end
    time of the last chunk (its start plus its duration).

    Timestamps are written as minutes:seconds with a fractional part
    whose trailing zeros are stripped (e.g., 1.71 -> "00:01.71"), always
    keeping at least one fractional digit (5.0 -> "00:05.0"). Minutes
    are total elapsed minutes, so offsets of an hour or more do not wrap
    (3725.5 -> "62:05.5").

    Parameters
    ----------
    transcript_data : iterable of dict
        Transcript chunks in order. Each must have a 'text' key (str)
        and 'start' and 'duration' keys (numbers, in seconds).

    Returns
    -------
    str
        The formatted transcript, or an empty string if
        `transcript_data` contains no chunks.
    """
    transcript_lines = []
    last_chunk = None
    for chunk in transcript_data:
        transcript_lines.append(_format_timestamp(chunk['start']))
        transcript_lines.append(chunk['text'].replace('\n', ' '))
        last_chunk = chunk

    if last_chunk is None:
        # no chunks, so there is no end time to report
        return ''

    # timestamp resolution is ms, rounding just deals with floating point errors
    end_time = round(last_chunk['start'] + last_chunk['duration'], 3)
    transcript_lines.append(_format_timestamp(end_time))
    return '\n'.join(transcript_lines)

In [3]:
# YouTube video IDs of the two lectures whose transcripts are fetched below
# ("forces" opens with the four fundamental forces; "bos" is presumably
# short for the "Birth of Stars" lecture -- TODO confirm)
forces_video_id = 'FEF6PxWOvsk'
bos_video_id = 'i-NNWI8Ccas'

In [4]:
# Fetch each lecture's transcript chunks, then render them as a single
# timestamped string (alternating timestamp / text lines)
forces_transcript_data = youtube.get_transcript(forces_video_id)
forces_transcript = transcript_to_str(forces_transcript_data)

bos_transcript_data = youtube.get_transcript(bos_video_id)
bos_transcript = transcript_to_str(bos_transcript_data)

In [5]:
# Sanity check: show the first 8 lines (4 timestamp/text pairs) of the
# formatted "forces" transcript
forces_transcript.splitlines()[:8]

['00:00.294',
 'What I want to do in this video is',
 '00:01.71',
 'give a very high-level overview of the four fundamental forces',
 '00:08.16',
 'of the universe.',
 '00:09.28',
 "And I'm going to start with gravity."]

In [6]:
# Sanity check: show the first 8 lines (4 timestamp/text pairs) of the
# formatted "bos" transcript
bos_transcript.splitlines()[:8]

['00:00.52',
 "Let's imagine we have a huge cloud",
 '00:02.65',
 'of hydrogen atoms floating in space.',
 '00:05.47',
 'Huge, and when I say huge cloud, huge both in distance',
 '00:08.24',
 'and in mass.']

In [7]:
# Save each timestamped transcript as a text file under RAW_DIR. The
# encoding is passed explicitly so the bytes written do not depend on the
# platform's locale default (transcripts may contain non-ASCII characters).
# `write_text` returns the number of characters written, which the cell
# displays for the last file.
RAW_DIR.joinpath('forces_transcript_timestamped.txt').write_text(
    forces_transcript, encoding='utf-8'
)
RAW_DIR.joinpath('bos_transcript_timestamped.txt').write_text(
    bos_transcript, encoding='utf-8'
)

9074