# Download a YouTube video, extract its audio, and transcribe the speech

In [1]:
#pip install pytube 
#pip install moviepy
#pip install transformers datasets torchaudio
#pip install soundfile

In [3]:
import time

import moviepy.editor as mp
import pytube
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2Tokenizer

In [4]:
# input youtube video file
################################################################
# supervised link: Morning News
# url = 'https://www.youtube.com/watch?v=ysLiABvVos8'

# youtube short
url = 'https://www.youtube.com/shorts/U0NPhFkt53Q'

# testing link: Speaker on AI Health and Education
# url = 'https://www.youtube.com/watch?v=Sby1uJ_NFIY'

# Download

In [5]:
# creating a youtube object
yt = pytube.YouTube(url)
# filtering and selecting video
filtered_yt = yt.streams.filter(only_audio=True).first()

In [6]:
# downloading the video file
start_time = time.time()
filtered_yt.download(filename=r'..\data\youtube_short.mp4')
end_time = time.time()
download_time = end_time - start_time
print(f"Download took {download_time:.3f} seconds")

Download took 0.283 seconds


# Extract

In [7]:
# audio extraction
clip = mp.AudioFileClip(filename=r'..\data\youtube_short.mp4')
clip.write_audiofile(r'..\data\morning_news.wav')

MoviePy - Writing audio in ..\data\morning_news.wav


                                                                      

MoviePy - Done.




# Transcription

In [8]:
# loading pretrained model and tokenizer
tokeniztor = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec

In [9]:
# load audio file
audio_input, sample_rate = torchaudio.load(r'..\data\morning_news.wav')

In [10]:
# preprocessing audio
input_values = tokeniztor(audio_input.squeeze().numpy(), return_tensors='pt').input_values

In [11]:
# setting the chunk size to 15 seconds in samples
chunk_size = 15 * sample_rate

In [12]:
# Split audio into chunks
chunks = [audio_input[:, i:i + chunk_size] for i in range(0, audio_input.size(1), chunk_size)]

In [13]:
# Process each chunk
transcriptions = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    input_values = tokeniztor(chunk.squeeze().numpy(), return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = logits.argmax(dim=-1)
    transcription = tokeniztor.decode(predicted_ids[0])
    transcriptions.append(transcription)

Processing chunk 1/4
Processing chunk 2/4
Processing chunk 3/4
Processing chunk 4/4


In [15]:
# Combine all transcriptions
full_transcription = " ".join(transcriptions)
print("Full Transcription:")
print(full_transcription)

Full Transcription:
A PARTU PAPAR T VE PA OTR PAR FOLPA FAYE   BASHIN RUNSHIN CA  MA CHISH BAN FAMCH A PASHTER BOSAPOUSHAL POSH O BA SHOED FL OAB WAH   WAH FORAN UL OR ASHA PAN LORID BID ALUSH E ANSHPUT LS TOA BETRBO WEA NSAMBE TRISTAAFAAN HUMN AU TOSH AN EFOMLUTME TOR HOMISHO SHORA OALL ASHHUM UTM FUR AL AS WOSH WA ER  ULIA AUSASTO BA ATOANASHTASHAN TERANPRUS MAANHPOHTA HUMBPESNATORAU U E FU O LE TUR O FINE HURTONPONNYA TFANISE SHON AS AHANSOHOL A FARNABOOL PI MIISH PRATRAGAN TALL BA BAA H SO FORID HAR FOOSHINTOLERABLE O SHA TOA FOLLASH POR TA O TERNO FE LETA T LANSH  WAE ESH OL IS PSH A TANTLTEHN HA NOTECOTATY TEOAPANNEETA TSH PANM UAS FOMSH ARUHART O HOTUA FAR PIOLD WALSHO RESH O OLSAN A TE WAHAME AR APANE AN ANTILL
