In [1]:
# Cell 1
import torch
from torchvision import transforms
from PIL import Image
import numpy as np
from model import DSN
import torch.nn as nn
import cv2
import time
import os


In [3]:
# Function to extract frames at a specified frame rate and append paths to a list
def extract_frames(video_path, output_folder, frame_rate=2):
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video file.")
        return

    frame_width = int(cap.get(3))  # Get the width of the frames
    frame_height = int(cap.get(4))  # Get the height of the frames

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can change the codec as needed
    output_path = os.path.join(output_folder, "output_video.mp4")
    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (frame_width, frame_height))

    start_time = time.time()
    frame_count = 0
    frames = []  # List to store frame paths

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        elapsed_time = time.time() - start_time
        if elapsed_time >= 1.0 / frame_rate:
            out.write(frame)
            frame_count += 1
            start_time = time.time()

            # Save the frame as an image file
            frame_filename = f"frame_{frame_count:04d}.png"
            frame_path = os.path.join(output_folder, frame_filename)
            cv2.imwrite(frame_path, frame)
            frames.append(frame_path)

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    print(f"Frames extracted: {frame_count}")
    print(f"Frames per second: {frame_rate}")
    print(f"Output video saved to: {output_path}")

    return frames




In [4]:
# Example usage
video_path = "./video/IronMan.mp4"
output_folder = "./frames"
frames = extract_frames(video_path, output_folder, frame_rate=2)

# Now 'extracted_frame_paths' contains a list of file paths for the extracted frames
print("Extracted frame paths:", frames)


Frames extracted: 121
Frames per second: 2
Output video saved to: ./frames\output_video.mp4
Extracted frame paths: ['./frames\\frame_0001.png', './frames\\frame_0002.png', './frames\\frame_0003.png', './frames\\frame_0004.png', './frames\\frame_0005.png', './frames\\frame_0006.png', './frames\\frame_0007.png', './frames\\frame_0008.png', './frames\\frame_0009.png', './frames\\frame_0010.png', './frames\\frame_0011.png', './frames\\frame_0012.png', './frames\\frame_0013.png', './frames\\frame_0014.png', './frames\\frame_0015.png', './frames\\frame_0016.png', './frames\\frame_0017.png', './frames\\frame_0018.png', './frames\\frame_0019.png', './frames\\frame_0020.png', './frames\\frame_0021.png', './frames\\frame_0022.png', './frames\\frame_0023.png', './frames\\frame_0024.png', './frames\\frame_0025.png', './frames\\frame_0026.png', './frames\\frame_0027.png', './frames\\frame_0028.png', './frames\\frame_0029.png', './frames\\frame_0030.png', './frames\\frame_0031.png', './frames\\frame

In [5]:
# Cell 2
def _get_features(frames, gpu=True, batch_size=1):
    # Load pre-trained GoogLeNet model
    googlenet = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', weights='GoogLeNet_Weights.DEFAULT')

    # Remove the classification layer (last layer) to obtain features
    googlenet = torch.nn.Sequential(*(list(googlenet.children())[:-1]))

    # Set the model to evaluation mode
    googlenet.eval()

    # Initialize a list to store the features
    features = []

    # Image preprocessing pipeline
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Iterate through frames
    for frame_path in frames:
        # Load and preprocess the frame
        input_image = Image.open(frame_path)
        input_tensor = preprocess(input_image)
        input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

        # Move the input and model to GPU if available
        if gpu:
            input_batch = input_batch.to('cuda')
            googlenet.to('cuda')

        # Perform feature extraction
        with torch.no_grad():
            output = googlenet(input_batch)

        # Append the features to the list
        features.append(output.squeeze().cpu().numpy())

    # Convert the list of features to a NumPy array
    features = np.array(features)

    return features.astype(np.float32)



In [6]:
# Cell 3
def _get_probs(features, gpu=True, mode=0):
    model_cache_key = "keyframes_rl_model_cache_" + str(mode)

    if mode == 1:
        model_path = "pretrained_model/model_1.pth.tar"
    else:
        model_path = "pretrained_model/model_0.pth.tar"
    model = DSN(in_dim=1024, hid_dim=256, num_layers=1, cell="lstm")
    if gpu:
        checkpoint = torch.load(model_path)
    else:
        checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint)
    if gpu:
        model = nn.DataParallel(model).cuda()
    model.eval()

    seq = torch.from_numpy(features).unsqueeze(0)
    if gpu: seq = seq.cuda()
    probs = model(seq)
    probs = probs.data.cpu().squeeze().numpy()
    return probs




In [7]:
print(_get_features(frames))
print(_get_probs(_get_features(frames)))

features = _get_features(frames)
print(features.shape)
print(features[0].shape)
print(_get_probs(features).shape)

Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


[[0.00076084 0.02302736 0.01229232 ... 0.         0.28099012 0.22683515]
 [0.00292103 0.03626517 0.07445761 ... 0.02032194 0.2840359  0.4727426 ]
 [0.01197546 0.03877074 0.0361792  ... 0.02937937 0.1893131  0.01219838]
 ...
 [0.29286134 0.1444928  0.6816671  ... 0.21294546 0.45816123 0.10849144]
 [0.20330043 0.24632838 0.7150742  ... 0.10341235 0.38244024 0.08002178]
 [0.42814285 0.05499694 0.34291217 ... 0.09384909 0.2126687  0.01221014]]


Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


[0.92428774 0.9405393  0.9403023  0.947059   0.95628273 0.9597538
 0.9563509  0.9603559  0.96274513 0.95856637 0.95754725 0.95754963
 0.9563159  0.9566882  0.9583453  0.9625568  0.9575483  0.95946056
 0.9624249  0.96213436 0.9647548  0.9651764  0.96348166 0.95760465
 0.9573023  0.9483761  0.9586274  0.96309924 0.9626103  0.9611359
 0.9619318  0.96068597 0.9581939  0.9624936  0.96567476 0.9605381
 0.96248174 0.9563921  0.95681167 0.95889807 0.96175766 0.960153
 0.96098644 0.9651984  0.9620243  0.96110964 0.96562564 0.9664658
 0.96179694 0.96532124 0.96164066 0.95916677 0.95536464 0.95793706
 0.9554321  0.957518   0.9589432  0.9617849  0.9616793  0.96390814
 0.96845955 0.96928865 0.96415085 0.9699055  0.9697425  0.9693654
 0.968953   0.9685338  0.97116166 0.9659987  0.9653397  0.9650486
 0.9665922  0.96858764 0.9641196  0.9665096  0.9669485  0.9676694
 0.9621605  0.9650141  0.9621402  0.9645135  0.96676564 0.9643471
 0.961055   0.9524622  0.950604   0.95916736 0.9645953  0.96048313
 0.95

Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


(121, 1024)
(1024,)
(121,)


In [9]:
# @title 🌴 Change the values in this section

# @markdown Select the source of the audio/video file to be transcribed
input_format = "local" #@param ["youtube", "gdrive", "local"]

# @markdown Enter the URL of the YouTube video or the path of the audio file to be transcribed
file =video_path #@param {type:"string"}

#@markdown Click here if you'd like to save the transcription as text file
plain = True #@param {type:"boolean"}

# @markdown Click here if you'd like to save the transcription as an SRT file
srt = True #@param {type:"boolean"}

#@markdown Click here if you'd like to save the transcription as a VTT file
vtt = True #@param {type:"boolean"}

#@markdown Click here if you'd like to save the transcription as a TSV file
tsv = True #@param {type:"boolean"}

#@markdown Click here if you'd like to download the transcribed file(s) locally
download = True #@param {type:"boolean"}

In [10]:
import os, re
import torch
from pathlib import Path

import whisper
from whisper.utils import get_writer

In [11]:
# Use CUDA, if available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the desired model
model = whisper.load_model("medium.en").to(DEVICE)  

100%|█████████████████████████████████████| 1.42G/1.42G [03:30<00:00, 7.26MiB/s]


In [12]:
def to_snake_case(name):
    return name.lower().replace(" ", "_").replace(":", "_").replace("__", "_")

def download_youtube_audio(url,  file_name = None, out_dir = "."):
    "Download the audio from a YouTube video"
    yt = YouTube(url)
    if file_name is None:
        file_name = Path(out_dir, to_snake_case(yt.title)).with_suffix(".mp4")
    yt = (yt.streams
            .filter(only_audio = True, file_extension = "mp4")
            .order_by("abr")
            .desc())
    return yt.first().download(filename = file_name)

In [13]:
def transcribe_file(model, file, plain, srt, vtt, tsv, download):
    """
    Runs Whisper on an audio file

    Parameters
    ----------
    model: Whisper
        The Whisper model instance.

    file: str
        The file path of the file to be transcribed.

    plain: bool
        Whether to save the transcription as a text file or not.

    srt: bool
        Whether to save the transcription as an SRT file or not.

    vtt: bool
        Whether to save the transcription as a VTT file or not.

    tsv: bool
        Whether to save the transcription as a TSV file or not.

    download: bool
        Whether to download the transcribed file(s) or not.

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    """
    file_path = Path(file)
    print(f"Transcribing file: {file_path}\n")

    output_directory = file_path.parent

    # Run Whisper
    result = model.transcribe(file, verbose = False, language = "en")

    if plain:
        txt_path = file_path.with_suffix(".txt")
        print(f"\nCreating text file")

        with open(txt_path, "w", encoding="utf-8") as txt:
            txt.write(result["text"])
    if srt:
        print(f"\nCreating SRT file")
        srt_writer = get_writer("srt", output_directory)
        srt_writer(result, str(file_path.stem))

    if vtt:
        print(f"\nCreating VTT file")
        vtt_writer = get_writer("vtt", output_directory)
        vtt_writer(result, str(file_path.stem))

    if tsv:
        print(f"\nCreating TSV file")

        tsv_writer = get_writer("tsv", output_directory)
        tsv_writer(result, str(file_path.stem))

    if download:
        from google.colab import files

        colab_files = Path("/content")
        stem = file_path.stem

        for colab_file in colab_files.glob(f"{stem}*"):
            if colab_file.suffix in [".txt", ".srt", ".vtt", ".tsv"]:
                print(f"Downloading {colab_file}")
                files.download(str(colab_file))

    return result

In [14]:
if input_format == "youtube":
    # Download the audio stream of the YouTube video
    audio = download_youtube_audio(file)
    print(f"Downloading audio stream: {audio}")

    # Run Whisper on the audio stream
    result = transcribe_file(model, audio, plain, srt, vtt, tsv, download)
elif input_format == "gdrive":
    # Authorize a connection between Google Drive and Google Colab
    from google.colab import drive
    drive.mount('/content/drive')

    # Run Whisper on the specified file
    result = transcribe_file(model, file, plain, srt, vtt, tsv, download)
elif input_format == "local":
    # Run Whisper on the specified file
    result = transcribe_file(model, file, plain, srt, vtt, tsv, download)

Transcribing file: video\IronMan.mp4



FileNotFoundError: [WinError 2] The system cannot find the file specified