In [1]:
import sys
import warnings
import whisper
from pathlib import Path
import subprocess
import torch
import numpy as np
from IPython.display import display, Markdown


In [2]:
# Set up the device for running the model
device = torch.device('cuda:0')
print('Using device:', device, file=sys.stderr)

# Model selection
Model = 'medium'  # Choose from available models ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large']
whisper_model = whisper.load_model(Model)

# Verify if the chosen model is available
if Model not in whisper.available_models():
    raise ValueError(f"{Model} model is not available. Choose from: {', '.join(whisper.available_models())}")


Using device: cuda:0
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Path to the local video file
video_path = './input_videos/538_1700534265.mp4'

# Convert the video to audio using FFmpeg
audio_path = Path(video_path).with_suffix('.wav')
subprocess.run(["ffmpeg", "-i", str(video_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(audio_path)])

# Transcription settings
language = "Chinese" #"English"  # Language for transcription
verbose = 'Live transcription'  # Verbose mode
output_format = 'srt'  # Output format for the transcript
task = 'transcribe'  # Task type

# Transcription arguments
args = {
    'language': language,
    'verbose': verbose == 'Live transcription',
    'task': task,
    # Add other parameters as required
}

# Perform transcription
video_transcription = whisper.transcribe(
    whisper_model,
    str(audio_path),
    **args,
)

# # Save the transcription output
# whisper.utils.get_writer(
#     output_format=output_format,
#     output_dir=audio_path.parent
# )(
#     video_transcription,
#     str(audio_path.stem),
#     {
#         "max_line_width": 47, 
#         "max_line_count": 1,
#         "highlight_words": False
#     }
# )

# print("Transcription complete. Transcript saved in:", audio_path.parent)


ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

[00:00.000 --> 00:03.000] 大家早上好
[00:03.000 --> 00:07.000] 今天我给大家分享的内容是想给大家介绍一下
[00:07.000 --> 00:13.000] 什么样的紫砂壶去配什么样的茶
[00:13.000 --> 00:19.000] 也就是说如何选择紫砂壶与茶来进行搭配
[00:19.000 --> 00:23.000] 那么我们知道紫砂壶的壶型是很多的
[00:23.000 --> 00:27.000] 有这样的稍微身比较高的
[00:27.000 --> 00:31.000] 然后也有像这样的壶身比较小的
[00:31.000 --> 00:37.000] 还有像这样的壶身比较扁的
[00:37.000 --> 00:40.000] 它的泥料也有各种不同
[00:40.000 --> 00:46.000] 那么还有这样的壶型
[00:46.000 --> 00:49.000] 那么我们在选择紫砂壶的时候
[00:49.000 --> 00:51.000] 我们一般的情况下
[00:51.000 --> 00:53.000] 首先要看我们泡什么茶
[00:53.000 --> 00:58.000] 首先如果说我们泡的茶是单从
[00:58.000 --> 01:00.000] 那么我们一般的时候
[01:00.000 --> 01:03.000] 首先要选择的是这样一个非常小的壶型
[01:03.000 --> 01:04.000] 那因为什么呢
[01:04.000 --> 01:08.000] 因为我们单从对水温要求非常高
[01:08.000 --> 01:11.000] 而且它还要求快速的平隐
[01:11.000 --> 01:14.000] 尤其是前六泡要快速的平隐
[01:14.000 --> 01:15.000] 然后呢
[01:15.000 --> 01:18.000] 所以茶要分出来茶汤马上喝掉
[01:18.000 --> 01:21.000] 然后你才能保证再碰冲泡第二泡
[01:21.000 --> 01:23.000] 所以我们一般的情况下
[01:23.000 --> 01:26.000] 会选择这样一个小的紫砂壶
[01:26.000 --> 01:28.000] 然后我们在

In [4]:
# Perform transcription
# video_transcription = whisper.transcribe(
#     whisper_model,
#     str(audio_path),
#     **args,
# )

# Extract only the text from the transcription result
transcript_text = '\n'.join(segment['text'] for segment in video_transcription['segments'])

# Define the path for the text-only transcript file
transcript_text_path = audio_path.parent / f"{audio_path.stem}_transcript.txt"

# Save the text-only transcript
with open(transcript_text_path, 'w') as file:
    file.write(transcript_text)

print("Transcription complete. Text-only transcript saved in:", transcript_text_path)


Transcription complete. Text-only transcript saved in: input_videos/538_1700534265_transcript.txt
