注意：需要在本地安装 FFmpeg。

In [None]:
import torch
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from moviepy.editor import VideoFileClip
import collections.abc
import ffmpeg
import librosa
import soundfile as sf

import os
import requests

In [2]:
def extract_flac_from_mp4(mp4_file, flac_file):
    """
    Extracts audio from an MP4 file and saves it as FLAC.

    Builds an ffmpeg-python pipeline (input -> audio stream -> FLAC output)
    and runs it. An ``ffmpeg.Error`` is reported with ``print`` rather than
    re-raised, so the function returns ``None`` in either case.

    Args:
        mp4_file: Path to the MP4 file.
        flac_file: Path to the output FLAC file.
    """
    try:
        # NOTE: per the ffmpeg-python docs these three steps only describe
        # the pipeline; the ffmpeg process is started by ffmpeg.run() below.
        input_stream = ffmpeg.input(mp4_file)
        print(f'Input {mp4_file} successfully')

        audio_stream = input_stream.audio
        print(f'Get audio from {mp4_file} successfully')

        output_stream = ffmpeg.output(audio_stream, flac_file, acodec='flac')
        print('Convert audio into flac successfully')

        # Capture stderr so that ffmpeg's own diagnostics are available on
        # the exception; without this, e.stderr is None and the message
        # only says "see stderr output for detail".
        ffmpeg.run(output_stream, capture_stderr=True)
        print(f"Successfully extracted audio to {flac_file}")
    except ffmpeg.Error as e:
        print(f"Error extracting audio: {e}")
        if e.stderr:
            # stderr is raw bytes from the subprocess; decode defensively.
            print(e.stderr.decode(errors="replace"))
    
def load_audio_from_flac(file_path):
    """Read a FLAC file via librosa, requesting a 16 kHz sample rate.

    16 kHz is the rate this notebook feeds to Whisper.

    Args:
    file_path (str): Path to the FLAC file.

    Returns:
    tuple: ``(audio_data, sample_rate)`` as produced by ``librosa.load``,
    or ``(None, None)`` when loading raised (the error is printed).
    """
    target_rate = 16000

    try:
        samples, rate = librosa.load(file_path, sr=target_rate)
    except Exception as err:
        # Best effort: report the problem and signal failure with Nones.
        print(f"Error loading audio: {err}")
        return None, None
    return samples, rate
    
# Define a function to process audio in chunks
def transcribe_long_audio(model, audio_array, sampling_rate, chunk_length=30):
    """Transcribe audio of arbitrary length by processing fixed-size chunks.

    The audio is cut into consecutive chunks of ``chunk_length`` seconds;
    each chunk is featurized with the module-level ``processor``, decoded
    with ``model.generate`` and the per-chunk texts are joined with spaces.

    Args:
        model: Object exposing ``generate(input_features=..., attention_mask=...)``.
        audio_array: 1-D sequence of audio samples (anything supporting
            ``len`` and slicing).
        sampling_rate: Samples per second of ``audio_array``.
        chunk_length: Chunk duration in seconds (default 30).

    Returns:
        str: The chunk transcriptions joined by single spaces; an empty
        string when ``audio_array`` is empty.

    Raises:
        ValueError: If ``chunk_length * sampling_rate`` is not positive.
    """
    chunk_samples = int(chunk_length * sampling_rate)
    if chunk_samples <= 0:
        raise ValueError("chunk_length * sampling_rate must be positive")

    total_samples = len(audio_array)
    transcriptions = []

    # Stepping by the chunk size visits exactly the chunks that contain
    # samples. Computing the count as int(total / chunk) + 1 instead adds an
    # empty trailing chunk whenever the length is an exact multiple of the
    # chunk size (and one empty chunk for empty audio).
    for start in range(0, total_samples, chunk_samples):
        # Slicing clamps the end index, so the last chunk may be shorter.
        chunk = audio_array[start:start + chunk_samples]

        # Use the model and processor to transcribe the audio:
        input_features = processor(
            chunk,
            sampling_rate=sampling_rate,
            return_tensors="pt",
            return_attention_mask=True
        )

        # Generate token ids
        predicted_ids = model.generate(
            input_features=input_features.input_features,
            attention_mask=input_features.attention_mask
        )

        # Decode token ids to text
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Collect this chunk's text(s) for the final join
        transcriptions.extend(transcription)

    return " ".join(transcriptions)

In [3]:
# Build the paths with os.path.join instead of a hand-written backslash
# literal: backslashes followed by letters in a normal string are treated as
# escape sequences, and a hard-coded separator only works on Windows.
video_file = os.path.join("Data", "President20Kennedy.mp4")
audio_file = os.path.join("Data", "President20Kennedy.flac")

# Extract the audio track only if the FLAC file is not already on disk.
if not os.path.exists(audio_file):
    extract_flac_from_mp4(video_file, audio_file)

In [None]:
# Load the extracted FLAC audio; both values are None if loading failed.
audio_data, sample_rate = load_audio_from_flac(audio_file)

# Report success only when samples were actually returned.
if audio_data is not None:
    print(f"Audio loaded successfully. Sample rate: {sample_rate} Hz")

In [5]:
# Pull the pretrained "openai/whisper-tiny.en" checkpoint in Hugging Face
# format: the generation model, and the processor used to featurize audio
# and decode token ids back to text.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")

In [None]:
# Single-pass transcription of the loaded audio.
# NOTE(review): the chunked helper in this notebook works on 30-second
# pieces, so this one-shot call presumably covers only the beginning of a
# longer recording - confirm against the Whisper processor documentation.

# 1) Turn the raw samples into model inputs (PyTorch tensors + mask).
input_features = processor(
    audio_data, sampling_rate=sample_rate, return_tensors="pt", return_attention_mask=True
)

# 2) Let the model produce token ids from those inputs.
predicted_ids = model.generate(
    input_features=input_features.input_features,
    attention_mask=input_features.attention_mask,
)

# 3) Convert the token ids to text, dropping special tokens, and show it.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

In [None]:
# Transcribe the full recording chunk by chunk; this rebinds `transcription`
# to a single string, which the summarization cell further down reads.
transcription = transcribe_long_audio(model, audio_data, sample_rate)

print(transcription)

Connect to local LLM

In [32]:
# Base URL of the local LLM HTTP server; the helpers below append the
# endpoint paths ("/health", "/completion") to it.
base_url = 'http://localhost:8080'

def get_server_health(timeout=10):
    """Query the server's ``/health`` endpoint and return its JSON body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely when the server is
            unreachable or stalled.

    Returns:
        The decoded JSON payload of the response (whatever the HTTP status).

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # base_url is only read here, so no `global` declaration is required.
    response = requests.get(f'{base_url}/health', timeout=timeout)
    return response.json()

def post_completion(context, user_input, n_predict=-1):
    """Send a completion request to the local server and return its text.

    The prompt is laid out as ``{context}\\nUser: {user_input}\\nAssistant:``
    and posted as JSON to ``{base_url}/completion``.

    Args:
        context: Text placed before the ``User:`` turn.
        user_input: Text placed in the ``User:`` turn.
        n_predict: Maximum number of tokens to generate. Must be an integer;
            -1 is assumed to mean "no limit" (llama.cpp-style server) -
            TODO confirm against the server in use.

    Returns:
        str: The stripped ``content`` field of the response on HTTP 200,
        otherwise a fixed error message.
    """
    prompt = f"{context}\nUser: {user_input}\nAssistant:"
    payload = {
        'prompt': prompt,
        'temperature': 0.5,
        'top_k': 35,
        'top_p': 0.95,
        # An integer token budget; an empty list is not a valid value here.
        'n_predict': n_predict,
        'stop': ["</s>", "Assistant:", "User:"],
    }
    # `json=` serializes the payload and sets the JSON Content-Type header,
    # so no explicit headers are needed. base_url is only read (no `global`).
    response = requests.post(f'{base_url}/completion', json=payload)
    if response.status_code == 200:
        return response.json()['content'].strip()
    return "Error processing your request. Please try again."
    
def update_context(context, user_input, assistant_response):
    """Return ``context`` extended with one User/Assistant exchange.

    The result is the old context followed by a ``User:`` line and an
    ``Assistant:`` line, each separated by a newline.
    """
    turns = (
        f"{context}",
        f"User: {user_input}",
        f"Assistant: {assistant_response}",
    )
    return "\n".join(turns)

In [None]:
# Check that the local server answers before sending any work to it.
health = get_server_health()
print('Server Health:', health)

# The transcript is the material to summarize; the instruction is kept in
# `user_input` (adjacent literals below concatenate into one string).
text = transcription
user_input = (
    'Please read the following article and provide a concise summary of the '
    'main points and key details of what the speaker talked. '
    'The outputs should include the main point and at least three detailed of '
    'the main points. '
    'The summary should be no longer than 200 words.\n'
)

print('User:', user_input)
# The instruction fills post_completion's `context` slot and the transcript
# fills its `user_input` slot, so the prompt reads: instruction, "User:"
# transcript, "Assistant:".
assistant_response = post_completion(user_input, text)
print('Assistant:', assistant_response)

# if health.get('status') == 'ok':
#     while True:
#         user_input = input("Enter a prompt or type 'exit' to quit: ")
#         if user_input.lower() == 'exit':
#             break
        
#         print('User:', user_input)
#         assistant_response = post_completion(context, user_input)
#         print('Assistant:', assistant_response)

#         context = update_context(context, user_input, assistant_response)
# else:
#     print("Server is not ready for requests.")