In [None]:
import requests
import chardet
import os
import re
import whisper
import torch
from bs4 import BeautifulSoup
from pydub import AudioSegment
from urllib.parse import urlparse

In [None]:
# Apple Podcasts
def extract_audio_url_from_apple(url):
    """
    Extract the audio URL and parsed HTML from an Apple Podcasts page.

    Args:
        url: Address of the Apple Podcasts episode page.

    Returns:
        A ``(audio_url, soup)`` tuple: the last ``.mp3``/``.m4a`` HTTPS link
        found in the page source and the ``BeautifulSoup`` tree of the page.

    Raises:
        Exception: If the response status is not 200, or if no audio link
            is found in the page source.
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the webpage. Status code: {response.status_code}\nURL: {url}")

    # Sniff the encoding from the raw bytes; fall back to UTF-8 when chardet
    # cannot decide.
    response.encoding = chardet.detect(response.content)['encoding'] or 'utf-8'

    # Decode once and reuse the text for both parsing and the regex scan.
    page_text = response.text
    soup = BeautifulSoup(page_text, 'html.parser')

    # Look for audio links (MP3 or M4A) anywhere in the page source.
    # NOTE: the second "^" inside the character class is a literal caret.
    audio_urls = re.findall(r'https://[^\s^"]+(?:\.mp3|\.m4a)', page_text)
    if not audio_urls:
        raise Exception("No audio URLs found on the page.")

    # The last match is the one returned, together with the parsed HTML.
    return audio_urls[-1], soup

# Xiaoyuzhou FM (小宇宙)
def extract_audio_url_from_xiaoyuzhou(url):
    """
    Extract the audio URL and parsed HTML from a Xiaoyuzhou FM page.

    Args:
        url: Address of the Xiaoyuzhou FM episode page.

    Returns:
        A ``(audio_url, soup)`` tuple: the ``content`` of the page's
        ``og:audio`` meta tag and the ``BeautifulSoup`` tree of the page.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
        Exception: If the page has no usable ``og:audio`` meta tag.
    """
    # The request is sent with a desktop-browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The audio link is read from the og:audio meta tag.
    audio_meta = soup.find('meta', property='og:audio')
    if not (audio_meta and audio_meta.get('content')):
        raise Exception("No audio URL found on the Xiaoyuzhou page.")

    return audio_meta['content'], soup


# Platform dispatcher
def extract_audio_url_by_platform(url):
    """
    Pick the extractor matching the link's platform and run it.

    The platform is recognised by a substring test on ``url``; Apple is
    checked first, then Xiaoyuzhou. The chosen extractor's result is
    returned unchanged. An ``Exception`` is raised when neither marker
    appears in the link.
    """
    # Guard clauses: each recognised platform returns straight away.
    if "apple.com" in url:
        print("检测到 Apple Podcast 链接，开始提取...")
        return extract_audio_url_from_apple(url)

    if "xiaoyuzhoufm.com" in url:
        print("检测到小宇宙链接，开始提取...")
        return extract_audio_url_from_xiaoyuzhou(url)

    # Nothing matched: report the unsupported link to the caller.
    raise Exception("无法识别的链接平台，请提供 Apple Podcast 或小宇宙平台的链接。")

def get_podcast_title(soup):
    """
    Extract the title of the podcast episode from the HTML soup.

    Args:
        soup: Parsed page; must provide ``find('title')`` and ``str(soup)``.

    Returns:
        The stripped ``<title>`` text with the " - ... Apple Podcasts ..."
        suffix removed, or ``"unknown_podcast"`` when there is no title tag.
    """
    title_tag = soup.find('title')
    if not title_tag:
        return "unknown_podcast"

    title = title_tag.text.strip()
    # Remove the trailing " - ... Apple Podcasts ..." suffix.
    title = re.sub(r" - .*Apple Podcasts.*", "", title)

    # When the page declares an ISO charset, try to undo UTF-8 text that was
    # decoded as Latin-1. The round trip is only valid for such mojibake: a
    # title with characters outside Latin-1 cannot be encoded, and genuine
    # Latin-1 text is not valid UTF-8. In both cases keep the title as it is
    # instead of aborting the whole run.
    if 'charset=iso' in str(soup):
        try:
            title = title.encode('latin1').decode('utf-8')
        except UnicodeError:
            pass
    return title


def format_filename(title):
    """
    Turn a title into a string that is safe to use as a file name.

    Every character that is not a word character, hyphen, underscore, dot
    or space is replaced with an underscore.
    """
    unsafe_chars = re.compile(r'[^\w\-_\. ]')
    return unsafe_chars.sub('_', title)


def download_audio_file(audio_url, title, output_folder="downloads"):
    """
    Download the audio file from a given URL and save it under the given title.

    Args:
        audio_url: Direct link to the audio file.
        title: Title used (after sanitising) as the file name.
        output_folder: Directory to save into; created when missing.

    Returns:
        Path of the saved file: ``output_folder`` joined with the sanitised
        title plus the extension taken from the URL path.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Build the target path from the sanitised title; the extension comes
    # from the URL path only, so any query string is ignored.
    formatted_title = format_filename(title)
    file_extension = os.path.splitext(urlparse(audio_url).path)[1]
    output_path = os.path.join(output_folder, f"{formatted_title}{file_extension}")

    # Stream the body to disk in chunks. The timeout bounds the connect and
    # each read, so a stalled server cannot hang the download forever.
    with requests.get(audio_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(output_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    file.write(chunk)

    return output_path


def convert_audio_to_wav(input_file, title):
    """
    Convert an MP3 or M4A file to WAV, named after the given title.

    The WAV file is written next to ``input_file`` and its path is
    returned. Any other file extension raises an ``Exception``.
    """
    wav_name = format_filename(title) + ".wav"
    wav_path = os.path.join(os.path.dirname(input_file), wav_name)

    # The format is decided by the file extension, compared case-insensitively.
    lowered = input_file.lower()
    if not lowered.endswith((".mp3", ".m4a")):
        raise Exception("Unsupported audio format. Only MP3 and M4A are supported.")

    segment = (
        AudioSegment.from_mp3(input_file)
        if lowered.endswith(".mp3")
        else AudioSegment.from_file(input_file, "m4a")
    )

    segment.export(wav_path, format="wav")
    return wav_path




# Interactive part: ask the user for the episode link.
podcast_url = input("请输入 Apple Podcast/小宇宙 链接: ")

# Reset the result first so that a failed run leaves wav_file as None rather
# than undefined (or stale from an earlier run of this cell); the
# transcription step checks this value before using it.
wav_file = None

try:
    # 1. Extract the audio URL and the parsed HTML.
    audio_url, soup = extract_audio_url_by_platform(podcast_url)
    print(f"提取的音频 URL: {audio_url}")

    # 2. Extract the episode title.
    podcast_title = get_podcast_title(soup)
    print(f"提取的播客标题: {podcast_title}")

    # 3. Let the user override the file name; "n" or empty input keeps the
    #    extracted title.
    custom_title = input(f"请输入文件名（按 n 使用默认标题“{podcast_title}”）: ").strip()
    if custom_title.lower() == 'n' or not custom_title:
        custom_title = podcast_title
    print(f"最终使用的文件名: {custom_title}")

    # 4. Download the audio file.
    downloaded_file = download_audio_file(audio_url, custom_title)
    print(f"音频文件已下载: {downloaded_file}")

    # 5. Convert it to WAV.
    wav_file = convert_audio_to_wav(downloaded_file, custom_title)
    print(f"音频文件已转换为 WAV 格式: {wav_file}")


except Exception as e:
    # Top-level boundary of the interactive cell: report the error and stop.
    print(f"处理过程中发生错误: {e}")

In [None]:
import whisper
import gc

def load_whisper_model_v3(model_name="large-v3"):
    """
    Load a Whisper model by name via ``whisper.load_model``.

    Progress messages are printed before and after loading; the loaded
    model object is returned.
    """
    print(f"Loading Whisper model: {model_name}...")
    loaded = whisper.load_model(model_name)
    print(f"Model {model_name} loaded successfully.")
    return loaded


def transcribe_audio_v3(model, audio_path, task="transcribe", language=None, verbose=False):
    """
    Run ``model.transcribe`` on an audio file and return its result.

    ``task``, ``language`` and ``verbose`` are forwarded to the model as
    keyword arguments. Progress messages are printed around the call.
    """
    options = {"task": task, "language": language, "verbose": verbose}
    print(f"Transcribing audio: {audio_path}...")
    outcome = model.transcribe(audio_path, **options)
    print("Transcription completed.")
    return outcome


def save_transcription_as_srt_v3(transcription, audio_path):
    """
    Write the transcription's segments to an SRT file beside the audio.

    The output path is ``audio_path`` with its extension replaced by
    ``.srt``. Each entry of ``transcription['segments']`` becomes one
    subtitle block numbered ``segment['id'] + 1``. The SRT path is returned.
    """
    base_path, _ = os.path.splitext(audio_path)
    srt_file_path = base_path + ".srt"

    with open(srt_file_path, "w", encoding="utf-8") as srt_file:
        for entry in transcription['segments']:
            begin, finish, caption = entry['start'], entry['end'], entry['text']
            # One block: index line, time range line, text, then a blank line.
            srt_file.write(f"{entry['id'] + 1}\n")
            srt_file.write(f"{format_timestamp(begin)} --> {format_timestamp(finish)}\n")
            srt_file.write(f"{caption}\n\n")

    print(f"Transcription saved as SRT: {srt_file_path}")
    return srt_file_path


def format_timestamp(seconds):
    """
    Format a time offset in seconds as an SRT timestamp.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        A string of the form ``HH:MM:SS,mmm``.
    """
    # Round to whole milliseconds once, then split with integer arithmetic.
    # Truncating the float fraction instead loses a millisecond whenever the
    # value is stored slightly low (e.g. 1.001 would come out as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"



# Main program: integration of module two (transcription).
if __name__ == "__main__":
    # The WAV path is produced by the download/convert step. Look it up
    # defensively so that a failed or skipped earlier step leads to the
    # friendly message below instead of a NameError.
    wav_file_path = globals().get("wav_file")

    # Make sure the file exists.
    if not wav_file_path or not os.path.exists(wav_file_path):
        print(f"错误：音频文件 {wav_file_path} 不存在！请确保模块一已正确运行。")
    else:
        # Bind the name up front so the cleanup in ``finally`` is valid even
        # when loading the model itself fails.
        model = None
        try:
            # 1. Load the Whisper large-v3 model.
            model = load_whisper_model_v3("large-v3")

            # 2. Transcribe the audio.
            transcription_result = transcribe_audio_v3(model, wav_file_path)

            # 3. Save the transcription as an SRT file.
            srt_file_path = save_transcription_as_srt_v3(transcription_result, wav_file_path)

            print(f"转录完成！字幕文件保存在: {srt_file_path}")

        except Exception as e:
            print(f"处理过程中发生错误: {e}")

        finally:
            # Release GPU memory: drop the model reference, collect garbage,
            # and only then ask the CUDA allocator to release its unused
            # cached blocks.
            print("Releasing GPU memory...")
            del model
            gc.collect()
            torch.cuda.empty_cache()
            print("GPU memory released.")
