# Podcast Episode Erstellen

### imports

In [None]:
import sys

sys.path.append(".")
from audio_downloader.episodes_downloader import (
    get_metadata_all_episodes,
    download_and_save_mp3_in_dir
)
import os
import pandas as pd
from pydub import AudioSegment
from pydub.playback import play
from playsound import playsound
import IPython
import io
import json

from db_connect import db_get_df, db_save_df
from segment_ranking.rank_segments import (
    get_most_similar_segments,
)

# from Audio_segmentation.split_audio import produce_snippets
# from Audio_segmentation.concat_audio import produce_audio
from tqdm import tqdm

from dotenv import load_dotenv

# Load variables from a .env file into the process environment before reading them.
load_dotenv()
# os.getenv returns None when a variable is missing; later cells pass these values
# straight to os.listdir / os.path.join, so both must be set for the notebook to run.
AUDIO_SOURCE_PATH = os.getenv("AUDIO_SOURCE_PATH")
DATA_PATH = os.getenv("DATA_PATH")

In [None]:
AUDIO_SOURCE_PATH

### Episoden URLs laden

Zunächst wollen wir alle Episoden auflisten und die Download-URLs erhalten.

Jetzt werden wir für jede Episode noch weitere Metadaten wie die Beschreibung (description), das Veröffentlichungsdatum (publish date) und Keywords abspeichern.

In [None]:
from audio_downloader.episodes_downloader import (
    get_metadata_all_episodes,
    download_and_save_mp3_in_dir
)
from db_connect import db_get_df, db_save_df

In [None]:
# Fetch the metadata of all episodes, then print a short title preview and the row count.
df = get_metadata_all_episodes()
sample_titles = df["title"].head(5)
episode_count = len(df)
print("Beispieltitel", sample_titles)
print("Anzahl Episoden:", episode_count)

In [None]:
# The file name is whatever follows the last "/" of each download URL.
df["filename"] = [url.rsplit("/", 1)[-1] for url in df["download_url"].tolist()]

In [None]:
db_save_df(df, "episodes_metadata")

### Alle MP3-Dateien herunterladen

Um die Audios zu bearbeiten, müssen wir sie herunterladen. Da die über 2000 Episoden als MP3 ca. 50 GB beanspruchen, werden sie hier auf einer externen SSD gespeichert.

In [None]:
import os
from tqdm import tqdm
from db_connect import db_get_df, db_save_df
from dotenv import load_dotenv

load_dotenv()
AUDIO_SOURCE_PATH = os.getenv("AUDIO_SOURCE_PATH")
# os.listdir(None) silently lists the current working directory, so an unset variable
# would make the download loop below compare against (and fill) the wrong folder.
if not AUDIO_SOURCE_PATH:
    raise RuntimeError(
        "AUDIO_SOURCE_PATH is not set; define it in the environment or in the .env file"
    )
path = AUDIO_SOURCE_PATH
# Number of entries currently present in the audio directory.
len(os.listdir(path))

In [None]:
df = db_get_df("episodes_metadata")

# List the target directory once instead of once per episode. The set is updated after
# every download so a file name that occurs twice is still fetched only once, as before
# (assumes the helper stores the file under `title` inside `path`).
existing_files = set(os.listdir(path))

# Iterating the column (which has a length) lets tqdm show the total; iterrows() does not.
for download_url in tqdm(df["download_url"]):
    title = download_url.split("/")[-1]
    if title in existing_files:
        continue
    download_and_save_mp3_in_dir(download_url, path, title)
    existing_files.add(title)

In [None]:
# Ad-hoc inspection of one row's download URL; the index label 179 is hard-coded.
df.loc[179, "download_url"]

### Transkription auf Word level Ebene

Dann müssen die einzelnen Audios transkribiert werden. 
Whisper bietet sich als Transkriptionstool an. Allerdings ist der Prozess, alle Audiofiles zu transkribieren, sehr aufwändig und sollte auf guter Hardware mit GPU- bzw. CUDA-Unterstützung erfolgen.

Beispieltranskription für ein file: (kann ca. 20 min dauern)

In [None]:
import os
from audio_transcription.faster_whisper_word_level import transcribe
from dotenv import load_dotenv

load_dotenv()
AUDIO_SOURCE_PATH = os.getenv("AUDIO_SOURCE_PATH")

In [None]:
# os.listdir returns entries in arbitrary order and may contain non-audio files, so sort
# and filter first; the example then transcribes the same MP3 on every run.
SAMPLE_INDEX = 4  # position (in sorted order) of the MP3 used as the example
mp3_files = sorted(name for name in os.listdir(AUDIO_SOURCE_PATH) if name.endswith(".mp3"))
filepath = os.path.join(AUDIO_SOURCE_PATH, mp3_files[SAMPLE_INDEX])
df = transcribe(filepath)

In [None]:
print(df)

Alle MP3s sind auf Wortebene transkribiert und in der Tabelle transcript_word_level_2237 abgespeichert.

### Satzbildung mit Spacy

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm
from db_connect import db_get_df, db_save_df

In [None]:
def get_sentences_spacy(df, nlp):
    """Group word-level transcript rows into sentences with start/end times.

    For every distinct ``filename`` the words are joined into one text, ``nlp``
    splits that text into sentences, and each sentence is mapped back to the
    word rows it covers by counting whitespace-separated tokens.

    Args:
        df: DataFrame with the columns ``filename``, ``word``, ``start`` and
            ``end``, one row per word (assumed to be in spoken order per file).
        nlp: Callable that takes a string and returns an object whose ``sents``
            attribute iterates over sentences exposing ``.text`` (for example
            a loaded spaCy pipeline).

    Returns:
        DataFrame with the columns ``filename``, ``sentence``, ``start`` and
        ``end``, one row per sentence. ``start`` is the ``start`` of the
        sentence's first word and ``end`` the ``end`` of its last word. The
        columns are present even when no sentence was produced.
    """
    columns = ["filename", "sentence", "start", "end"]
    sentence_records = []

    # groupby(sort=False) hands over each file's rows once, in order of first appearance,
    # instead of re-filtering the whole frame for every filename.
    for filename, word_entries in tqdm(df.groupby("filename", sort=False)):
        words = [word.strip() for word in word_entries["word"]]
        starts = word_entries["start"].tolist()
        ends = word_entries["end"].tolist()

        # token_rows[k] is the row position that produced the k-th whitespace token of the
        # text. Rows that are empty after stripping contribute no token and rows with inner
        # whitespace contribute several, so token counts stay aligned with the word rows.
        token_rows = [row_pos for row_pos, word in enumerate(words) for _ in word.split()]
        text = " ".join(token for word in words for token in word.split())

        # Sentence segmentation of the whole episode text.
        doc = nlp(text)

        current_token = 0  # index into token_rows of the next unassigned token
        for sent in doc.sents:
            sentence_length = len(sent.text.split())
            if sentence_length == 0:
                # A whitespace-only sentence covers no word and has no timing.
                continue
            if current_token + sentence_length > len(token_rows):
                # The segmenter reported more tokens than words exist; stop for this file.
                break

            first_row = token_rows[current_token]
            last_row = token_rows[current_token + sentence_length - 1]
            sentence_records.append(
                {
                    "filename": filename,
                    "sentence": sent.text,
                    "start": starts[first_row],
                    "end": ends[last_row],
                }
            )
            current_token += sentence_length

    return pd.DataFrame(sentence_records, columns=columns)

In [None]:
# Load the word-level transcript rows that get_sentences_spacy takes as input.
# NOTE(review): the markdown above names the table "transcript_word_level_2237" — confirm
# which table name is the current one.
df = db_get_df("transcript_word_level")
nlp = spacy.load("de_dep_news_trf") # model is 0.04% better in sentence than de_core_news_md

In [None]:
df_sentences = get_sentences_spacy(df, nlp)

In [None]:
# Number the rows within each filename, starting at 0.
# NOTE(review): df is the word-level table here, so these ids count words, and df is not
# saved afterwards — confirm whether df_sentences was the intended target.
df['segment_id'] = df.groupby('filename').cumcount()

In [None]:
# Preview the frame (including the new segment_id column); the previous lookup of a column
# named "" looks like a leftover that would raise a KeyError and stop a full notebook run.
df.head()

In [None]:
len(df_sentences)

In [None]:
db_save_df(df_sentences, "transcript_sentences_spacy")

### Lemmatisierung mit spacy

Um die Keywordsuche mit TF-IDF zu verbessern, kann man die einzelnen Wörter vor dem Suchen lemmatisieren.
Das heißt, man mappt mehrere verwandte Wörter auf ein einziges Wort.
Dabei werden in der deutschen Sprache folgende Grundformen gebildet:
- Alle Nomen werden zu Nominativ Singular
- Alle Verben werden zu Infinitiv Präsens Aktiv

Bsp. Bäume -> Baum; war -> sein; schneller -> schnell 

In [None]:
import spacy
from embedding_creation.embedding_creator_TF_IDF import lemmatize_german_sentence

In [None]:
df = db_get_df("transcript_sentences_spacy")
df.dtypes

In [None]:
# Lemmatize every sentence with the medium German pipeline; the result list keeps the
# order of df["sentence"].
nlp = spacy.load("de_core_news_md")
all_sentences = [lemmatize_german_sentence(text, nlp) for text in tqdm(df["sentence"])]

In [None]:
# Each lemmatized sentence is an iterable of strings; store it as one space-separated string.
all_sentences_full = list(map(" ".join, all_sentences))
df["sentence_lemmatized"] = all_sentences_full

In [None]:
db_save_df(df, "sentences_lemmatized")

### Kompositatrennung mit german_compound_splitter

In [None]:
from german_compound_splitter import comp_split
from embedding_creation.embedding_creator_TF_IDF import is_number, compound_split_sentence
from tqdm import tqdm
import pandas as pd
from db_connect import db_get_df, db_save_df
import os
from dotenv import load_dotenv

load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")

In [None]:
def compound_split_df(df, ahocs):
    """Return a copy of ``df`` whose ``sentence`` column has been compound-split.

    Args:
        df: DataFrame with a ``sentence`` column. It is not modified.
        ahocs: Object passed through unchanged as the second argument of
            ``compound_split_sentence``.

    Returns:
        A new DataFrame equal to ``df`` except that every ``sentence`` value is
        replaced by ``compound_split_sentence(sentence, ahocs)``.
    """
    df_split = df.copy()
    df_split["sentence"] = [
        compound_split_sentence(sentence, ahocs) for sentence in tqdm(df["sentence"])
    ]
    # Return the modified copy; returning the input frame would discard the split sentences.
    return df_split

In [None]:
df = db_get_df("transcript_sentences")

In [None]:
# NOTE(review): in notebook order df was just loaded from this same table and has not been
# changed since, so this writes the table back as-is — confirm this cell is still needed.
db_save_df(df, "transcript_sentences")

In [None]:
# Read the splitter dictionary from german.dic under DATA_PATH; the result is what the
# next cell passes to compound_split_df as `ahocs`.
input_file = os.path.join(DATA_PATH, "german.dic")
ahocs = comp_split.read_dictionary_from_file(input_file)

In [None]:
df_compound_split = compound_split_df(df, ahocs)

In [None]:
df["sentence"][4000:4005]

In [None]:
df_compound_split["sentence"][4000:4005]

In [None]:
db_save_df(df_compound_split, "sentences_compound_split")

### Demonstration: Suche mit Keywörtern

In [None]:
df_lemmatized = db_get_df("sentences_lemmatized")
df_unlemmatized = db_get_df("transcript_sentences")

In [None]:
def get_occurences(df, word, column="sentence"):
    """Return every sentence that contains ``word`` as a whitespace-separated token.

    Matching is case-insensitive: the sentence and ``word`` are both lower-cased
    before comparing, so a capitalised search term such as a German noun can match.
    Punctuation is not stripped; a token like ``"baum."`` does not match ``"baum"``.

    Args:
        df: DataFrame holding the sentences to search.
        word: Token to look for.
        column: Name of the column to search; defaults to ``"sentence"``.

    Returns:
        List of the matching sentences (original casing), in row order.
    """
    needle = word.lower()
    return [
        sentence
        for sentence in tqdm(df[column])
        if needle in sentence.lower().split()
    ]

In [None]:
word = "?"
# get_occurences searches the "sentence" column, which in df_lemmatized still holds the
# original text (the lemmas were stored in "sentence_lemmatized"). Swap the lemmatized text
# in so this search really runs on lemmas and the two counts below are comparable.
df_lemmatized_search = df_lemmatized.assign(sentence=df_lemmatized["sentence_lemmatized"])
occurences_lemmatized = get_occurences(df_lemmatized_search, word)
occurences_unlemmatized = get_occurences(df_unlemmatized, word)
print(f"Occurences Lemmatized({len(occurences_lemmatized)}):",occurences_lemmatized)
print(f"Occurences UnLemmatized({len(occurences_unlemmatized)}):")

### Demonstration

In [None]:
df = db_get_df(table="transcript_sentences_spacy")
df.dtypes

In [None]:
# Select the sentences of one episode and hand them, as a list, to save_list_to_file together
# with a target path under DATA_PATH.
# NOTE(review): save_list_to_file is neither defined nor imported anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel — add the missing import.
df_bowie = df[df["filename"] == "david-bowie-das-chamaeleon-des-pop.mp3"]
save_list_to_file(df_bowie["sentence"].to_list(), os.path.join(DATA_PATH, "david_bowie_spacy.txt"))

In [None]:
# Query the ranking for the sentences that best fit the search text, print them as a
# markdown table, then call the snippet and audio production steps.
# NOTE(review): get_most_similar_documents_tf_idf is not imported in the visible notebook
# (the import cell only brings in get_most_similar_segments), and the imports of
# produce_snippets / produce_audio are commented out there — this cell fails on a fresh
# kernel until those names are imported.
userInputText = "Zugspitze wandern"
userInput_segment_count = 7
best_fitting = get_most_similar_documents_tf_idf(userInputText, userInput_segment_count)
print(best_fitting["sentence"].to_markdown())

produce_snippets()
produce_audio()


In [None]:
# Embed an audio player for the concatenated result.
# NOTE(review): the absolute, user-specific path only exists on the author's machine —
# consider deriving it from a configured directory.
IPython.display.Audio("/Users/br/Projects/Bachelorarbeit/scripts/server/audio/concatenated_audio.mp3")

### Umwandeln aller MP3s in WAVs

In [None]:
from pydub import AudioSegment
import os
from tqdm import tqdm

def convert_mp3_to_wav(source_dir, target_dir):
    """Convert every ``.mp3`` file in ``source_dir`` to a ``.wav`` file in ``target_dir``.

    ``target_dir`` is created if it does not exist. A source file is skipped when
    a ``.wav`` with the same stem is already present in ``target_dir``, so the
    function can be re-run to resume an interrupted conversion.

    Args:
        source_dir: Directory containing the MP3 files.
        target_dir: Directory that receives the WAV files.

    Returns:
        None. One line is printed per converted file.
    """
    os.makedirs(target_dir, exist_ok=True)

    # Snapshot the target directory once; re-listing it for every source file is quadratic.
    existing_wavs = set(os.listdir(target_dir))

    for filename in tqdm(os.listdir(source_dir)):
        if not filename.endswith('.mp3'):
            continue

        # Swap only the trailing extension; str.replace would also rewrite a ".mp3" that
        # occurs inside the stem (e.g. "talk.mp3.live.mp3").
        wav_name = filename[:-len('.mp3')] + '.wav'
        if wav_name in existing_wavs:
            continue

        mp3_path = os.path.join(source_dir, filename)
        wav_path = os.path.join(target_dir, wav_name)

        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
        existing_wavs.add(wav_name)
        print(f"Converted {filename} to WAV and saved to {wav_path}")

# Example usage
source_directory = '/Volumes/Samsung_T5/Podcast_Episoden'
target_directory = '/Volumes/Samsung_T5/Podcast_Episoden_Wav'
convert_mp3_to_wav(source_directory, target_directory)
