# Podcast Episode Erstellen

### imports

In [1]:
import sys

sys.path.append(".")
from episodes_downloader.episodes_downloader import (
    get_metadata_all_episodes,
    download_and_save_mp3_in_dir
)
import os
import pandas as pd
from pydub import AudioSegment
from pydub.playback import play
from playsound import playsound
import IPython
import io
import json

from db_connect import db_get_df, db_save_df
from segment_ranking.rank_segments import (
    get_most_similar_segments,
)

# from Audio_segmentation.split_audio import produce_snippets
# from Audio_segmentation.concat_audio import produce_audio
from tqdm import tqdm

from dotenv import load_dotenv

load_dotenv()
AUDIO_SOURCE_PATH = os.getenv("AUDIO_SOURCE_PATH")
DATA_PATH = os.getenv("DATA_PATH")

playsound is relying on a python 2 subprocess. Please use `pip3 install PyObjC` if you want playsound to run more efficiently.
  from .autonotebook import tqdm as notebook_tqdm


### Episoden URLs laden

Zunächst wollen wir alle Epsioden auflisten und die Download URLs erhalten.

Jetzt werden wir für jeden Podcast noch andere Metadaten, wie die description, das publish date und keywörter abspeichern.

In [None]:
df = get_metadata_all_episodes()
print("Beispieltitel",df["title"].head(5))
print("Anzahl Episoden:",len(df))

In [None]:
df["filename"] = [url.split("/")[-1] for url in df["download_url"]]

In [None]:
db_save_df(df, "episodes_metadata")

### Alle mp3s herunterladen

Um die Audios zu bearbeiten müssen wir sie herunterladen. Da die über 2000 Episoden mp3 ca. 50 GB beanspruchen werden sie hier auf eine externe SSD gespeichert.

In [None]:
path = AUDIO_SOURCE_PATH
os.listdir(path)

In [None]:
df = db_get_df("episodes_metadata")

for i, row in tqdm(df.iterrows()):
    title = row["download_url"].split("/")[-1]
    if title in os.listdir(path):
        continue
    download_and_save_mp3_in_dir(row["download_url"], path, title)


In [None]:
df.loc[179, "download_url"]

### Transkription auf Word level Ebene

Dann müssen die einzelnen Audios transkribiert werden. 
Whisper bietet sich als Transkriptionstool an. Allerdings ist der Prozess alle Audiofiles zu transkribieren sehr aufwändig und sollte auf guter Hardware mit GPU Cuda unterstützung erfolgen.

Beispieltranskription für ein file: (kann ca. 20 min dauern)

In [None]:
from audio_transcription.faster_whisper_word_level import transcribe

In [None]:
filepath = os.path.join(AUDIO_SOURCE_PATH, os.listdir(AUDIO_SOURCE_PATH)[4])
df = transcribe(filepath)

In [None]:
print(df)

### Satzbildung durch Whisper Punkte

Wenn wir nun alle MP3 Datein transkribiert haben, ist für jedes einzelne vorkommende Wort ein Zeitstempel gespeichert.

Für das Projekt lohnt es sich allerdings größere Abschnitte zu erstellen, auf die später die Embeddings und die Suchen angewendet werden können, um mehr Kontext miteinzubeziehen.

In [None]:
df = db_get_df("transcript_word_level_2237")

In [None]:
one_sentence = []
sentence_dict = [] 
filenames = df["filename"].drop_duplicates(ignore_index=True)

for filename in tqdm(filenames):
    word_entries = df[df['filename'] == filename]
    start = -1
    end = -1
    for index, word_entry in word_entries.iterrows(): 
        word = word_entry["word"]
        
        if not one_sentence: ##start
            start = word_entry["start"]
        one_sentence.append(word.strip())
        if "." in word:
            end = word_entry["end"]
            sentence = " ".join(one_sentence)
            sentence_dict.append({"filename": filename, "sentence": sentence, "start": start, "end":end})
            one_sentence = []

df_sentences = pd.DataFrame(sentence_dict)

In [None]:
db_save_df(df_sentences, "transcript_sentences")

### Satzbildung mit Spacy

In [None]:
df = db_get_df("transcript_word_level_2237")

In [48]:
import pandas as pd
import spacy
from tqdm import tqdm

# Load the spaCy model
nlp = spacy.load("de_core_news_md")

# Assuming df is your DataFrame and already defined
filenames = df["filename"].drop_duplicates(ignore_index=True)
sentence_dict = []

for filename in tqdm(filenames):
    word_entries = df[df['filename'] == filename]
    text = " ".join(word_entries["word"].apply(lambda x: x.strip()).tolist())
    
    # Process the text with spaCy
    doc = nlp(text)
    
    sentences = list(doc.sents)
    current_word_index = 0  # To keep track of the word index in word_entries

    for sent in sentences:
        words_in_sent = sent.text.split()
        sentence_length = len(words_in_sent)
        
        if current_word_index + sentence_length > len(word_entries):
            break  
        
        start_time = word_entries.iloc[current_word_index]["start"]
        end_time = word_entries.iloc[current_word_index + sentence_length - 1]["end"]

        sentence_dict.append({"filename": filename, "sentence": sent.text, "start": start_time, "end": end_time})
        
        current_word_index += sentence_length  # Move to the index for the next sentence

df_sentences = pd.DataFrame(sentence_dict)


100%|██████████| 1945/1945 [28:44<00:00,  1.13it/s]


In [51]:
len(df_sentences)

377443

In [50]:
db_save_df(df_sentences, "transcript_sentences_spacy")

### Debug db

In [None]:
import sqlite3
import pandas as pd

con = sqlite3.connect("/Volumes/Samsung_T5/Backup/transcripts.sqlite")
df = pd.read_sql_query(f"SELECT * FROM transcript_word_level_2237", con)
con.close()

In [None]:
len(df)

In [None]:
print(df.dtypes)
print(df["filename"].drop_duplicates(ignore_index=True))

Die Namen der Filenames ändern

In [None]:
df['filename'] = df['filename'].str.replace('/nfs/scratch/students/neumannvi84434/Podcast_Episoden/', '')

In [None]:
with sqlite3.connect("/Volumes/Samsung_T5/Backup/transcripts.sqlite") as con:
    df.to_sql("transcript_word_level_2237", con, index=False, if_exists='replace')

In [None]:
db_save_df(df, "transcript_word_level_2237")

##### Setence ID hinzufügen

In [None]:
df =db_get_df("transcript_sentences")

In [None]:
df['segment_id'] = df.groupby('filename').cumcount()

In [None]:
db_save_df(df, "transcript_sentences")

### Lemmatisierung mit spacy

Um die Keywordsuche mit TF-IDF zu verbessern kann man die einzelnen Wörter vor dem Suchen Lemmatisieren.
Das heißt man mapped mehrere Verwandte Wörter auf ein einziges Wort.

Bsp. Bäume -> Baum; war -> sein; schneller -> schnell 

In [None]:
import spacy

In [None]:
def lemmatize_german_sentence(input_sentence, nlp):
    doc = nlp(input_sentence)
    lemmatized_words = []
    for token in doc:
        lemmatized_words.append(token.lemma_)
    return lemmatized_words

In [None]:
df = db_get_df("transcript_sentences")
df.dtypes

In [None]:
nlp = spacy.load("de_core_news_md")
all_sentences = []
for sentence in tqdm(df["sentence"]):
    all_sentences.append(lemmatize_german_sentence(sentence, nlp))

In [None]:
all_sentences_full = [" ".join(sentence) for sentence in all_sentences]
df["sentence_lemmatized"] = all_sentences_full

In [None]:
db_save_df(df, "sentences_lemmatized")

### Kompositatrennung mit pyphen

In [None]:
import pyphen

deutsche_worte = pyphen.Pyphen(lang='de_DE')
einzelwoerter_set = set()

dateipfad = os.path.join(DATA_PATH, "vocabulary.txt")

with open(dateipfad, 'r', encoding='utf-8') as datei:
    for zeile in datei:
        woerter = zeile.strip().split()
        
        for wort in woerter:
            getrenntes_wort = deutsche_worte.inserted(wort)
            
            einzelwoerter = getrenntes_wort.split('-')
            einzelwoerter_set.update(einzelwoerter)

for wort in einzelwoerter_set:
    print(wort)


In [8]:
print(len(einzelwoerter_set))
print(einzelwoerter_set[3000:3010])

24669
['biebst', 'bied', 'bief', 'bieg', 'biegt', 'bien', 'bier', 'bierl', 'biers', 'bierst']


In [4]:
def save_list_to_file(lst, file_path):
    with open(file_path, 'w') as file:
        for item in lst:
            file.write(str(item) + '\n')

In [6]:
einzelwoerter_set = sorted(einzelwoerter_set)
save_list_to_file(einzelwoerter_set, os.path.join(DATA_PATH, "vocabulary_pyphen.txt"))

### Kompositatrennung mit german_compound_splitter

In [None]:
from german_compound_splitter import comp_split
from tqdm import tqdm

einzelwoerter_set = set()
dateipfad = os.path.join(DATA_PATH, "vocabulary.txt")
input_file = os.path.join(DATA_PATH, "german.dic")
ahocs = comp_split.read_dictionary_from_file(input_file)

def is_number(string):
    try:
        float(string)
        return True
    except ValueError:
        return False
    

with open(dateipfad, 'r', encoding='utf-8') as datei:

    for zeile in tqdm(datei):
        wort = zeile.strip()
        if is_number(wort):
            continue
        try:
            dissection = comp_split.dissect(wort, ahocs, make_singular=True)
        except:
            dissection = ""
        for split_word in dissection:
            einzelwoerter_set.add(split_word)

print(len(einzelwoerter_set))
# print(einzelwoerter_set)

In [None]:
einzelwoerter_set = sorted(einzelwoerter_set)

In [None]:
save_list_to_file(einzelwoerter_set, os.path.join(DATA_PATH, "vocabulary_compound_split.txt"))

### Demonstration: Suche mit Keywörtern

In [9]:
df_lemmatized = db_get_df("sentences_lemmatized")
df_unlemmatized = db_get_df("transcript_sentences")

In [10]:
def get_occurences(df, word):
    occurences = []
    for sentence in tqdm(df["sentence"]):
        if word in sentence.lower().split():
            occurences.append(sentence)
    return occurences

In [15]:
word = "?"
occurences_lemmatized = get_occurences(df_lemmatized, word)
occurences_unlemmatized = get_occurences(df_unlemmatized, word)
print(f"Occurences Lemmatized({len(occurences_lemmatized)}):",occurences_lemmatized)
print(f"Occurences UnLemmatized({len(occurences_unlemmatized)}):")

  0%|          | 0/370224 [00:00<?, ?it/s]

100%|██████████| 370224/370224 [00:01<00:00, 284255.28it/s]
100%|██████████| 370224/370224 [00:01<00:00, 315795.92it/s]

Occurences Lemmatized(9): ['? Gar keine Wahl, weder super noch normal.', '? Und es ist, wo man fragt, der Benzin -Blusang gesagt.', '? Ach, das Leben ist so üd und ohne Sinn.', '? Ist das nicht gemein, wir haben kein Benzin? Die junge Generation erlebt zum ersten Mal, was ein gewisser Mangel bedeuten kann.', "7, 6, 7, 6, 7, 6, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 7, 6, 8, 9, 1, 2, 3, 4 5, 4, 5, 5, 4, 5 6, 7, 10, 11, 12 ? Major Tom's a drunk kid, strung out in heaven's mind, ? hitting that all -time low.", '? Sitting in the jungle, ? I remember the straw sun.', '? My mother said to get things done, ? to better my mess with Major Tom.', 'Pourquoi est -il devenu un Flaksbauer ? Il y a beaucoup de raisons.', "Et qu




### Demonstration

In [20]:
df = db_get_df(table="transcript_sentences_spacy")
df.dtypes

filename     object
sentence     object
start       float64
end         float64
dtype: object

In [21]:
df_bowie = df[df["filename"] == "david-bowie-das-chamaeleon-des-pop.mp3"]
save_list_to_file(df_bowie["sentence"].to_list(), os.path.join(DATA_PATH, "david_bowie_spacy.txt"))

In [None]:
userInputText = "Zugspitze wandern"
userInput_segment_count = 7
best_fitting = get_most_similar_documents_tf_idf(userInputText, userInput_segment_count)
print(best_fitting["sentence"].to_markdown())

produce_snippets()
produce_audio()


In [None]:
IPython.display.Audio("/Users/br/Projects/Bachelorarbeit/scripts/server/audio/concatenated_audio.mp3")

### Umwandelen aller MP3s in WAVs

In [None]:
from pydub import AudioSegment
import os
from tqdm import tqdm

def convert_mp3_to_wav(source_dir, target_dir):
    os.makedirs(target_dir, exist_ok=True)
    
    for filename in tqdm(os.listdir(source_dir)):
        if filename.endswith('.mp3') and filename.replace('.mp3', '.wav') not in os.listdir(target_dir) :
            mp3_path = os.path.join(source_dir, filename)
            wav_path = os.path.join(target_dir, filename.replace('.mp3', '.wav'))
            
            
            audio = AudioSegment.from_mp3(mp3_path)
            
            audio.export(wav_path, format="wav")
            print(f"Converted {filename} to WAV and saved to {wav_path}")

# Example usage
source_directory = '/Volumes/Samsung_T5/Podcast_Episoden'
target_directory = '/Volumes/Samsung_T5/Podcast_Episoden_Wav'
convert_mp3_to_wav(source_directory, target_directory)
