# Podcast-Episode erstellen

### imports

In [None]:
import sys
sys.path.append(".")
from audio_transcription.faster_whisper_word_level import transcribe
from Episodes_Downloader.episodes_downloader import get_names_and_urls_all_episodes, download_and_save_mp3_in_dir
import os
import pandas as pd
from pydub import AudioSegment
from pydub.playback import play
from playsound import playsound
import IPython 
import io
from db_connect import db_get_df,db_save_df
from segment_ranking.rank_segments import get_most_similar_documents_Llama2, get_most_similar_documents_tf_idf
# from Audio_segmentation.split_audio import produce_snippets
# from Audio_segmentation.concat_audio import produce_audio
from tqdm import tqdm

### Episoden URLs laden

In [None]:
titles, audio_urls = get_names_and_urls_all_episodes()
print(titles)

In [None]:
print(len(titles))

In [None]:
df = pd.DataFrame({"title": titles, "download_url": audio_urls, "transcript": None})
df.dtypes

In [None]:
db_save_df(df, "transcripts_all")

### Metadaten herunterladen

In [None]:
import sys
sys.path.append(".")
from Episodes_Downloader.episodes_downloader import get_graphql

In [None]:
df = db_get_df("transcripts_all")

In [None]:
df["filename"] = [url.split("/")[-1] for url in df["download_url"]]

In [None]:
df.dtypes

In [None]:
query = """
    {
        programSet(id: 5945518) {
        title
        items(
            orderBy: PUBLISH_DATE_DESC
            filter: {
            isPublished: {
                equalTo: true
            }
            }
        ) {
            nodes {
              keywords
              publishDate
              description
              audios {
                downloadUrl
              }
            }
        }
        }
    }
"""

In [None]:
response = get_graphql(query)

In [None]:
df["description"] = [None] * len(df)
df["publish_date"] = [None] * len(df)
df["keywords"] = [None] * len(df)

In [None]:
import json

In [None]:
# Copy description / publish date / keywords from the GraphQL response into df,
# matching each response node to a df row by the file name at the end of its
# first audio download URL.
#
# NOTE(review): an earlier cell initialises a "keywords" column, but this loop
# writes to "keywords_json", so "keywords" stays empty. The column name is left
# unchanged here because saved tables may already rely on it - confirm intent.

# Build the filename -> index lookup once instead of scanning the whole frame
# for every node. setdefault keeps the FIRST row for duplicate file names,
# which is the row the previous `matching_row.index[0]` selected.
index_by_filename = {}
for row_index, row_filename in zip(df.index, df["filename"]):
    index_by_filename.setdefault(row_filename, row_index)

for audio_data in response["data"]["programSet"]["items"]["nodes"]:
    audios = audio_data["audios"]
    # Nodes without any audio entry (or without a download URL) cannot be
    # matched to a file; skip them instead of raising IndexError/AttributeError.
    if not audios or not audios[0]["downloadUrl"]:
        continue
    audio_filename = audios[0]["downloadUrl"].split("/")[-1]

    if audio_filename not in index_by_filename:
        continue
    matching_index = index_by_filename[audio_filename]
    df.at[matching_index, "description"] = audio_data["description"]
    df.at[matching_index, "publish_date"] = audio_data["publishDate"]
    # Keywords are stored as a JSON string so they fit into a single cell.
    df.at[matching_index, "keywords_json"] = json.dumps(audio_data["keywords"])

In [None]:
df.dtypes

In [None]:
print(df[df["description"].isna()]["filename"])

In [None]:
db_save_df(df, "episodes_metadata")

### Alle mp3s herunterladen

In [None]:
path = "/Volumes/Samsung_T5/Podcast_Episoden"
os.listdir(path)

In [None]:
# Download every episode listed in "transcripts_all" whose file is not yet
# present in `path`.
df = db_get_df("transcripts_all")

# List the target directory once; re-listing it for every row is quadratic
# and slow on an external volume.
already_downloaded = set(os.listdir(path))

for _, row in tqdm(df.iterrows(), total=len(df)):
    # The local file name is the last path component of the download URL.
    title = row["download_url"].split("/")[-1]
    if title in already_downloaded:
        continue
    download_and_save_mp3_in_dir(row["download_url"], path, title)
    # Presumably the helper stores the file under `title` (the skip check above
    # relies on that too); remember it so a duplicate URL later in the table
    # is still skipped, as it was when the directory was re-listed.
    already_downloaded.add(title)


In [None]:
df.loc[179, "download_url"]

### Transkription auf Word level Ebene

In [None]:
df = transcribe("/Users/br/Projects/Bachelorarbeit/data/Episode_audio_files/jonathan swift gullivers reisen 2.mp3")
# db_save_df(df,"transcript_word_level")

In [None]:
db_save_df(df, "transcript_gulliver_word_level")

### Debug db

In [None]:
import sqlite3
import pandas as pd

# Load the complete word-level transcript table from the backup database.
con = sqlite3.connect("/Volumes/Samsung_T5/Backup/transcripts.sqlite")
try:
    df = pd.read_sql_query("SELECT * FROM transcript_word_level_2237", con)
finally:
    # Close the connection even if the query fails (e.g. missing table).
    con.close()

In [None]:
len(df)

In [None]:
print(df.dtypes)
print(df["filename"].drop_duplicates(ignore_index=True))

Die Dateinamen ändern (Verzeichnispräfix entfernen)

In [None]:
# Remove the cluster directory prefix so only the bare file name remains.
# regex=False: the path is a literal string, and the default of `regex`
# differs between pandas versions.
df['filename'] = df['filename'].str.replace('/nfs/scratch/students/neumannvi84434/Podcast_Episoden/', '', regex=False)

In [None]:
from contextlib import closing

# Write the cleaned frame back, replacing the existing table.
# A sqlite3 connection used as a context manager only commits / rolls back;
# it does NOT close the connection, so it is wrapped in closing() as well.
with closing(sqlite3.connect("/Volumes/Samsung_T5/Backup/transcripts.sqlite")) as con:
    with con:
        df.to_sql("transcript_word_level_2237", con, index=False, if_exists='replace')

In [None]:
db_save_df(df, "transcript_word_level_2237")

##### Satz-ID (segment_id) hinzufügen

In [None]:
df =db_get_df("transcript_sentences")

In [None]:
df['segment_id'] = df.groupby('filename').cumcount()

In [None]:
df

In [None]:
db_save_df(df, "transcript_sentences")

### Lemmatisierung mit spacy

In [None]:
import spacy

In [None]:
def lemmatize_german_sentence(input_sentence, nlp):
    """Return the lemmas of all tokens that ``nlp`` yields for a sentence.

    Args:
        input_sentence: Text to lemmatize.
        nlp: Callable that turns the text into an iterable of tokens, each
            exposing a ``lemma_`` attribute (e.g. a loaded spaCy pipeline).

    Returns:
        A list with one ``lemma_`` value per token, in token order.
    """
    return [token.lemma_ for token in nlp(input_sentence)]

In [None]:
df = db_get_df("transcript_sentences")
df.dtypes

In [None]:
# Load the German spaCy model once and lemmatize every transcript sentence;
# the result is one list of lemmas per row of df["sentence"].
nlp = spacy.load("de_core_news_md")
all_sentences = [
    lemmatize_german_sentence(sentence, nlp)
    for sentence in tqdm(df["sentence"])
]

In [None]:
all_sentences[3000]

In [None]:
len(all_sentences)

In [None]:
all_sentences_full = [" ".join(sentence) for sentence in all_sentences]

In [None]:
len(all_sentences_full)

In [None]:
df["sentence_lemmatized"] = all_sentences_full

In [None]:
db_save_df(df, "sentences_lemmatized")

In [None]:
all_sentences_full[4000]

### Kompositatrennung mit pyphen

In [None]:
import pyphen

# Hyphenation dictionary for German.
deutsche_worte = pyphen.Pyphen(lang='de_DE')

# Path to the vocabulary text file.
dateipfad = '/Users/br/Projects/Bachelorarbeit/scripts/Embedding_creation/vocabulary.txt'

# Collects every distinct word fragment.
einzelwoerter_set = set()

# Read the file line by line, split each line on whitespace, hyphenate every
# word and store the pieces between the '-' separators.
with open(dateipfad, 'r', encoding='utf-8') as datei:
    for zeile in datei:
        for wort in zeile.strip().split():
            einzelwoerter_set.update(deutsche_worte.inserted(wort).split('-'))

# Print every collected fragment.
for wort in einzelwoerter_set:
    print(wort)


In [None]:
len(einzelwoerter_set)

In [None]:
from german_compound_splitter import comp_split
from tqdm import tqdm

# Collects the distinct parts of all vocabulary words after compound splitting.
einzelwoerter_set = set()
dateipfad = '/Users/br/Projects/Bachelorarbeit/scripts/Embedding_creation/vocabulary.txt'
input_file = '/Users/br/Projects/Bachelorarbeit/german.dic'
# Dictionary structure that comp_split.dissect() needs for its lookups.
ahocs = comp_split.read_dictionary_from_file(input_file)


def is_number(string):
    """Return True if ``string`` can be parsed by ``float()``, otherwise False."""
    try:
        float(string)
    except ValueError:
        return False
    return True


with open(dateipfad, 'r', encoding='utf-8') as datei:
    # One vocabulary entry per line.
    for zeile in tqdm(datei):
        wort = zeile.strip()
        # Skip blank lines and purely numeric entries.
        if not wort or is_number(wort):
            continue
        try:
            dissection = comp_split.dissect(wort, ahocs, make_singular=True)
        except Exception:
            # Report words the splitter cannot handle and move on. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during this long loop.
            print(wort)
            continue
        # Add the split parts. The previous `update(wort)` added the single
        # characters of the original word and ignored the dissection.
        einzelwoerter_set.update(dissection)

print(len(einzelwoerter_set))
print(einzelwoerter_set)

In [None]:
print(len(einzelwoerter_set))

In [None]:
df = db_get_df("sentences_lemmatized")

In [None]:
# Collect every sentence that contains `word` as a substring of its
# lower-cased text, then print the matches.
word = "seife"
occurences = [
    sentence
    for sentence in tqdm(df["sentence"])
    if word in sentence.lower()
]
print(occurences)

In [None]:
from tqdm import tqdm

In [None]:
df = db_get_df("transcript_sentences")

In [None]:
# Collect every sentence that contains `word` as a substring of its
# lower-cased text, then print the matches.
word = "chiemsee"
occurences = [
    sentence
    for sentence in tqdm(df["sentence"])
    if word in sentence.lower()
]
print(occurences)

### Satzbildung durch Whisper Punkte

Zunächst mit den von Whisper gesetzten Punkten.

In [None]:
df = db_get_df("transcript_word_level_2237")

In [None]:
df

In [None]:
# Group the word-level transcript into sentences: words are accumulated per
# file until a word containing "." is seen; each sentence records the start
# of its first word and the end of its last word.
sentence_dict = []
filenames = df["filename"].drop_duplicates(ignore_index=True)

# groupby(sort=False) yields the files in order of first appearance (the same
# order as `filenames`) without re-scanning the whole frame for every file.
for filename, word_entries in tqdm(df.groupby("filename", sort=False), total=len(filenames)):
    # Reset per file so leftover words of one file can never be glued onto
    # the first sentence of the next file.
    one_sentence = []
    start = -1
    end = -1
    for word, word_start, word_end in zip(
        word_entries["word"], word_entries["start"], word_entries["end"]
    ):
        if not one_sentence:  # first word of a new sentence
            start = word_start
        one_sentence.append(word.strip())
        end = word_end
        # NOTE: any "." counts as a sentence end, so abbreviations or
        # decimal numbers also terminate a sentence.
        if "." in word:
            sentence = " ".join(one_sentence)
            sentence_dict.append({"filename": filename, "sentence": sentence, "start": start, "end": end})
            one_sentence = []

    # Words after the last "." of a file are kept as a final sentence of that
    # same file instead of leaking into the next one.
    if one_sentence:
        sentence = " ".join(one_sentence)
        sentence_dict.append({"filename": filename, "sentence": sentence, "start": start, "end": end})
        one_sentence = []

df_sentences = pd.DataFrame(sentence_dict)

In [None]:
db_save_df(df_sentences, "transcript_sentences")

In [None]:
df_sentences

In [None]:
df

### Demonstration

In [None]:
df = db_get_df(table="transcript_sentences")
df.dtypes

In [None]:
# The notebook's top-level imports of these two helpers are commented out, so
# the calls below would raise NameError; import them here where they are used.
# NOTE(review): module paths are taken from those commented-out import lines -
# confirm they are still correct.
from Audio_segmentation.split_audio import produce_snippets
from Audio_segmentation.concat_audio import produce_audio

# Rank the transcript segments against a free-text query (TF-IDF variant) and
# show the sentences of the best matches.
userInputText = "Zugspitze wandern"
userInput_segment_count = 7
best_fitting = get_most_similar_documents_tf_idf(userInputText, userInput_segment_count)
print(best_fitting["sentence"].to_markdown())

# Both helpers are called without arguments, exactly as before.
produce_snippets()
produce_audio()



In [None]:
IPython.display.Audio("/Users/br/Projects/Bachelorarbeit/scripts/server/audio/concatenated_audio.mp3")

## Umwandeln aller MP3s in WAVs

In [None]:
from pydub import AudioSegment
import os
from tqdm import tqdm

def convert_mp3_to_wav(source_dir, target_dir):
    """Convert every ``.mp3`` file in ``source_dir`` to WAV in ``target_dir``.

    Files whose WAV counterpart already exists in ``target_dir`` are skipped,
    so the function can be re-run after an interruption.

    Args:
        source_dir: Directory that is scanned (non-recursively) for ``.mp3`` files.
        target_dir: Directory for the ``.wav`` files; created if missing.
    """
    os.makedirs(target_dir, exist_ok=True)

    # List the target directory once instead of once per source file.
    existing_wavs = set(os.listdir(target_dir))
    mp3_suffix = '.mp3'

    for filename in tqdm(os.listdir(source_dir)):
        if not filename.endswith(mp3_suffix):
            continue

        # Swap only the trailing extension; str.replace would also rewrite a
        # ".mp3" that occurs in the middle of the name.
        wav_name = filename[:-len(mp3_suffix)] + '.wav'
        if wav_name in existing_wavs:
            continue

        mp3_path = os.path.join(source_dir, filename)
        wav_path = os.path.join(target_dir, wav_name)

        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
        existing_wavs.add(wav_name)
        print(f"Converted {filename} to WAV and saved to {wav_path}")

# Example usage
source_directory = '/Volumes/Samsung_T5/Podcast_Episoden'
target_directory = '/Volumes/Samsung_T5/Podcast_Episoden_Wav'
convert_mp3_to_wav(source_directory, target_directory)


## Random