# Data Visualization

### imports

In [None]:
from audio_segmentation.concat_audio import produce_final_audio
from audio_segmentation.split_audio import produce_audio_snippets
from db_connect import db_get_df
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import mutagen.mp3
import os
from tqdm import tqdm

# Load variables from a .env file into the environment, then read the root
# folder of the audio files (None when AUDIO_SOURCE_PATH is not set).
load_dotenv()
audiofile_path = os.environ.get("AUDIO_SOURCE_PATH")

In [None]:
# Fetch the "transcripts_all" table, then print its length followed by its
# dtypes, one per line.
df = db_get_df(table="transcripts_all")
print(len(df), df.dtypes, sep="\n")

Wir verwenden hier das deutsche Modell, um die deutschen Transkripte optimal zu encodieren.

### Audio file length

In [None]:
def get_mp3_lengths(directory):
    """Collect the playback length of every ``.mp3`` file below *directory*.

    The tree is walked recursively with ``os.walk``. A file that mutagen
    cannot parse, or that cannot be read, is reported on stdout and skipped.

    Args:
        directory: Root folder to search.

    Returns:
        list: ``audio.info.length`` of each readable MP3 (seconds, per
        mutagen's documentation), in the order the files were visited.
    """
    mp3_lengths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".mp3"):
                continue
            file_path = os.path.join(root, file)
            try:
                audio = mutagen.mp3.MP3(file_path)
            except (mutagen.MutagenError, OSError):
                # Only "this file cannot be read/parsed" is swallowed here; a
                # bare ``except`` would also hide KeyboardInterrupt and bugs.
                print(f"file {file} corrupted")
            else:
                mp3_lengths.append(audio.info.length)
    return mp3_lengths

def plot_mp3_lengths(mp3_lengths):
    """Show a 100-bin histogram of MP3 durations.

    Args:
        mp3_lengths: Sequence of file durations; the x-axis is labelled
            in seconds.
    """
    plt.hist(mp3_lengths, bins=100, color='cornflowerblue', edgecolor='black')
    # Spelling of the German labels corrected ("Sekunden", "Dateien").
    plt.xlabel('MP3 Länge (Sekunden)')
    plt.ylabel('Anzahl MP3 Dateien')
    plt.title('Länge der MP3 Dateien')
    plt.show()

In [None]:
# Measure every MP3 below the configured audio folder and plot the
# distribution of their lengths.
mp3_directory = audiofile_path
mp3_lengths = get_mp3_lengths(mp3_directory)
plot_mp3_lengths(mp3_lengths)


### Transcript length

In [None]:
# Rebind `df` to the "transcript_word_level_2237" table.
# NOTE(review): presumably one row per transcribed word (later cells count
# rows per filename and label them as words) - confirm against the schema.
df = db_get_df(table="transcript_word_level_2237")

In [None]:
# Display how many entries `df` currently holds.
len(df)

In [None]:
# Keep only the rows whose filename matches one specific episode and
# display them as the cell result.
is_selected_episode = df['filename'] == 'david-bowie-das-chamaeleon-des-pop.mp3'
filtered_df = df[is_selected_episode]
filtered_df

In [None]:
def plot_transcript_lengths(transcript_lengths, xlim=(1500, 4000)):
    """Show a 100-bin histogram of transcript lengths.

    Args:
        transcript_lengths: Sequence of lengths, one per transcript; the
            x-axis is labelled in words.
        xlim: ``(left, right)`` limits of the x-axis. Defaults to the
            previously hard-coded window ``(1500, 4000)``; pass ``None`` to
            let matplotlib choose the limits.
    """
    ax = plt.gca()
    if xlim is not None:
        # Limits are applied before plotting, as in the original fixed window.
        ax.set_xlim(xlim)
    plt.hist(transcript_lengths, bins=100, color='cornflowerblue', edgecolor='black')
    plt.xlabel('Transkript Länge (Wörter)')
    plt.ylabel('Anzahl  Transkripte')
    # Title spelling corrected (was "Trankripte").
    plt.title('Länge der Transkripte')
    plt.show()

In [None]:
# Count the rows of each file in a single pass. The previous loop filtered
# the whole frame once per distinct filename, which scales with
# (rows x files).
# sort=False keeps the groups in order of first appearance, i.e. the same
# order as `filenames`.
# NOTE(review): rows with a missing filename are left out by groupby; the
# old loop appended a 0 for them.
filenames = df["filename"].drop_duplicates(ignore_index=True)
lengths = df.groupby("filename", sort=False).size().tolist()

In [None]:
# Histogram of the per-file row counts collected in `lengths`.
plot_transcript_lengths(lengths)

### Länge der Sätze

In [None]:
# Rebind `df` to the "transcript_sentences_spacy" table; the following cells
# read its "sentence" column.
df = db_get_df(table="transcript_sentences_spacy")

In [None]:
def plot_sentence_lengths(sentence_lengths):
    """Show a 100-bin histogram of the given sentence lengths.

    Args:
        sentence_lengths: Numeric lengths, one per sentence.
    """
    # NOTE(review): the x label says words, but this notebook derives the
    # values with ``len`` on the "sentence" column - that is a character
    # count if the column holds strings. Confirm the unit.
    axes = plt.gca()
    axes.hist(sentence_lengths, bins=100, color='cornflowerblue', edgecolor='black')
    axes.set_xlabel('Sätze Länge (Wörter)')
    axes.set_ylabel('Anzahl  Sätze')
    axes.set_title('Länge der Sätze')
    plt.show()

In [None]:
# Store len() of every "sentence" value in a new column, then order the rows
# from shortest to longest. The column key keeps its existing spelling
# because the following cells look it up under that name.
df["sentence_lenght"] = df["sentence"].map(len)
df = df.sort_values("sentence_lenght")

In [None]:
# Show the "sentence" value of the last row - the longest entry as long as
# `df` is still sorted ascending by "sentence_lenght".
df.iloc[-1]["sentence"]

In [None]:
# List the rows whose "sentence_lenght" exceeds 2000.
df[df["sentence_lenght"] > 2000]

Logarithmische Normalverteilung

In [None]:
# Plot only lengths below 500 so that the few very long entries do not
# stretch the histogram's x-axis.
plot_sentence_lengths(df[df["sentence_lenght"] < 500]["sentence_lenght"])

## Anzahl aller Episoden 

In [None]:
import requests 

GRAPHQL_URL = "https://api.ardaudiothek.de/graphql"


def get_graphql(query, timeout=30):
    """POST *query* to ``GRAPHQL_URL`` and return the decoded JSON body.

    Args:
        query: GraphQL query document as a string.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the notebook forever.

    Returns:
        The parsed JSON body of a response with status code 200.

    Raises:
        requests.HTTPError: If the status code is anything other than 200.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.post(GRAPHQL_URL, json={"query": query}, timeout=timeout)
    if response.status_code != 200:
        # Raising a bare string is itself a TypeError ("exceptions must derive
        # from BaseException"), which hid the real status code.
        raise requests.HTTPError(
            f"GraphQL request failed with status code {response.status_code}",
            response=response,
        )
    return response.json()

In [None]:
# GraphQL document requesting two totals: `totalCount` of programSets with
# numberOfElements >= 0 (aliased as `shows`) and `totalCount` of items with
# isPublished == true.
# NOTE(review): no cell in view sends this query; `query` is reassigned by a
# later cell before get_graphql() is called.
query = """
{
	shows: programSets(
    filter:{
      numberOfElements:{
        greaterThanOrEqualTo: 0
      }
    }
  )
  {totalCount}
  
items(filter:{isPublished:{equalTo:true}}){
    totalCount
  }
}
"""

## Alle Autoren (nicht Sprecher)

In [None]:
# GraphQL document requesting the `description` of every published item
# (isPublished == true) of the programSet with id 5945518.
query = """
    {
        programSet(id: 5945518) {
        items(
            filter: {
            isPublished: {
                equalTo: true
            }
            }
        ) {
            nodes {
              description
            }
        }
        }
    }
"""

In [None]:
# Send the query currently bound to `query` and keep the decoded JSON.
response = get_graphql(query)

In [None]:
# Preview the text after the last "Autorin: " / "Autor: " marker for the first
# ten descriptions. A plain loop is used because print() is a side effect;
# the former list comprehension also made the cell echo a list of None.
for node in response["data"]["programSet"]["items"]["nodes"][:10]:
    description = node["description"]
    print(description.split("Autorin: ")[-1].split("Autor: ")[-1])

In [None]:
from tqdm import tqdm
# Collect the distinct author names credited in the episode descriptions.
autors = set()

for node in response["data"]["programSet"]["items"]["nodes"]:
    # NOTE(review): guards against a null description in the API response -
    # confirm whether the field is nullable.
    description = node["description"] or ""
    if "Autor: " not in description and "Autorin: " not in description:
        # No author credit: skip it. The old code added "" here, which was
        # then counted as one more author.
        continue
    autor = description.split("Autorin: ")[-1].split("Autor: ")[-1]
    # Drop a trailing "(...)" remark and surrounding whitespace so that
    # "Name " and "Name" are not counted as two different authors.
    autor = autor.split("(")[0].strip()
    if autor:
        autors.add(autor)

print(len(autors))

## Einzelne Episodentranskripte analysieren

## Anzahl Nomen herausfinden

In [None]:
from german_compound_splitter import comp_split
from embedding_creation.embedding_creator_TF_IDF import is_number, compound_split_sentence
from tqdm import tqdm
import os
from dotenv import load_dotenv
import spacy

# Load variables from a .env file, then read the two configured folders
# (each is None when its variable is not set).
load_dotenv()
AUDIO_SOURCE_PATH = os.environ.get("AUDIO_SOURCE_PATH")
DATA_PATH = os.environ.get("DATA_PATH")

In [None]:
# Path of the vocabulary file below DATA_PATH; the cells below read it one
# entry per line.
dateipfad = os.path.join(DATA_PATH, "test_data", "vocabulary_compound_split.txt")
# German spaCy pipeline used below for part-of-speech tagging.
nlp = spacy.load("de_core_news_md")

In [None]:
def sort_words_by_pos(words, sorted_words):
    """Group the tokens of *words* by their spaCy part-of-speech tag.

    The words are joined with single spaces and run through the module-level
    ``nlp`` pipeline. Each resulting token's text is appended to the list
    stored under its ``pos_`` tag.

    Args:
        words: Sequence of strings to tag.
        sorted_words: Dict mapping POS tag to a list of token texts. It is
            extended in place, so results accumulate across calls.

    Returns:
        The same ``sorted_words`` dict that was passed in.
    """
    # NOTE(review): entries are spaCy tokens, not input words - the tokenizer
    # may split a word, so totals can differ from len(words).
    for token in nlp(" ".join(words)):
        sorted_words.setdefault(token.pos_, []).append(token.text)
    return sorted_words

In [None]:
# Read the vocabulary file: one entry per line, surrounding whitespace removed.
with open(dateipfad, 'r', encoding='utf-8') as datei:
    words = [zeile.strip() for zeile in tqdm(datei)]

In [None]:
# Tag the vocabulary in slices of `doc_size` words so that each spaCy call
# stays bounded; every slice adds to the same dict.
doc_size = 50_000
sorted_words = {}
chunk_starts = range(0, len(words), doc_size)
for start in tqdm(chunk_starts):
    chunk = words[start:start + doc_size]
    sorted_words = sort_words_by_pos(chunk, sorted_words)


In [None]:
# Print the number of tokens per POS tag, followed by the overall total.
count = 0
for pos, tokens in sorted_words.items():
    print(pos, len(tokens))
    count += len(tokens)
print(count)

In [None]:
# Share of tokens tagged NOUN or PROPN among all counted tokens.
(len(sorted_words["NOUN"]) + len(sorted_words["PROPN"])) / count

In [None]:
# Display the tokens that were tagged INTJ.
sorted_words["INTJ"]

In [None]:
# Print the ten longest tokens tagged NOUN. The list was previously bound to
# the name `verbs`, although it holds the NOUN entries.
nouns_by_length = sorted(sorted_words["NOUN"], key=len)
print(nouns_by_length[-10:])

## Längste Wörter herausfinden

In [None]:
# Read the vocabulary file again: one stripped entry per line.
with open(dateipfad, 'r', encoding='utf-8') as datei:
    words = [zeile.strip() for zeile in tqdm(datei)]

In [None]:
# Peek at ten consecutive vocabulary entries starting at index 170000.
words[170000:170010]

In [None]:
# Fraction of vocabulary entries that are longer than ten characters.
long_words = sum(1 for entry in words if len(entry) > 10)
long_word_share = long_words / len(words)
print(long_word_share)

In [None]:
# NOTE(review): `vocab_string` is not defined in any cell visible here; unless
# it is defined elsewhere this cell raises NameError. It looks like a leftover
# from before the chunked tagging above - confirm and remove or define it.
doc = nlp(vocab_string)

NUM 1736
ADJ 29240
NOUN 68457
ADV 20331
PROPN 65982
X 2140
PUNCT 27
VERB 15147
ADP 568
AUX 229
DET 674
PART 10
CCONJ 103
PRON 174
SCONJ 16
INTJ 1
204835