# Vergleichen verschiedener Embeddings 

In diesem Notebook werden die verschiedenen Embedding-Algorithmen demonstriert.

### Imports

In [1]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df
import joblib
import json
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from Embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf, calculate_distances_batchwise
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## LLAMA 2 Embeddings

### Laden der Modelle

In [None]:
# Model weights and tokenizer are both fetched from the same Hugging Face checkpoint.
LLAMA2_CHECKPOINT = 'mesolitica/llama2-embedding-1b-8k'

tokenizer = AutoTokenizer.from_pretrained(LLAMA2_CHECKPOINT)
# trust_remote_code=True allows transformers to run the model code shipped with the checkpoint.
model = AutoModel.from_pretrained(LLAMA2_CHECKPOINT, trust_remote_code=True)

In [None]:
# Load the "transcript_sentences" table through the project's DB helper.
df = db_get_df("transcript_sentences")

### Tokenisierung der Sätze

In [None]:
# Tokenise all sentences as one padded batch and request PyTorch tensors.
sentence_texts = df["sentence"].to_list()
input_ids = tokenizer(sentence_texts, return_tensors='pt', padding=True)

In [None]:
# Run the model on the tokenised batch, then detach the result and convert it to NumPy.
encoded_batch = model.encode(input_ids)
v = encoded_batch.detach().numpy()
v.shape

### Speichern der Embeddings

In [None]:
# Encode the tokenised batch once and store each embedding as a JSON array string.
# NOTE(review): assumes model.encode returns one row per input sentence, in the
# same order as df["sentence"] — confirm against the model's remote code.
sentence_embeddings = model.encode(input_ids).detach().numpy()
# .tolist() converts each NumPy row to plain Python floats; json.dumps cannot
# serialise NumPy arrays directly.
df["embedding_json"] = [json.dumps(vector.tolist()) for vector in tqdm(sentence_embeddings)]


### Vereinigen der Tabellen

In [None]:
# Load the DB helper's default table (no table name is passed) and display its row count.
df = db_get_df()
len(df)

In [None]:
# Merge the partial LLaMA-2 embedding tables (suffix "a" first, then the rest) into one frame.
# NOTE(review): the suffix list contains "y" between "d" and "e" — confirm this matches
# the table names that actually exist in the database.
table_id = list("bcdyefghij")
partial_frames = [db_get_df("transcript_segments_llama_2_a")]
for table_suffix in table_id:
    df_temp = db_get_df(f"transcript_segments_llama_2_{table_suffix}")
    # Show the first cell of each partial table as a quick sanity check of what was loaded.
    print(df_temp.head(1).iloc[0, 0])
    partial_frames.append(df_temp)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration.
all_df = pd.concat(partial_frames)


In [None]:
# Persist the merged frame under the table name "transcript_segments_llama_2_all".
db_save_df(all_df, "transcript_segments_llama_2_all")

In [None]:
# Reload the merged table, so the following cells can run without repeating the merge.
all_df = db_get_df("transcript_segments_llama_2_all")

In [None]:
# Load the DB helper's default table; the next cell attaches the JSON embeddings to it.
df = db_get_df()

In [None]:
# Serialise every row of the merged embedding table into one JSON array string
# and attach the strings to df as a new column (assigned by position).
json_strings = []
for row_label, embedding_row in all_df.iterrows():
    json_strings.append(json.dumps(embedding_row.tolist()))
df["embedding_json"] = json_strings

In [None]:
# Spot-check a single row (position 3) after the embedding column was added.
df.iloc[3]

In [None]:
# Persist the frame, including its JSON embedding column, as "transcript_segments_llama_2".
db_save_df(df, "transcript_segments_llama_2")

## TF-IDF für alle Daten

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import os
from dotenv import load_dotenv

# Load variables from a local .env file, then read the data directory from the environment.
# DATA_PATH is None when the variable is not set.
load_dotenv()
DATA_PATH = os.environ.get("DATA_PATH")

TF-IDF auf lemmatisierten Sätzen

In [None]:
# Fit a TF-IDF vectorizer on the lemmatised sentences and persist the fitted vectorizer.
df = db_get_df(table="sentences_lemmatized")
tfidf_vectorizer = TfidfVectorizer()
# Keep the matrix sparse: densifying it with .toarray() would allocate
# rows x vocabulary floats and is not needed by any later step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence_lemmatized'])

joblib.dump(tfidf_vectorizer, os.path.join(DATA_PATH,'matricies/tfidf_vectorizer200k.pkl'))
print("vectorizer file dumped")

In [None]:
# Fit a second TF-IDF vectorizer, this time on the 'sentence' column instead of the lemmas.
# NOTE(review): this rebinds tfidf_vectorizer and tfidf_matrix from the previous cell.
# When the notebook is run top to bottom, the matrix saved in the next section therefore
# comes from this fit, while the vectorizer dumped earlier was fitted on
# 'sentence_lemmatized' — confirm which pairing is intended.
df = db_get_df(table="sentences_lemmatized")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

### Abspeichern

In [None]:
# Store the sparse TF-IDF matrix in SciPy's .npz format below DATA_PATH.
matrix_path = os.path.join(DATA_PATH,'matricies/tf_idf_matrix200k.npz')
sparse.save_npz(matrix_path, tfidf_matrix)

### Laden der Modelle

In [None]:
# Restore the persisted vectorizer and sparse matrix from DATA_PATH.
# NOTE(review): these file names end in "230k", while the cells above write "200k"
# files — confirm that the 230k artefacts exist and are the intended ones.
vectorizer_path = os.path.join(DATA_PATH,'matricies/tfidf_vectorizer230k.pkl')
matrix_path = os.path.join(DATA_PATH,'matricies/tf_idf_matrix230k.npz')

# joblib.load unpickles the file, so only load artefacts from a trusted source.
tfidf_vectorizer = joblib.load(vectorizer_path)
tf_idf_matrix = sparse.load_npz(matrix_path)

### Optional: Speichern des Vokabulars

In [None]:
# Write the fitted vocabulary to a text file, one term per line in sorted order.
# `vocabulary_` is the fitted term -> column-index mapping of the vectorizer;
# TfidfVectorizer has no get_vocab() method.
sorted_dict = dict(sorted(tfidf_vectorizer.vocabulary_.items()))
output_file = 'vocabulary.txt'
# The encoding is set explicitly so non-ASCII terms (e.g. umlauts) are written
# identically on every platform.
with open(output_file, 'w', encoding='utf-8') as file:
    for term in sorted_dict:
        file.write(term + '\n')


### Demonstration

In [None]:
# Run the project's TF-IDF helper with its default arguments.
# NOTE(review): presumably recomputes the TF-IDF artefacts for the whole corpus — confirm in the helper module.
calc_all_tf_idf()

In [None]:
# Query the TF-IDF helper with a sample search phrase.
# calculate_distances_batchwise is already imported in the notebook's first cell,
# so no second import (with a different package path) is needed here.
# NOTE(review): the result is presumably a frame of distances per sentence — confirm in the helper module.
df_tfidf = calculate_distances_batchwise("Geschichte von Deutschland")

### IDF-Analyse

In [None]:
from scipy import sparse
import joblib
import spacy

# Load the persisted TF-IDF artefacts from the current working directory.
# joblib.load unpickles the file, so only load artefacts from a trusted source.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
tf_idf_matrix = sparse.load_npz("tf_idf_matrix.npz")

# Pair each vocabulary term with its inverse-document-frequency weight.
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_
idf_dict = {term: weight for term, weight in zip(feature_names, idf_values)}

# German spaCy pipeline used for lemmatisation below.
nlp = spacy.load("de_core_news_md")

def lemmatize_german_sentence(input_sentence, nlp):
    """Return the lemma of every token in ``input_sentence`` as a list of strings.

    Args:
        input_sentence: Text to analyse.
        nlp: Callable that turns the text into an iterable of tokens exposing
            ``lemma_`` and ``text`` (the notebook passes a loaded spaCy pipeline).

    Returns:
        One string per token, in token order: the token's lemma, or the
        token's original text when the lemma is empty.
    """
    doc = nlp(input_sentence)
    # Fall back to the surface text (a str, not the token object) so callers
    # can use string methods such as .lower() on every element.
    return [token.lemma_ or token.text for token in doc]

In [None]:
# Load the "sentences_lemmatized" table through the project's DB helper.
df = db_get_df("sentences_lemmatized")

In [None]:
# Lemmatise a sample query; `sentence` is rebound from the raw string to the list of lemmas.
sentence = "Wer ist frau meier"
sentence = lemmatize_german_sentence(sentence, nlp)

# Collect (idf weight, lemma) pairs for every lemma whose lower-cased form is in the vocabulary.
encoded_words = []
for word in sentence:
    if word.lower() in idf_dict:
        encoded_words.append((idf_dict[word.lower()], word))
print(encoded_words)

## Sentence Transformer

SBert
synchron - asynchron

Demonstration

In [1]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl
from sentence_transformers import SentenceTransformer
from embedding_creator_MINI_L6 import calculate_distances
from segment_ranking.rank_segments import get_most_similar_documents_MINI_LM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the stored MiniLM embeddings via the project's pickle helper.
# NOTE(review): a later cell unpacks the result of the same call into two names
# (sentences, embeddings) — confirm what load_pkl actually returns.
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [6]:
# Shape of the first 1000 embeddings; the recorded output is (1000, 384).
embeddings[0:1000].shape

(1000, 384)

In [2]:
# Demo query against the MiniLM ranking helper.
# NOTE(review): the second argument is presumably the number of results — the recorded output shows four rows.
get_most_similar_documents_MINI_LM("Oktoberfest München", 4)

100%|██████████| 371/371 [00:01<00:00, 317.83it/s]


das-muenchener-oktoberfest-brotzeit-bier-und-belustigung.mp3
139
heinrich-himmler-chef-der-ss-pedantischer-massenmoerder.mp3
14
muenchen-um-1900-arm-und-reich.mp3
8
das-indianerbild-in-deutschland-phantasien-vom-roten-bruder.mp3
94


Unnamed: 0,filename,sentence,start,end,segment_id,distance
0,das-muenchener-oktoberfest-brotzeit-bier-und-b...,Wolf hat in seinem Bierrausch eine Frau fast t...,796.64,825.0,139,0.231074
1,heinrich-himmler-chef-der-ss-pedantischer-mass...,"Und er ist natürlich kein Charismatiker, wie z...",159.6,197.7,14,0.253953
2,muenchen-um-1900-arm-und-reich.mp3,Auffällig und unübersehbar arm. München im Jah...,28.22,46.12,8,0.26476
3,das-indianerbild-in-deutschland-phantasien-vom...,"Ich denke, dass der Buffalo Bill da schuld war...",555.06,578.46,94,0.288305


In [2]:
# Three English sample sentences for a quick smoke test of the encoder.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the samples and store sentences plus vectors via the project's pickle helper.
embeddings = model.encode(sentences)
save_pkl(sentences, embeddings, "test.pkl")

In [2]:
# Load sentences and embeddings from the stored pickle file.
# NOTE(review): the recorded run of this cell failed with
# "UnpicklingError: pickle data was truncated"; the file was presumably incomplete —
# regenerate it before relying on this cell.
# NOTE(review): an earlier cell binds the result of the same call to a single name —
# confirm whether load_pkl returns one object or a (sentences, embeddings) pair.
sentences, embeddings = load_pkl("MINI_L6_embeddings.pkl")

UnpicklingError: pickle data was truncated

In [2]:
# Load the "transcript_sentences" table as input for the distance calculation below.
df = db_get_df(table="transcript_sentences")

In [3]:
# Compute distances between the query and the sentences in df; the result replaces df.
# NOTE(review): the recorded run of this cell ended with an EOFError — the cause is not
# visible here (possibly the truncated pickle file); fix it before sharing the notebook.
df = calculate_distances("Oktoberfest in München", df)

EOFError: 

In [None]:
# Leftover scratch cell: prints an empty line only.
print()