# Vergleichen verschiedener Embeddings 

In diesem Notebook werden die verschiedenen Embedding-Algorithmen demonstriert.

### imports

In [None]:
import sys

sys.path.append('..')

import json

import joblib
import pandas as pd
from db_connect import db_get_df, db_save_df
from Embedding_creation.embedding_creator_TF_IDF import (
    calc_all_tf_idf,
    calculate_distances_batchwise,
)
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

## LLAMA 2 Embeddings

### Laden der Modelle

In [None]:
# Load the pretrained LLaMA-2 embedding model and its tokenizer from the same
# checkpoint. trust_remote_code=True allows transformers to execute model code
# shipped with the checkpoint, so only use it with a source you trust.
LLAMA2_CHECKPOINT = 'mesolitica/llama2-embedding-1b-8k'
model = AutoModel.from_pretrained(LLAMA2_CHECKPOINT, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(LLAMA2_CHECKPOINT)

In [None]:
# Fetch the "transcript_sentences" table through the project's db_get_df helper.
df = db_get_df("transcript_sentences")

### Tokenisierung der Sätze

In [None]:
# Tokenize every entry of the "sentence" column as one padded batch and ask
# for PyTorch tensors. Despite its name, `input_ids` holds the complete
# tokenizer output, which is what model.encode() receives below.
input_ids = tokenizer(
    df["sentence"].to_list(),
    padding=True,
    return_tensors='pt',
)

In [None]:
# Run the tokenized batch through the model's encode(), detach the result from
# the autograd graph and convert it to a NumPy array.
v = model.encode(input_ids).detach().numpy()
# Notebook display of the array shape.
# NOTE(review): presumably (number of sentences, embedding dimension) — confirm.
v.shape

### Speichern der Embeddings

In [None]:
# Encode the tokenized batch and store each sentence embedding as a JSON
# string. json.dumps cannot serialize NumPy arrays, so every row is converted
# to a plain list first. Iterating over the tokenizer output directly would
# yield its field names rather than the individual sentences, so the rows of
# the encoded array are iterated instead.
# NOTE(review): assumes encode() returns one row per row of `df` — confirm.
sentence_embeddings = model.encode(input_ids).detach().numpy()
df["embedding_json"] = [json.dumps(row.tolist()) for row in tqdm(sentence_embeddings)]


### Vereinigen der Tables

In [None]:
# Load a table using db_get_df's default arguments and display its length.
# NOTE(review): which table the default selects is defined in db_connect — confirm.
df = db_get_df()
len(df)

In [None]:
# Merge the partial LLaMA-2 embedding tables into one DataFrame, starting with
# table "..._a" followed by the suffixes listed in table_id.
# NOTE(review): the suffix "y" between "d" and "e" looks unusual — confirm it
# is not a typo.
table_id = list("bcdyefghij")
frames = [db_get_df("transcript_segments_llama_2_a")]
for suffix in table_id:
    df_temp = db_get_df(f"transcript_segments_llama_2_{suffix}")
    # Sanity check: show the first cell of each partial table.
    print(df_temp.head(1).iloc[0, 0])
    frames.append(df_temp)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
all_df = pd.concat(frames)


In [None]:
# Persist the merged frame under the table name "transcript_segments_llama_2_all".
db_save_df(all_df, "transcript_segments_llama_2_all")

In [None]:
# Reload the merged table so the following cells can run without redoing the merge.
all_df = db_get_df("transcript_segments_llama_2_all")

In [None]:
# Load the table selected by db_get_df's default arguments.
# NOTE(review): the default table is defined in db_connect — confirm it is the
# one the embeddings belong to.
df = db_get_df()

In [None]:
# Serialize every row of all_df (all of its columns) to a JSON list string and
# attach the strings to df as the "embedding_json" column.
# NOTE(review): relies on df and all_df having the same number of rows — confirm.
json_strings = [
    json.dumps(row_values.tolist())
    for _row_label, row_values in all_df.iterrows()
]
df["embedding_json"] = json_strings

In [None]:
# Spot check: display the row at position 3.
df.iloc[3]

In [None]:
# Persist df, including the new "embedding_json" column, as "transcript_segments_llama_2".
db_save_df(df, "transcript_segments_llama_2")

## TF-IDF für alle Daten

In [None]:
import os
import sys

import numpy as np
from dotenv import load_dotenv
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

sys.path.append('..')

from db_connect import db_get_df, db_save_df, load_pkl, save_pkl, save_npz, load_npz
from embedding_creator_MINI_L6 import calculate_distances
from segment_ranking.rank_segments import get_most_similar_documents_MINI_LM, get_most_similar_documents_tf_idf
from sentence_transformers import SentenceTransformer
import pickle

# Load variables from a .env file into the environment, then read the data
# directory. os.getenv returns None when DATA_PATH is not set.
load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")

### Lemmatisieren der Daten

In [None]:
# Fetch the "sentences_lemmatized" table through the project's db_get_df helper.
df = db_get_df("sentences_lemmatized")

In [None]:
# Display the length of the loaded table.
len(df)

In [None]:

# Load a pickled object from a hard-coded absolute path.
# NOTE(review): the path only exists on the author's machine, so this cell is
# not portable.
# NOTE(review): pickle.load can execute arbitrary code — only open files from
# a trusted source.
filepath = "/Users/br/Projects/Bachelorarbeit/data/matrices/tfidf_vectorizer200k.pkl"
with open(filepath, "rb") as fIn:
    stored_data = pickle.load(fIn)
    # stored_embeddings = stored_data["embeddings"]
    
# tfidf_vectorizer = load_pkl("tfidf_vectorizer_200k.pkl")

In [None]:
# Query the project's TF-IDF ranking helper with a sample search string.
# NOTE(review): the second argument is presumably the number of results — confirm.
get_most_similar_documents_tf_idf("Oktoberfest bayern", 4)

tf-idf auf lemmatisierte Sätze

In [None]:
# Fit a fresh TfidfVectorizer on the "sentence" column and build the
# document-term matrix in one step.
# NOTE(review): this cell reads "transcript_sentences", not the lemmatized
# table — confirm that is intended.
df = db_get_df("transcript_sentences")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])


# save_pkl(tfidf_vectorizer, 'tfidf_vectorizer200k.pkl')
# print("vectorizer file dumped")

In [None]:
# Show the vocabulary size, then persist the fitted vectorizer.
# scikit-learn exposes the learned term -> column-index mapping as the
# `vocabulary_` attribute; TfidfVectorizer has no get_vocab() method.
# print() is used because a notebook only displays the last expression of a cell.
print(len(tfidf_vectorizer.vocabulary_))
save_pkl(tfidf_vectorizer, "tfidf_vectorizer_230k.pkl")

In [None]:
# Fit a fresh TfidfVectorizer on the "sentence" column of the lemmatized table
# and build the document-term matrix in one step.
df = db_get_df(table="sentences_lemmatized")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

### Abspeichern

In [None]:
# Store the TF-IDF matrix as an .npz file below DATA_PATH using SciPy's save_npz.
# NOTE(review): the folder is spelled "matricies" here but "matrices" in the
# hard-coded pickle path used elsewhere in this notebook — confirm which
# directory actually exists.
# NOTE(review): os.path.join raises a TypeError if DATA_PATH is None (variable unset).
matrix_path = os.path.join(DATA_PATH,'matricies/tf_idf_matrix200k.npz')
sparse.save_npz(matrix_path, tfidf_matrix)

### Laden der Modelle

In [None]:
# Reload the persisted TF-IDF matrix and the fitted vectorizer through the
# project's load helpers.
# NOTE(review): the directory these file names resolve against is decided
# inside db_connect — confirm.
tf_idf_matrix = load_npz('tf_idf_matrix_230k.npz')
tfidf_vectorizer = load_pkl('tfidf_vectorizer_230k.pkl')

In [None]:
# Display the number of rows (documents) in the TF-IDF matrix.
# SciPy sparse matrices raise a TypeError on len(), so the first entry of
# .shape is used instead.
# NOTE(review): assumes load_npz returns a SciPy sparse matrix — confirm.
tf_idf_matrix.shape[0]

### Optional: Speichern des Vokabulars

In [None]:
# Write the vectorizer's vocabulary to a text file, one term per line, in
# sorted order. The terms are the keys of the fitted `vocabulary_` mapping
# (TfidfVectorizer has no get_vocab() method).
# NOTE(review): assumes load_pkl returned a fitted scikit-learn vectorizer — confirm.
output_file = 'vocabulary.txt'
sorted_terms = sorted(tfidf_vectorizer.vocabulary_)
# Explicit UTF-8 so non-ASCII characters (e.g. German umlauts) are written the
# same way on every platform instead of depending on the locale default.
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{term}\n" for term in sorted_terms)


### Demonstration

In [None]:
# Run the project's calc_all_tf_idf helper with its default arguments.
# NOTE(review): what it computes and where it stores the result is defined in
# embedding_creator_TF_IDF — confirm before re-running on the full data set.
calc_all_tf_idf()

In [None]:
# NOTE(review): this import uses the `scripts.` package prefix, while the
# import cell at the top of the notebook imports from `Embedding_creation...`
# directly — confirm which path resolves with the current sys.path.
from scripts.Embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf, calculate_distances_batchwise


# Score a sample query with the project's TF-IDF distance helper.
# NOTE(review): presumably returns a DataFrame (judging by the variable name) — confirm.
df_tfidf =  calculate_distances_batchwise("Geschichte von Deutschland")

### IDF-Analyse

In [None]:
from scipy import sparse
import joblib
import spacy

# Load the TF-IDF matrix and the fitted vectorizer; both paths are relative,
# so they are resolved against the current working directory.
tf_idf_matrix = sparse.load_npz("tf_idf_matrix.npz")
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
# Pair every vocabulary term with its learned inverse document frequency.
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_
idf_dict = dict(zip(feature_names, idf_values))
# German spaCy pipeline, passed to lemmatize_german_sentence below.
nlp = spacy.load("de_core_news_md")

def lemmatize_german_sentence(input_sentence, nlp):
    """Return the lemmas of *input_sentence* as a list of strings.

    Args:
        input_sentence: Text to lemmatize.
        nlp: Callable that turns text into an iterable of tokens exposing
            ``lemma_`` and ``text`` (e.g. a loaded spaCy pipeline).

    Returns:
        One string per token: the token's lemma, or the token's original text
        when the lemma is empty.
    """
    # Fall back to token.text (a str) rather than the token object itself, so
    # callers can apply string methods such as .lower() to every element.
    return [token.lemma_ or token.text for token in nlp(input_sentence)]

In [None]:
# Fetch the "sentences_lemmatized" table through the project's db_get_df helper.
df = db_get_df("sentences_lemmatized")

In [None]:
# Look up the IDF weight of every lemma of a sample question. Lemmas whose
# lower-cased form is not in idf_dict are skipped; the result pairs each
# weight with the lemma as returned by the lemmatizer.
sentence = "Wer ist frau meier"
sentence = lemmatize_german_sentence(sentence, nlp)

encoded_words = []
for word in sentence:
    lowered = word.lower()
    if lowered in idf_dict:
        encoded_words.append((idf_dict[lowered], word))
print(encoded_words)

## Sentence Transformer

SBert
synchron - asynchron

Demonstration

In [None]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl
from sentence_transformers import SentenceTransformer
from embedding_creator_MINI_L6 import calculate_distances
from segment_ranking.rank_segments import get_most_similar_documents_MINI_LM

In [None]:
# Load the pickled MiniLM embeddings through the project's load_pkl helper.
# NOTE(review): another cell unpacks the same file as (sentences, embeddings)
# — confirm which layout the pickle actually has.
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
# Display the shape of the first 1000 entries (requires an object with .shape,
# e.g. a NumPy array).
embeddings[0:1000].shape

In [None]:
# Query the project's MiniLM ranking helper with a sample search string.
# NOTE(review): the second argument is presumably the number of results — confirm.
get_most_similar_documents_MINI_LM("Oktoberfest München", 4)

In [None]:
# Smoke test: embed three English sample sentences with all-MiniLM-L6-v2 and
# pickle them together with their embeddings.
# NOTE(review): save_pkl is called with three arguments here but with
# (object, filename) in the TF-IDF cells — confirm the helper's signature.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']


embeddings = model.encode(sentences)
save_pkl(sentences, embeddings, "test.pkl")

In [None]:
# Replace the test embeddings with the pickled MiniLM embeddings.
# NOTE(review): another cell unpacks the same file as (sentences, embeddings)
# — confirm which layout the pickle actually has.
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
# Spot check: display the first entry of the loaded object.
embeddings[0]

In [None]:
# Fetch the "transcript_sentences" table through the project's db_get_df helper.
df = db_get_df(table="transcript_sentences")

In [None]:
# Pass a sample query and the sentence table to the project's
# calculate_distances helper; df is rebound to whatever it returns.
# NOTE(review): presumably the same frame with distance scores added — confirm.
df = calculate_distances("Oktoberfest in München", df)

In [None]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl
from sentence_transformers import SentenceTransformer
from embedding_creator_MINI_L6 import all_document_embeddings_batchwise_MINI_LM
import pickle

In [None]:
# Smoke test: embed three English sample sentences with all-MiniLM-L6-v2 and
# pickle them together with their embeddings.
# NOTE(review): save_pkl is called with three arguments here but with
# (object, filename) in the TF-IDF cells — confirm the helper's signature.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']


embeddings = model.encode(sentences)
save_pkl(sentences, embeddings, "test.pkl")

In [None]:
# Unpack the pickle into sentences and embeddings; this only works if the file
# holds exactly two items.
# NOTE(review): other cells bind the same file to a single variable — confirm
# which layout the pickle actually has.
sentences, embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
# Display the number of loaded embeddings.
len(embeddings)

In [None]:
# Fetch the "transcript_sentences" table through the project's db_get_df helper.
df = db_get_df(table="transcript_sentences")

In [None]:
# Embed the "sentence" column with the project helper (batch-wise, judging by
# its name) and pickle the result.
# NOTE(review): `sentences` is not derived from df here — it is whatever an
# earlier cell left behind — so it may not line up with `embeddings`; confirm.
# NOTE(review): save_pkl is called with three arguments here but with
# (object, filename) in the TF-IDF cells — confirm the helper's signature.
embeddings = all_document_embeddings_batchwise_MINI_LM(df["sentence"])
save_pkl(sentences, embeddings, "MINI_LM.pkl")

## Sentence Transformer + TF-IDF

In [None]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
import scipy.sparse as sparse
import numpy as np

In [None]:
# Load the MiniLM embeddings and the TF-IDF matrix that are combined below.
# NOTE(review): another cell unpacks this pickle as (sentences, embeddings) —
# confirm which layout the file actually has.
mini_lm_matrix = load_pkl("MINI_L6_embeddings.pkl")
tf_idf_matrix = load_npz("tf_idf_matrix_230k.npz")

In [None]:
# Print both shapes; the row counts must match for the horizontal stacking below.
print(mini_lm_matrix.shape)
print(tf_idf_matrix.shape)

In [None]:
# Stack the TF-IDF features and the MiniLM embeddings side by side so each row
# carries both representations.
# The ndarray conversion is assigned back to mini_lm_matrix so that the sparse
# conversion below actually operates on the converted array.
if not isinstance(mini_lm_matrix, np.ndarray):
    mini_lm_matrix = np.asarray(mini_lm_matrix)

mini_lm_sparse_matrix = sparse.csr_matrix(mini_lm_matrix)

# hstack requires both inputs to have the same number of rows.
combined_matrix = sparse.hstack([tf_idf_matrix, mini_lm_sparse_matrix], format="csr")

In [None]:
# Persist the combined matrix through the project's save_npz helper.
save_npz(combined_matrix, "tf_idf_mini_lm_matrix.npz")