# Vergleich verschiedener Embeddings

In diesem Notebook werden verschiedene Embedding-Algorithmen demonstriert und miteinander verglichen.

## Imports

In [2]:
import sys

sys.path.append("..")

import json

import joblib
import pandas as pd
from db_connect import db_get_df, db_save_df
from embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## TF-IDF für alle Daten

### Imports

In [None]:
import os
import sys

import numpy as np
from dotenv import load_dotenv
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import joblib
import spacy

sys.path.append("..")

from db_connect import db_get_df, db_save_df, load_pkl, save_pkl, save_npz, load_npz
from segment_ranking.rank_segments import get_most_similar_segments


load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")

### Einfache TF-IDF bilden

In [None]:
# Load the sentence table through the project DB helper.
df = db_get_df("transcript_sentences")
# Fit a TF-IDF vectorizer with default settings on the raw 'sentence' column;
# fit_transform both learns the vocabulary/IDF weights and returns the
# document-term matrix (one row per sentence).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

In [None]:
# TfidfVectorizer has no get_vocab() method; the fitted term -> column-index
# mapping is exposed as the `vocabulary_` attribute.
print(len(tfidf_vectorizer.vocabulary_))

In [None]:
save_pkl(tfidf_vectorizer,"tfidf_vectorizer_compound_split_87k.pkl")
save_npz(tfidf_matrix, "tf_idf_matrix_compound_split_87k.npz")

In [None]:
# Same TF-IDF fit as for the raw sentences, but on the "sentences_lemmatized" table.
# NOTE(review): this still reads the 'sentence' column; confirm that this table
# stores the lemmatized text under that name (another table in this notebook
# shows a separate 'sentence_lemmatized' column).
# NOTE: this rebinds df, tfidf_vectorizer and tfidf_matrix from the earlier cell.
df = db_get_df(table="sentences_lemmatized")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

### Lemmatisieren der Daten

In [None]:
# Load a previously stored TF-IDF matrix and vectorizer. These two calls read
# paths relative to the current working directory via scipy/joblib directly,
# not via the load_npz/load_pkl helpers used in other cells.
tf_idf_matrix = sparse.load_npz("tf_idf_matrix.npz")
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
# Build a lookup from each vocabulary term to its learned IDF weight.
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_values = tfidf_vectorizer.idf_
idf_dict = dict(zip(feature_names, idf_values))
# German spaCy pipeline used for lemmatization below.
nlp = spacy.load("de_core_news_md")

def lemmatize_german_sentence(input_sentence, nlp):
    """Return the lemmas of ``input_sentence`` as a list of strings.

    Args:
        input_sentence: Text to lemmatize.
        nlp: Callable (e.g. a loaded spaCy pipeline) that turns the text into an
            iterable of tokens exposing ``lemma_`` and ``text``.

    Returns:
        One string per token: the token's lemma, or the token's surface text
        when the lemma is empty.
    """
    # Fall back to token.text (a str) rather than the token object itself, so the
    # result is always a list of strings and callers can use str methods on it.
    return [token.lemma_ or token.text for token in nlp(input_sentence)]

In [None]:
df = db_get_df("sentences_lemmatized")

In [None]:
# Demo: look up the IDF weight of every lemma of a sample query.
sentence = "Wer ist frau meier"
# NOTE: `sentence` is rebound here from a str to the list returned by the lemmatizer.
sentence = lemmatize_german_sentence(sentence, nlp)

# Pair each lemma with its IDF weight; lemmas whose lowercase form is not in
# idf_dict are silently dropped.
encoded_words = [(idf_dict[word.lower()], word) for word in sentence if word.lower() in idf_dict]
print(encoded_words)

### Abspeichern

In [None]:
# Argument order is (matrix, filename), matching the other save_npz calls in this notebook.
save_npz(tfidf_matrix, 'tf_idf_matrix200k.npz')

### Laden der Modelle

In [None]:
tf_idf_matrix = load_npz('tf_idf_matrix_230k.npz')
tfidf_vectorizer = load_pkl('tfidf_vectorizer_230k.pkl')

In [None]:
# len() is not defined for scipy sparse matrices (it raises TypeError);
# the number of rows (documents) is shape[0].
tf_idf_matrix.shape[0]

### Optional: Speichern des Vokabulars

In [None]:
# The fitted vocabulary (term -> column index) is the `vocabulary_` attribute;
# TfidfVectorizer has no get_vocab() method. Sort the terms alphabetically.
sorted_dict = dict(sorted(tfidf_vectorizer.vocabulary_.items()))
output_file = 'vocabulary.txt'
# Write one term per line. The encoding is set explicitly because the vocabulary
# contains German text (umlauts, ß) and the platform default may not be UTF-8.
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{key}\n" for key in sorted_dict)


### Demonstration

In [1]:
calc_all_tf_idf()

NameError: name 'calc_all_tf_idf' is not defined

In [None]:
# NOTE(review): the first import cell of this notebook imports calc_all_tf_idf from
# `embedding_creation.embedding_creator_TF_IDF`, while this cell uses
# `scripts.Embedding_creation.embedding_creator_TF_IDF` - confirm which module
# path is valid from the notebook's working directory.
from scripts.Embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf, calculate_distances_batchwise


# Run the TF-IDF distance helper for a sample query and keep its result.
df_tfidf =  calculate_distances_batchwise("Geschichte von Deutschland")

### Lemmatisierung

## Sentence Transformer

SBert
synchron - asynchron

Demonstration

In [4]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl
from sentence_transformers import SentenceTransformer
from embedding_creator_MINI_L6 import MINI_LM_embed
from segment_ranking.rank_segments import get_most_similar_segments

In [5]:
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [6]:
embeddings[0:1000].shape

(1000, 384)

In [7]:
# NOTE(review): the recorded output of this cell reports that no embedding method
# or model exists for model type "MINILM" and then fails with an AttributeError -
# confirm the model-type key expected by get_most_similar_segments.
get_most_similar_segments("MINILM","Oktoberfest München", 4, 4)

No embedding method for model type MINILM found
No model for model type MINILM found


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
# Encode three sample sentences with the MiniLM sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']


embeddings = model.encode(sentences)
# The executed save_pkl calls in this notebook use (object, filename); store the
# sentences together with their embeddings as a single tuple object.
save_pkl((sentences, embeddings), "test.pkl")

In [None]:
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
embeddings[0]

In [None]:
df = db_get_df(table="transcript_sentences")

In [None]:
# NOTE(review): `calculate_distances` is not imported or defined in any cell visible
# in this notebook - confirm where it comes from before running this cell.
# The result replaces the sentence DataFrame held in `df`.
df = calculate_distances("Oktoberfest in München", df)

In [None]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl
from sentence_transformers import SentenceTransformer
from embedding_creator_MINI_L6 import all_document_embeddings_batchwise_MINI_LM
import pickle

In [None]:
# Encode three sample sentences with the MiniLM sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']


embeddings = model.encode(sentences)
# The executed save_pkl calls in this notebook use (object, filename); store the
# sentences together with their embeddings as a single tuple object.
save_pkl((sentences, embeddings), "test.pkl")

In [None]:
# This file holds a single embedding array (an executed cell above slices it to
# shape (1000, 384)), so it cannot be unpacked into (sentences, embeddings).
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
len(embeddings)

In [None]:
df = db_get_df(table="transcript_sentences")

In [None]:
# Embed every transcript sentence with MiniLM.
embeddings = all_document_embeddings_batchwise_MINI_LM(df["sentence"])
# Persist only the embedding matrix, using the (object, filename) call form of
# save_pkl. The `sentences` variable from earlier cells does not correspond to
# these embeddings and must not be stored alongside them.
save_pkl(embeddings, "MINI_LM.pkl")

## Sentence Transformer + TF-IDF

In [None]:
import sys
sys.path.append('..')

from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
import scipy.sparse as sparse
import numpy as np

In [None]:
mini_lm_matrix = load_pkl("MINI_L6_embeddings.pkl")
tf_idf_matrix = load_npz("tf_idf_matrix_230k.npz")

In [None]:
print(mini_lm_matrix.shape)
print(tf_idf_matrix.shape)

In [None]:
# Normalize the dense embeddings to an ndarray. The result is assigned back to
# mini_lm_matrix so that the conversion is actually used below (it was previously
# written to an unused variable).
if not isinstance(mini_lm_matrix, np.ndarray):
    mini_lm_matrix = np.array(mini_lm_matrix)

# Both matrices must describe the same sentences row by row before they can be
# concatenated column-wise; fail early with a readable message otherwise.
if mini_lm_matrix.shape[0] != tf_idf_matrix.shape[0]:
    raise ValueError(
        f"Row mismatch: TF-IDF has {tf_idf_matrix.shape[0]} rows, "
        f"MiniLM has {mini_lm_matrix.shape[0]} rows"
    )

mini_lm_sparse_matrix = sparse.csr_matrix(mini_lm_matrix)

# Column-wise concatenation: [TF-IDF features | MiniLM embedding dimensions].
combined_matrix = sparse.hstack([tf_idf_matrix, mini_lm_sparse_matrix], format="csr")

In [None]:
save_npz(combined_matrix, "tf_idf_mini_lm_matrix.npz")

## OpenAI Embeddings

In [2]:
import sys
sys.path.append('..')
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
from embeddings_openai import get_embedding_openai
from tqdm import tqdm
import tiktoken
import pandas as pd


MODEL_ID = "text-embedding-3-small"

In [3]:
df = db_get_df("transcript_sentences")

In [35]:
len(df)

433562

In [4]:
encoding = tiktoken.encoding_for_model(MODEL_ID)

In [None]:
encoding.encode("hey, ich komme heute nicht zur party")

In [None]:
encoding.decode([36661, 11, 10864, 11129, 2727, 49714, 8969, 17761, 4717])

In [5]:
tokens = [encoding.encode(sentence) for sentence in tqdm(df["sentence"])]

100%|██████████| 433562/433562 [00:24<00:00, 17625.15it/s]


In [6]:
df["token"] = tokens

In [7]:
df['token_count'] = df["token"].apply(len)

In [1]:

# 1424 iterationen
def batch_sentences(df, max_tokens=8191):
    """Yield consecutive slices of ``df["token"]`` that fit a token budget.

    Rows are consumed in order. A batch is closed as soon as adding the next row
    would push the running sum of ``token_count`` above ``max_tokens``.

    Args:
        df: DataFrame with a ``token`` column and a numeric ``token_count`` column.
        max_tokens: Upper bound for the summed ``token_count`` of one batch.

    Yields:
        pandas Series (the ``token`` column) for each non-empty batch. A single
        row whose ``token_count`` exceeds ``max_tokens`` is emitted alone.
    """
    start_idx = 0
    current_tokens = 0
    # Iterate by position, not by index label: the slices below use iloc, so a
    # DataFrame with a non-default index (e.g. a tail slice that was not reset)
    # must not feed its labels into the slice bounds.
    for pos, token_count in enumerate(df["token_count"]):
        if current_tokens + token_count > max_tokens:
            # Skip the empty slice that would result when the very first row of
            # a batch already exceeds the budget.
            if pos > start_idx:
                yield df.iloc[start_idx:pos]["token"]
            start_idx = pos
            current_tokens = token_count
        else:
            current_tokens += token_count

    # Emit whatever is left after the last flush.
    if start_idx < len(df):
        yield df.iloc[start_idx:]["token"]



In [None]:
batch = next(batch_sentences(df))
solo_embed = get_embedding_openai(batch.to_list(), MODEL_ID)
print(solo_embed)

In [None]:
print(len(solo_embed))
print(solo_embed[4])

In [None]:
embeddings = load_pkl("embeddings_OPENAI_252769.pkl")

In [9]:
embeddings = []

In [28]:
df_temp = df.iloc[252769:].reset_index(drop=True)

In [29]:
batch = next(batch_sentences(df_temp))

In [30]:
batch

0      [33717, 289, 8154, 55202, 305, 42303, 9267, 89...
1      [17812, 6127, 10709, 1941, 331, 2357, 372, 408...
2      [17812, 9072, 386, 16317, 14104, 2815, 43886, ...
3      [17812, 16095, 2815, 31331, 5086, 268, 2563, 2...
4      [5001, 86, 20578, 48108, 11168, 2815, 44193, 2...
                             ...                        
305    [18674, 469, 485, 12928, 89285, 11, 5568, 2761...
306    [41, 799, 2807, 747, 531, 96138, 54265, 473, 3...
307    [50, 361, 1344, 2448, 14244, 295, 9267, 304, 6...
308    [6219, 52392, 23935, 13045, 62734, 11586, 301,...
309            [77968, 18955, 380, 11, 294, 662, 71, 13]
Name: token, Length: 310, dtype: object

In [31]:
embedding_batch = get_embedding_openai(batch.to_list(), MODEL_ID)

In [15]:
df.iloc[4]

filename                   baruch-de-spinoza-die-suche-nach-der-wahrheit.mp3
sentence                   Mit der Zustimmung des Heiligen Gottes und die...
start                                                                  49.82
end                                                                    72.56
sentence_lemmatized        Mit der Zustimmung des Heiligen Gottes und die...
sentence_compound_split    Mit der Zustimmung des Gottes und dieser verkü...
segment_id                                                                 4
token                      [55470, 2761, 94816, 12828, 2234, 951, 1283, 3...
token_count                                                               67
Name: 4, dtype: object

In [32]:
# Embed the remaining sentences batch by batch and collect the raw vectors.
for batch in tqdm(batch_sentences(df_temp)):
    embedding_batch = get_embedding_openai(batch.to_list(), MODEL_ID)
    # Each returned item carries its vector in the `.embedding` attribute.
    embeddings.extend(item.embedding for item in embedding_batch)

591it [42:17,  4.29s/it]


In [33]:
len(embeddings)

180793

In [None]:
df = pd.DataFrame({"embeddings":embeddings})

In [34]:
save_pkl(embeddings, "embeddings_OPENAI_180793.pkl")

/Users/br/Projects/Bachelorarbeit/data/matrices/embeddings_OPENAI_180793.pkl


In [None]:
embeddings

In [None]:
df["embedding"] = embeddings

In [None]:
def search_reviews(df, product_description, n=3, pprint=True, embedding_col=None):
    """Return the ``n`` rows of ``df`` most similar to a free-text query.

    The query is embedded with the project's OpenAI helper and compared to the
    stored row embeddings by cosine similarity.

    Args:
        df: DataFrame holding one embedding vector per row. A ``similarities``
            column is added to (or overwritten in) this frame.
        product_description: Query text to embed.
        n: Number of best matches to return.
        pprint: Accepted for backward compatibility; currently unused.
        embedding_col: Name of the column holding the row embeddings. Defaults to
            ``"ada_embedding"`` when that column exists, otherwise ``"embedding"``.

    Returns:
        The ``n`` rows with the highest cosine similarity, sorted descending.
    """
    import numpy as np

    if embedding_col is None:
        embedding_col = "ada_embedding" if "ada_embedding" in df.columns else "embedding"

    # Elsewhere in this notebook get_embedding_openai is called with a list input
    # and returns items exposing `.embedding`; the same call shape is used here.
    # NOTE(review): confirm the helper accepts a list of raw strings.
    response = get_embedding_openai([product_description], 'text-embedding-3-small')
    query = np.asarray(response[0].embedding, dtype=float)

    if len(df) == 0:
        df['similarities'] = np.empty(0, dtype=float)
    else:
        # Stack the per-row vectors into an (n_rows, dim) matrix and compute all
        # cosine similarities at once.
        matrix = np.asarray(df[embedding_col].tolist(), dtype=float)
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        # Zero-norm vectors get similarity 0 instead of a division by zero.
        safe_denom = np.where(denom == 0, 1.0, denom)
        df['similarities'] = np.where(denom == 0, 0.0, (matrix @ query) / safe_denom)

    res = df.sort_values('similarities', ascending=False).head(n)
    return res

## LLAMA 2 Embeddings

### Laden der Modelle

In [None]:
model = AutoModel.from_pretrained('mesolitica/llama2-embedding-1b-8k', trust_remote_code = True)
tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama2-embedding-1b-8k')

In [None]:
df = db_get_df("transcript_sentences")

### Tokenisierung der Sätze

In [None]:
input_ids = tokenizer(
    df["sentence"].to_list(), 
    return_tensors = 'pt',
    padding = True
)

In [None]:
v = model.encode(input_ids).detach().numpy()
v.shape

### Speichern der Embeddings

In [None]:
# Encode sentence by sentence and store each vector as a JSON list.
# - Iterate over the sentences themselves: iterating the tokenizer output would
#   yield its dictionary keys, not one entry per sentence.
# - Convert the array to a plain list first: json.dumps cannot serialize numpy arrays.
# NOTE(review): reshape(-1) assumes model.encode returns exactly one vector for a
# single-sentence input - confirm against the model's remote code.
df["embedding_json"] = [
    json.dumps(
        model.encode(tokenizer([sentence], return_tensors='pt', padding=True))
        .detach()
        .numpy()
        .reshape(-1)
        .tolist()
    )
    for sentence in tqdm(df["sentence"])
]


In [None]:
df = db_get_df()
len(df)

In [None]:
# Merge the partial LLaMA-2 embedding tables (suffix "a" plus the suffixes below)
# into one DataFrame, in this order.
# NOTE(review): the suffix list contains "y" between "d" and "e" - confirm that a
# table with that suffix exists and is intended.
table_id = list("bcdyefghij")
frames = [db_get_df("transcript_segments_llama_2_a")]
for suffix in table_id:  # `suffix` instead of `id`, which would shadow the builtin
    df_temp = db_get_df(f"transcript_segments_llama_2_{suffix}")
    # Show the first cell of each part as a quick sanity check.
    print(df_temp.head(1).iloc[0, 0])
    frames.append(df_temp)
# Concatenate once at the end; concatenating inside the loop copies the growing
# frame on every iteration.
all_df = pd.concat(frames)


In [None]:
db_save_df(all_df, "transcript_segments_llama_2_all")

In [None]:
all_df = db_get_df("transcript_segments_llama_2_all")

In [None]:
df = db_get_df()

In [None]:
json_strings = [json.dumps(row.tolist()) for index, row in all_df.iterrows()]
df["embedding_json"] = json_strings

In [None]:
df.iloc[3]

In [None]:
db_save_df(df, "transcript_segments_llama_2")

## Voyage Embeddings

In [17]:
from transformers import AutoTokenizer
from tqdm import tqdm
from db_connect import save_pkl, db_get_df

In [65]:

import os

import voyageai
from dotenv import load_dotenv
from transformers import AutoTokenizer

# Tokenizer used to count tokens per sentence for batching.
tokenizer = AutoTokenizer.from_pretrained("voyageai/voyage")

# Read the API key from the .env file / environment.
load_dotenv()
API_KEY = os.getenv("VOYAGE_API_KEY")

# Pass the key explicitly so the client does not depend on its own implicit
# environment lookup (API_KEY was previously read but never used).
vo = voyageai.Client(api_key=API_KEY)


def get_embedding_voyage(input, model="voyage-lite-02-instruct"):
    """Embed text with the module-level Voyage client ``vo``.

    Args:
        input: Either a single string or a collection of inputs. A single string
            has its newlines replaced by spaces and is wrapped in a one-element
            list; any other value is passed through unchanged.
        model: Name of the Voyage embedding model to use.

    Returns:
        The ``embeddings`` attribute of the client's response.
    """
    texts = [input.replace("\n", " ")] if isinstance(input, str) else input
    response = vo.embed(texts, model=model, input_type="document")
    return response.embeddings


In [79]:
def batch_sentences(df, max_tokens=8191, max_sentences=128):
    """Yield consecutive slices of ``df["sentence"]`` within token and size limits.

    Rows are consumed in order. A batch is closed when adding the next row would
    push the summed ``token_count`` above ``max_tokens``, or when the batch
    already holds ``max_sentences`` rows.

    Note: this redefines the ``batch_sentences`` from the OpenAI section of this
    notebook, which yields the ``token`` column instead.

    Args:
        df: DataFrame with a ``sentence`` column and a numeric ``token_count`` column.
        max_tokens: Upper bound for the summed ``token_count`` of one batch.
        max_sentences: Upper bound for the number of rows in one batch.

    Yields:
        pandas Series (the ``sentence`` column) for each non-empty batch. A single
        row whose ``token_count`` exceeds ``max_tokens`` is emitted alone.
    """
    start_idx = 0
    current_tokens = 0
    sentence_count = 0

    # Positional enumeration keeps the iloc slices correct for any index.
    for pos, sentence_tokens in enumerate(df["token_count"]):
        over_budget = current_tokens + sentence_tokens > max_tokens
        if over_budget or sentence_count == max_sentences:
            # Never emit an empty batch (happens when the first row of a batch
            # alone exceeds the token budget).
            if pos > start_idx:
                yield df.iloc[start_idx:pos]["sentence"]
            # The current row opens the next batch.
            start_idx = pos
            current_tokens = sentence_tokens
            sentence_count = 1
        else:
            current_tokens += sentence_tokens
            sentence_count += 1

    # Emit the trailing batch, if any rows remain.
    if start_idx < len(df):
        yield df.iloc[start_idx:]["sentence"]


In [5]:
df = db_get_df("transcript_sentences")

In [24]:
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage')
tokens = [tokenizer.encode(sentence) for sentence in tqdm(df["sentence"])]

100%|██████████| 433562/433562 [01:19<00:00, 5473.22it/s]


In [26]:
df["token"] = tokens

In [27]:
df['token_count'] = df["token"].apply(len)

In [32]:
df['token_count']

23

In [45]:
embeddings = []

In [87]:
# Resume embedding at row 194851 and append every returned vector to `embeddings`.
for batch in tqdm(batch_sentences(df.iloc[194851:], 3000)):
    embedding_batch = get_embedding_voyage(batch.to_list())
    embeddings.extend(embedding_batch)
# Persist the collected vectors once the loop has finished.
save_pkl(embeddings, "embeddings_voyage")

0it [00:00, ?it/s]

2282it [1:10:37,  1.86s/it]


/Users/br/Projects/Bachelorarbeit/data/matrices/embeddings_voyage


In [88]:
len(embeddings)

433562

In [86]:
len(df.iloc[194851:])

238711

In [82]:
save_pkl(embeddings, "voyage_embeds_194851")

/Users/br/Projects/Bachelorarbeit/data/matrices/voyage_embeds_194851


In [None]:
batch_sentences(df)

In [49]:
len(embeddings[0])

1024

In [54]:
gen = batch_sentences(df, 3000)
print(next(gen).to_list())
print(next(gen).to_list())
print(next(gen).to_list())



['Verfolgt und ausgegrenzt.', 'Der niederländische Philosoph Baruch de Spinoza ist schon im 17. Jahrhundert für die demokratische Verfassung eines Staates eingetreten.', 'Die Existenz eines jüdisch -christlichen Schöpfergottes hat er verneint.', 'Nach dem Beschluss der Engel und dem Zeugnis der Heiligen bannen, verstoßen, verwünschen und verfluchen wir Baruch de Spinoza.', 'Mit der Zustimmung des Heiligen Gottes und dieser verkündet die jüdische Gemeinde in Amsterdam mit diesen drastischen Worten, dass sie ihr Gemeindemitglied, den 23 -jährigen Baruch de Spinoza, ab sofort aus ihrer Mitte ausschließt.', 'Mitte des 17. Jahrhunderts konnte ein solcher Bann das soziale Leben eines Menschen vernichten.', 'Zumindest aber machte er die derart ausgeschlossenen zu gesellschaftlichen Außenseitern, die häufig verleumdet oder sogar verfolgt wurden.', 'Genau das war das Leben, das jetzt vor dem jungen Baruch de Spinoza lag.', 'Am 24. November 1632 wurde Baruch de Spinoza als zweiter Sohn jüdischer

In [71]:
input_list = next(gen).to_list()

In [72]:
len(input_list)

19

In [73]:
embedding_batch = get_embedding_voyage(input_list)

In [77]:
len(embedding_batch[2])


1024

In [55]:
for batch in tqdm(batch_sentences(df, 3000)):
    if len(batch) > 127:
        print(len(batch))

40it [00:00, 87.17it/s]

134
155
133


73it [00:00, 90.19it/s]

138
133
129


126it [00:01, 119.54it/s]

135
130
143


151it [00:01, 100.08it/s]

129
131


172it [00:01, 81.90it/s] 

134


204it [00:02, 90.57it/s]

128


234it [00:02, 113.18it/s]

130
140
144
148


269it [00:02, 99.09it/s] 

132
132
142


291it [00:03, 97.92it/s]

131


315it [00:03, 95.10it/s] 

130
157
135
130
136


372it [00:03, 107.79it/s]

140
135
149
155
136
134


413it [00:04, 119.35it/s]

135
139
128


440it [00:04, 118.45it/s]

130
137
132
131
136


464it [00:04, 106.18it/s]

135
128
130
136
134
128


497it [00:05, 96.78it/s] 

129
129
152
188
142
137


526it [00:05, 86.75it/s]

192
133
133


549it [00:05, 94.43it/s]

155
142
132


583it [00:06, 98.03it/s] 

136
141
128
133
139


605it [00:06, 102.05it/s]

128
151


626it [00:06, 83.96it/s] 

132
128
136


660it [00:06, 98.98it/s]

135
152
128


686it [00:07, 109.42it/s]

129


712it [00:07, 109.83it/s]

129
159
131


736it [00:07, 95.70it/s] 

172
146
143
130
133


777it [00:07, 95.88it/s]

129
142
128


835it [00:08, 107.82it/s]

139
129
131
129
132


884it [00:09, 102.16it/s]

133
144
130
137
160
129
139


955it [00:09, 129.17it/s]

142
140
135
130
129


997it [00:09, 128.66it/s]

135
131
132
128


1023it [00:10, 117.58it/s]

132
141
130


1093it [00:10, 132.32it/s]

140
138
129


1136it [00:10, 130.16it/s]

145
128
128
149
144
133
139
131


1177it [00:11, 109.02it/s]

140


1200it [00:11, 107.33it/s]

134
131


1241it [00:11, 127.70it/s]

128
132
128


1299it [00:12, 135.59it/s]

136
135
132
170
150


1341it [00:12, 127.55it/s]

129
162
136
130
139
141


1396it [00:13, 119.33it/s]

159
132
129
131
134
130
131
129


1450it [00:13, 122.54it/s]

129
131
137
133
134
150


1475it [00:13, 111.69it/s]

150
139
131
139


1501it [00:14, 114.52it/s]

139
152
128
140
135
128


1558it [00:14, 131.30it/s]

131
138
134
143
150
132
137
134
153


1586it [00:14, 129.54it/s]

129
128
132
129


1631it [00:14, 135.33it/s]

130
130
133
136
130
130


1660it [00:15, 137.59it/s]

129
133
134
135
133


1691it [00:15, 136.38it/s]

139
137
179
176
144


1736it [00:15, 132.96it/s]

138
135
141
142
133
154


1763it [00:15, 122.41it/s]

135
137
128
133


1789it [00:16, 117.55it/s]

140
128
169
130
133


1839it [00:16, 115.42it/s]

157
148
162


1854it [00:16, 124.70it/s]

137
132


1891it [00:17, 98.73it/s] 

145
131
131
139
141
134


1916it [00:17, 109.24it/s]

136
128
136


1977it [00:17, 106.06it/s]

130
135
128
131
179
134


1999it [00:18, 103.35it/s]

130
161
151
134
128
135


2055it [00:18, 127.36it/s]

140
136
138


2104it [00:19, 95.67it/s] 

140
132
129
134
131


2131it [00:19, 110.23it/s]

144
136
157
135
132


2186it [00:19, 123.14it/s]

143
137
136
137
128
139


2230it [00:20, 137.07it/s]

134
137
133
129


2286it [00:20, 137.65it/s]

138
141
150
131
131
137
129
149
131
130


2317it [00:20, 141.85it/s]

128
130
128
141
151
144
137


2362it [00:21, 138.34it/s]

133
136


2409it [00:21, 148.37it/s]

137
129
129


2439it [00:21, 139.56it/s]

128
135
134
137
133


2468it [00:21, 130.77it/s]

128
132
152
136
134


2507it [00:22, 112.92it/s]

138
134


2531it [00:22, 111.62it/s]

128
134
147


2568it [00:22, 109.89it/s]

128
150
132
159
130
146
139
128
128


2608it [00:23, 124.43it/s]

145
144
141
131
135
160


2621it [00:23, 115.69it/s]

166
132
129


2654it [00:23, 93.26it/s] 

135
131
132
134
147
152
134


2695it [00:23, 113.99it/s]

131
160
170
140


2735it [00:24, 121.13it/s]

129
133
128
133
131
136
130
143
138


2778it [00:24, 131.86it/s]

133


2806it [00:24, 129.16it/s]

187
130
129
135
143
129


2847it [00:25, 130.74it/s]

129
129
144
133
144
146


2907it [00:25, 137.44it/s]

130
135
133


2965it [00:25, 136.42it/s]

141
140
143
128
134


3006it [00:26, 117.22it/s]

141
157
143
143
128
132


3051it [00:26, 127.26it/s]

132
134
128
133
139
140


3077it [00:26, 118.38it/s]

133
133
143
133
134
130


3128it [00:27, 117.16it/s]

128
133


3141it [00:27, 119.51it/s]

137
134


3194it [00:27, 117.86it/s]

130
137


3224it [00:28, 126.22it/s]

129
131
129
135
144
128


3266it [00:28, 134.04it/s]

144
134
128
140
152


3295it [00:28, 132.63it/s]

131
140
139
142
135
142


3326it [00:28, 140.70it/s]

128


3371it [00:29, 142.12it/s]

128
128
133


3401it [00:29, 135.40it/s]

130
137
132
130
128
128


3432it [00:29, 143.40it/s]

131
130


3480it [00:29, 149.76it/s]

129
150
150
147
137
133
128


3525it [00:30, 137.71it/s]

128
153
132
132


3568it [00:30, 138.75it/s]

128
133


3599it [00:30, 144.82it/s]

132
136
133
148
129
138


3644it [00:31, 139.48it/s]

153
147
135
138
131


3689it [00:31, 117.84it/s]

129
136
158
138
139
173
166


3731it [00:31, 128.89it/s]

128
138
129
143


3774it [00:32, 136.54it/s]

140
133
138
135
138
142


3816it [00:32, 131.58it/s]

138
146
137


3863it [00:32, 139.09it/s]

130
141
133
144
130


3909it [00:33, 145.63it/s]

142


3957it [00:33, 141.37it/s]

128
146


4000it [00:33, 135.99it/s]

157
129


4031it [00:34, 143.12it/s]

130
133
163
139


4061it [00:34, 139.99it/s]

134
131
136
134
164
138


4090it [00:34, 127.84it/s]

128
150
135


4125it [00:34, 118.50it/s]

132
132
152
135
129



