## Llama 2 Embeddings

### imports

In [10]:
from tqdm import tqdm
import json
import pandas as pd
import sys
sys.path.append('..')

from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from db_connect import db_get_df, db_save_df

from scripts.Embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf, calculate_distances

  from .autonotebook import tqdm as notebook_tqdm


### Manuelle Erstellung

In [None]:


# Hugging Face model id shared by the embedding model and its tokenizer.
MODEL_ID = 'mesolitica/llama2-embedding-1b-8k'

# NOTE: trust_remote_code=True allows code shipped in the model repository to
# be executed locally; only use it with repositories you trust.
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
df = db_get_df("transcript_segments")

In [None]:
len(df["segment_text"][1300:1600].to_list())

In [None]:
# Tokenize every transcript segment in a single padded batch, returned as
# PyTorch tensors.
segment_texts = df["segment_text"].to_list()
input_ids = tokenizer(segment_texts, return_tensors='pt', padding=True)

In [None]:
# Encode the whole tokenized batch and convert the result to a NumPy array.
v = model.encode(input_ids).detach().numpy()
# Display the shape of the resulting array.
v.shape

In [None]:
# Build one JSON-encoded embedding per transcript segment.
# Each text is tokenized and encoded on its own: iterating over `input_ids`
# would walk the keys of the dict-like tokenizer output, not the texts, and
# json.dumps cannot serialize a NumPy array, hence the .tolist() conversion.
embedding_json = []
for segment_text in tqdm(df["segment_text"].to_list()):
    encoded_segment = tokenizer([segment_text], return_tensors='pt', padding=True)
    # Assumes encode() returns one row per input text — TODO confirm.
    segment_vector = model.encode(encoded_segment).detach().numpy()[0]
    embedding_json.append(json.dumps(segment_vector.tolist()))
df["embedding_json"] = embedding_json


In [None]:
input_ids["input_ids"][0]

In [None]:
len(v[0])

### Vereinigen der Tabellen

In [None]:
df = db_get_df()
len(df)

In [None]:
# Merge the per-shard embedding tables into a single DataFrame: shard "a"
# first, then the remaining suffixes in the order listed below.
# NOTE(review): the suffix list contains "y" between "d" and "e" — confirm
# that this table name is intended.
table_id = list("bcdyefghij")
shard_frames = [db_get_df("transcript_segments_llama_2_a")]
for shard_suffix in table_id:  # avoid shadowing the builtin `id`
    df_temp = db_get_df(f"transcript_segments_llama_2_{shard_suffix}")
    # Print the first cell of each shard as a quick sanity check.
    print(df_temp.head(1).iloc[0, 0])
    shard_frames.append(df_temp)
# Concatenate once instead of re-copying the growing frame on every iteration.
all_df = pd.concat(shard_frames)


In [None]:
db_save_df(all_df, "transcript_segments_llama_2_all")

In [None]:
all_df = db_get_df("transcript_segments_llama_2_all")

In [None]:
df = db_get_df()

In [None]:
# Serialize every row of the merged embedding table to a JSON list and attach
# the strings to `df` as a new column (one string per row, in row order).
row_values = (values.tolist() for _label, values in all_df.iterrows())
json_strings = list(map(json.dumps, row_values))
df["embedding_json"] = json_strings

In [None]:
df.iloc[3]

In [None]:
db_save_df(df, "transcript_segments_llama_2")

### TF-IDF für alle Daten

In [None]:
import numpy as np
# Scratch experiment: concatenate a Series that has a two-level MultiIndex
# with a Series that has the default index, and look at the result.
first_level = ["bar", "bar", "bar", "bar", "foo", "foo", "qux", "qux"]
second_level = ["one", "two", "three", "four", "one", "two", "one", "two"]
arrays = [first_level, second_level]
tuples = list(zip(first_level, second_level))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

multi_indexed = pd.Series(np.random.randn(8), index=index)
default_indexed = pd.Series(np.random.randn(8))
s = pd.concat([multi_indexed, default_indexed])
s

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
from sklearn.metrics.pairwise import linear_kernel
import sys
sys.path.append('..')
from db_connect import db_get_df, db_save_df
import numpy as np
import json
import joblib

# Fit a TF-IDF model on the lemmatized sentences and persist the vectorizer.
df = db_get_df(table="sentences_lemmatized")
lemmatized_sentences = df['sentence_lemmatized']

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_sentences)
# Dense copy of the sparse document-term matrix.
tfidf_array = tfidf_matrix.toarray()

joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("vectorizer file dumped and converting to df")

vectorizer file dumped and converting to df


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Refit a fresh TF-IDF model on the `sentence` column of the same table.
# NOTE(review): this rebinds df, tfidf_vectorizer and tfidf_matrix; the
# recorded run of this cell ended with a KeyboardInterrupt — confirm whether
# the cell is still needed.
df = db_get_df(table="sentences_lemmatized")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

KeyboardInterrupt: 

In [12]:
print(len(tfidf_array[0]))

200341


In [None]:
from scipy import sparse

# Persist the sparse TF-IDF matrix to disk in SciPy's .npz format.
sparse.save_npz("tf_idf_matrix.npz", tfidf_matrix)


In [None]:
from scipy import sparse
import joblib

# Reload the sparse TF-IDF matrix from the .npz file.
# Note the name: this binds `tf_idf_matrix`, not `tfidf_matrix`.
tf_idf_matrix = sparse.load_npz("tf_idf_matrix.npz")
# Reload the vectorizer. joblib.load unpickles the file, so only load files
# that come from a trusted source.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [10]:
# Dump the fitted vocabulary, sorted by term, one term per line.
# A fitted TfidfVectorizer exposes its term -> column-index mapping as the
# `vocabulary_` attribute; it has no `get_vocab()` method.
sorted_dict = dict(sorted(tfidf_vectorizer.vocabulary_.items()))

# Specify the output file name
output_file = 'sorted_keys.txt'

# Write explicitly as UTF-8 so non-ASCII terms (e.g. umlauts) are stored the
# same way regardless of the platform's default encoding.
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(f"{key}\n" for key in sorted_dict)


In [None]:
# Create an empty DataFrame with integer column labels 0 .. len(tfidf_array) - 1.
# NOTE(review): len(tfidf_array) is the first dimension of the array; if one
# column per vocabulary term was intended, the length of a single row is
# probably what is needed — confirm.
df_embedding = pd.DataFrame(columns=list(range(len(tfidf_array))))
print(df_embedding)

In [None]:
# For every TF-IDF row, write one placeholder cell into `df`, then save `df`.
# NOTE(review): only the literal "rep" is stored, in a column named after the
# row number; the TF-IDF values themselves are never written — confirm intent.
# tqdm wraps the array (not the enumerate object) so the bar knows the total.
for i, representation in enumerate(tqdm(tfidf_array)):
    print(representation)
    # One write per row is enough: the earlier inner loop rewrote this same
    # cell once per vector element without ever using the element.
    if len(representation) > 0:
        df.loc[i, f"(tfidf_representation, {i})"] = "rep"
print("finished")
print(df)
db_save_df(df, tablename="transcript_sentences_tf_idf")

In [None]:
# Compare the number of TF-IDF rows with the number of DataFrame rows.
print(len(tfidf_array))
print(len(df))
print()
# Add a placeholder column filled with zeros, one entry per TF-IDF row.
df['tfidf_representation'] = [0] * len(tfidf_array)

In [None]:
calc_all_tf_idf()

In [None]:
from scripts.Embedding_creation.embedding_creator_TF_IDF import calc_all_tf_idf, calculate_distances


df_tfidf =  calculate_distances("Geschichte von Deutschland")

## Eigene IDF

In [6]:
from scipy import sparse
import joblib
import spacy

# Reload the persisted TF-IDF artifacts.
# NOTE: joblib.load unpickles the file; only load files from a trusted source.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
tf_idf_matrix = sparse.load_npz("tf_idf_matrix.npz")

# Build a term -> IDF weight lookup from the fitted vectorizer.
idf_values = tfidf_vectorizer.idf_
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_dict = dict(zip(feature_names, idf_values))

# German spaCy pipeline used for lemmatization.
nlp = spacy.load("de_core_news_md")

def lemmatize_german_sentence(input_sentence, nlp):
    """Return the lemma of every token in *input_sentence* as a list of strings.

    Args:
        input_sentence: Text to lemmatize.
        nlp: Callable that turns the text into an iterable of tokens exposing
            ``lemma_`` and ``text`` attributes (e.g. a loaded spaCy pipeline).

    Returns:
        One string per token, in token order: the token's lemma, or the
        token's own text when the lemma is empty.
    """
    # Fall back to token.text (a str) rather than the token object itself, so
    # callers can always treat the result as a list of strings.
    return [token.lemma_ or token.text for token in nlp(input_sentence)]

In [4]:
df = db_get_df("sentences_lemmatized")

In [7]:
# Lemmatize the query, then pair every lemma that is in the vocabulary with
# its IDF weight. The lookup uses the lower-cased lemma; the pair keeps the
# lemma's original casing.
sentence = "Wer ist frau meier"
sentence = lemmatize_german_sentence(sentence, nlp)

lowered_pairs = ((lemma.lower(), lemma) for lemma in sentence)
encoded_words = [(idf_dict[key], lemma) for key, lemma in lowered_pairs if key in idf_dict]
print(encoded_words)

[(4.895880118288627, 'wer'), (2.072407238502506, 'sein'), (5.2368274695944, 'Frau'), (9.971718606195646, 'Meier')]


In [12]:
df = db_get_df("sentences_lemmatized")


In [26]:
# Pick the query lemma with the highest IDF weight (the last one after an
# ascending sort) and count the sentences that contain it as a substring.
pairs_by_idf = sorted(encoded_words, key=lambda pair: pair[0])
highest_idf_pair = pairs_by_idf[-1]
word = highest_idf_pair[1].lower()
print("lek", word)

occurences = []
for sentence in tqdm(df["sentence"]):
    # Case-insensitive substring match; skip sentences without the word.
    if word not in sentence.lower():
        continue
    occurences.append(word)
print(len(occurences))

lek meier


100%|██████████| 370224/370224 [00:00<00:00, 529709.59it/s]

274





In [15]:
sorted(idf_dict.items(), key=lambda x:x[1])[-100:]

[('überwachsener', 13.128719027345758),
 ('überwachungsapparat', 13.128719027345758),
 ('überwachungsausschuß', 13.128719027345758),
 ('überwachungsausschüssen', 13.128719027345758),
 ('überwachungsgerat', 13.128719027345758),
 ('überwachungsinstitut', 13.128719027345758),
 ('überwachungskamera', 13.128719027345758),
 ('überwachungskomponente', 13.128719027345758),
 ('überwachungsschiff', 13.128719027345758),
 ('überwachungstechnisch', 13.128719027345758),
 ('überwanden', 13.128719027345758),
 ('überwarf', 13.128719027345758),
 ('überwasser', 13.128719027345758),
 ('überweg', 13.128719027345758),
 ('überweiden', 13.128719027345758),
 ('überwese', 13.128719027345758),
 ('überwiegelnd', 13.128719027345758),
 ('überwind', 13.128719027345758),
 ('überwindbar', 13.128719027345758),
 ('überwinder', 13.128719027345758),
 ('überwindet', 13.128719027345758),
 ('überwindungshürde', 13.128719027345758),
 ('überwinter', 13.128719027345758),
 ('überwinternde', 13.128719027345758),
 ('überwinterungs