In [None]:
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import json

# Configuration: input and output paths

In [None]:
# File locations used by this notebook. Both are empty placeholders and must be filled in
# before the cells that read the chunk CSV or write the JSON export are run.
chunks_csv_path = ""    # TODO: path to the CSV file produced by '02_Chunking.ipynb' (read below with sep=";" and a "text" column)
json_output_path = ""   # TODO: path of the JSON file to write; it holds the records intended for upsertion to Pinecone

# Create embeddings of the chunks

In [None]:
# Sentence-embedding model used for every chunk in this notebook.
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(embedding_model_name)

In [None]:
# Read the semicolon-separated chunk table and pull out the texts to embed, in row order.
df = pd.read_csv(
    chunks_csv_path,
    encoding="utf-8",
    sep=";",
)
chunks_text = df["text"].to_list()

In [None]:
# Embed all chunk texts in fixed-size batches, keeping the original row order.
batch_size = 128

embeddings = []  # grows by one vector per chunk, appended batch by batch
batch_starts = range(0, len(chunks_text), batch_size)
for start in tqdm(batch_starts, desc="Embedding Chunks..."):
    # List slicing stops at the end of the list, so the last (shorter) batch needs no special case.
    current_batch = chunks_text[start:start + batch_size]
    embeddings.extend(model.encode(current_batch))

## (Optional) Save embeddings as .npy 

In [None]:
# Directory that receives the cached vectors as 'embeddings.npy'.
# NOTE(review): while this is still the empty placeholder the target resolves to "/embeddings.npy".
embeddings_path = ""    # TODO: Path for embeddings to be saved as .npy
embeddings_file = f"{embeddings_path}/embeddings.npy"
np.save(embeddings_file, embeddings)

In [None]:
# Reload the vectors written by the save cell instead of recomputing them.
# `embeddings_path` must already be set (it is assigned in the save cell).
embeddings_file = f"{embeddings_path}/embeddings.npy"
embeddings = np.load(embeddings_file)

## Identify metadata of chunk and create format for upsertion

In [None]:
# Default location of the study-program metadata table.
# NOTE(review): this is a machine-specific absolute path, and the "√º" in it looks like a
# mis-encoded "ü" - confirm the path exists as written, or pass `metadata_csv_path` explicitly.
METADATA_CSV_PATH = r"C:\Users\Anton\Desktop\RAG-Data\Modulhandb√ºcher2024\study_programs_filtered.csv"

# Output key -> CSV column it is read from. Insertion order defines the key order of the result.
_METADATA_FIELDS = {
    "Modulhandbuch Nr.": "identifier",
    "Studiengang": "study_program",
    "Grad": "degree",
    "Studiengang-Form": "study_form",
    "Hochschule": "university",
    "Typ": "type",
    "Standort": "location",
    "Themenfelder": "fields",
    "Link": "website",
}

# Parsed metadata tables keyed by CSV path, so the file is read once instead of once per lookup.
_metadata_tables = {}


def _load_metadata_table(csv_path):
    """Return the parsed metadata CSV for `csv_path`, reading it from disk only on first use."""
    if csv_path not in _metadata_tables:
        _metadata_tables[csv_path] = pd.read_csv(csv_path, sep=";", encoding="latin1")
    return _metadata_tables[csv_path]


def get_metadata_dict(file_nr: int, metadata_csv_path: str = METADATA_CSV_PATH) -> dict:
    """
    Retrieves the metadata of a module handbook, based on its file number.

    The metadata table is parsed once per path and kept in memory; later changes to the
    file are not picked up until the kernel is restarted.

    :param file_nr: The file number of the module handbook (matched against the 'identifier' column).
    :param metadata_csv_path: Semicolon-separated, latin1-encoded CSV holding the metadata.
        Defaults to METADATA_CSV_PATH.
    :return: Metadata as a dict of strings (every value is converted with ``str``).
    :raises KeyError: If no row has the given identifier, or a required column is missing.
    """
    table = _load_metadata_table(metadata_csv_path)
    matches = table.loc[table["identifier"] == file_nr].reset_index(drop=True)
    if matches.empty:
        # Same exception type the bare `.loc[0, ...]` lookup would raise, but with a usable message.
        raise KeyError(f"No metadata row with identifier {file_nr!r} in {metadata_csv_path!r}")

    # If several rows share the identifier, the first one wins.
    values = matches.loc[0, list(_METADATA_FIELDS.values())].astype(str)
    return {key: values[column] for key, column in _METADATA_FIELDS.items()}

In [None]:
# Build one record per chunk in the structure used for upsertion to Pinecone:
# {"id": ..., "values": <embedding>, "metadata": {...}}.
#
# The vectors come from `embeddings`, which was computed (or reloaded) above from
# df["text"] in row order, so position k in `embeddings` belongs to row k of `df`.
# Encoding every chunk a second time here would only repeat that work.
if len(embeddings) != len(df):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(df)} chunks; "
        "re-run the embedding cell (or reload the matching .npy file) first."
    )

resulting_list = []     # one record per chunk, embedding still stored as ndarray
metadata_by_file = {}   # handbook number -> metadata, looked up once per handbook instead of once per chunk

rows = tqdm(df.itertuples(index=False), total=len(df), desc="Building upsert records...")
for position, row in enumerate(rows):
    # Positional access as before: assumes the first two CSV columns are the
    # handbook number and the chunk number - TODO confirm against '02_Chunking.ipynb'.
    file_nr, chunk_nr = row[0], row[1]
    if file_nr not in metadata_by_file:
        metadata_by_file[file_nr] = get_metadata_dict(file_nr)

    # create entry based on pinecone upsertion-structure
    entry = {
        "id": f"mh{file_nr}ch{chunk_nr}",
        "values": embeddings[position],
        # Copy so that records of the same handbook do not share one mutable dict.
        "metadata": dict(metadata_by_file[file_nr]),
    }
    resulting_list.append(entry)

In [None]:
def convert_ndarray_to_list(obj):
    """
    Recursively replace NumPy values with plain Python equivalents so the result
    can be passed to ``json.dump``.

    - ``np.ndarray``  -> nested Python list (via ``tolist()``)
    - NumPy scalars   -> the matching Python scalar (via ``item()``), e.g. np.float32 -> float
    - dict/list/tuple -> same container type with every element converted
    - anything else   -> returned unchanged

    :param obj: Arbitrarily nested structure of dicts, lists, tuples, arrays and scalars.
    :return: A new structure with the same layout and no NumPy arrays or scalars.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # A lone NumPy scalar (e.g. np.float32) is not JSON serializable either.
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_ndarray_to_list(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_ndarray_to_list(item) for item in obj]
    if isinstance(obj, tuple):
        # Arrays hidden inside tuples must be converted too; the tuple type itself is kept.
        return tuple(convert_ndarray_to_list(item) for item in obj)
    return obj

In [None]:
# Replace every ndarray inside the records with plain Python lists; the result is what gets
# written with json.dump further down.
resulting_list_converted = convert_ndarray_to_list(resulting_list)

# Save embeddings with metadata per chunk as json

In [None]:
# Write all records to the configured JSON file; ensure_ascii=False keeps non-ASCII
# characters (e.g. umlauts in the metadata) as-is instead of \u escapes.
with open(json_output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(
        resulting_list_converted,
        json_file,
        ensure_ascii=False,
        indent=4,
    )