In [2]:
# Step 1: load data and check translator row

import pandas as pd

# Read the metadata table from disk (adjust the path to your setup)
df = pd.read_csv("metadata.csv")

# Keep only the columns needed downstream (extend the list if required)
df_small = df[['Translator']]

# Quick look at the first translator labels
translator_column = df_small['Translator']
print("Erste 5 Translator-Werte:")
print(translator_column.head())

# Distinct translator labels and how many of them there are
unique_translators = translator_column.unique()
n_unique_translators = len(unique_translators)
print(f"\nAnzahl eindeutiger Translator: {n_unique_translators}")
print("Translator Liste:", unique_translators)


Erste 5 Translator-Werte:
0    kas
1    kas
2    kas
3    kas
4    bec
Name: Translator, dtype: object

Anzahl eindeutiger Translator: 21
Translator Liste: ['kas' 'bec' 'chm' 'dod' 'mof' 'pap' 'tih' 'mas' 'frp' 'alp' 'paw' 'ans'
 'pam' 'frs' 'evs' 'frf' 'lyj' 'edm' 'anv' 'jog' 'beg']


In [3]:
# Step 2: create list with line indices for each translator

from collections import defaultdict

# Mapping: translator label -> list of positional row numbers carrying that label
translator_to_indices = defaultdict(list)

translator_labels = df_small['Translator'].tolist()
for row_pos, label in enumerate(translator_labels):
    translator_to_indices[label].append(row_pos)

# Sample output: how many rows belong to each translator
for label, row_positions in translator_to_indices.items():
    n_rows_for_label = len(row_positions)
    print(f"Translator '{label}' hat {n_rows_for_label} Zeilen")


Translator 'kas' hat 24 Zeilen
Translator 'bec' hat 529 Zeilen
Translator 'chm' hat 1145 Zeilen
Translator 'dod' hat 35 Zeilen
Translator 'mof' hat 1445 Zeilen
Translator 'pap' hat 32 Zeilen
Translator 'tih' hat 120 Zeilen
Translator 'mas' hat 1356 Zeilen
Translator 'frp' hat 90 Zeilen
Translator 'alp' hat 68 Zeilen
Translator 'paw' hat 520 Zeilen
Translator 'ans' hat 148 Zeilen
Translator 'pam' hat 47 Zeilen
Translator 'frs' hat 6 Zeilen
Translator 'evs' hat 75 Zeilen
Translator 'frf' hat 263 Zeilen
Translator 'lyj' hat 245 Zeilen
Translator 'edm' hat 2 Zeilen
Translator 'anv' hat 28 Zeilen
Translator 'jog' hat 70 Zeilen
Translator 'beg' hat 202 Zeilen


In [4]:
# Step 3: create triplets (each row = anchor once), only with meta data, embeddings in step 4

import random
import numpy as np

# Fixed seed so that repeated runs produce the same triplets
random.seed(42)

# Triplets: list of (anchor_idx, pos_idx, neg_idx), all positional row numbers
triplets = []

# Number of rows
num_rows = len(df_small)

# Read the anchor label by position. The entries of translator_to_indices are
# positional row numbers (built with enumerate), so a label-based .loc lookup
# only agrees with them while the DataFrame index is exactly 0..n-1.
translator_labels = df_small['Translator'].tolist()

# Negative-translator candidates depend only on the anchor's translator, so
# they are built once per translator instead of once per row. The lists have
# the same content and order as before, which keeps the seeded random.choice
# sequence (and therefore the generated triplets) unchanged.
neg_translators_by_anchor = {}

# Anchors that were dropped because no other row shares their translator
skipped_anchors = 0

for anchor_idx in range(num_rows):
    anchor_translator = translator_labels[anchor_idx]

    # Positive candidates: every row of the same translator except the anchor.
    # The comprehension already builds a new list, so no extra copy is needed.
    pos_candidates = [
        idx for idx in translator_to_indices[anchor_translator] if idx != anchor_idx
    ]

    if not pos_candidates:
        # No second example for this translator: no triplet can be formed
        skipped_anchors += 1
        continue

    pos_idx = random.choice(pos_candidates)

    # Negative: first pick a translator different from the anchor's ...
    neg_translators = neg_translators_by_anchor.get(anchor_translator)
    if neg_translators is None:
        neg_translators = [t for t in unique_translators if t != anchor_translator]
        neg_translators_by_anchor[anchor_translator] = neg_translators
    if not neg_translators:
        raise ValueError(
            "Cannot sample a negative example: the data contains only one translator"
        )
    neg_translator = random.choice(neg_translators)
    # ... then pick one of that translator's rows
    neg_idx = random.choice(translator_to_indices[neg_translator])

    triplets.append((anchor_idx, pos_idx, neg_idx))

    # Progress output for the first 5 triplets
    if len(triplets) <= 5:
        print(f"Triplet {len(triplets)}:")
        print(f"  Anchor idx {anchor_idx} (Translator: {anchor_translator})")
        print(f"  Positiv idx {pos_idx} (Translator: {anchor_translator})")
        print(f"  Negativ idx {neg_idx} (Translator: {neg_translator})")
        print()

# Make dropped anchors visible: when this is non-zero, the triplet list is
# shorter than the metadata table.
if skipped_anchors:
    print(f"Übersprungene Anchors ohne Positiv-Beispiel: {skipped_anchors}")

print(f"Anzahl erzeugter Triplets: {len(triplets)}")


Triplet 1:
  Anchor idx 0 (Translator: kas)
  Positiv idx 6431 (Translator: kas)
  Negativ idx 430 (Translator: mof)

Triplet 2:
  Anchor idx 1 (Translator: kas)
  Positiv idx 4776 (Translator: kas)
  Negativ idx 855 (Translator: frp)

Triplet 3:
  Anchor idx 2 (Translator: kas)
  Positiv idx 4772 (Translator: kas)
  Negativ idx 6370 (Translator: mof)

Triplet 4:
  Anchor idx 3 (Translator: kas)
  Positiv idx 6115 (Translator: kas)
  Negativ idx 5181 (Translator: dod)

Triplet 5:
  Anchor idx 4 (Translator: bec)
  Positiv idx 84 (Translator: bec)
  Negativ idx 2 (Translator: kas)

Anzahl erzeugter Triplets: 6450


In [6]:
# Step 4: create triplets with embeddings and save them

import os

import numpy as np

# Load embeddings
embeddings = np.load("embeddings.npy")
print("Embedding-Shape:", embeddings.shape)  # e.g. (6450, 1024)

# The triplet indices are row positions of the metadata table, so they only
# select the right vectors if the embeddings have one row per metadata row.
if embeddings.shape[0] != len(df_small):
    raise ValueError(
        f"embeddings has {embeddings.shape[0]} rows but the metadata has "
        f"{len(df_small)} rows; triplet indices would point at the wrong vectors"
    )

# (n_triplets, 3) integer array with columns anchor, positive, negative.
# reshape(-1, 3) keeps the column layout valid even when there are no triplets.
triplet_idx = np.asarray(triplets, dtype=np.intp).reshape(-1, 3)

# Gather the vectors with integer-array indexing; each result is a new array
# with one embedding row per triplet.
anchors = embeddings[triplet_idx[:, 0]]
positives = embeddings[triplet_idx[:, 1]]
negatives = embeddings[triplet_idx[:, 2]]

print("Shape anchors:", anchors.shape)
print("Shape positives:", positives.shape)
print("Shape negatives:", negatives.shape)

# Output location for the triplets (vectors only, no names).
# np.save does not create missing directories, so make sure the folder exists.
os.makedirs("triplets", exist_ok=True)
np.save("triplets/triplets_anchor.npy", anchors)
np.save("triplets/triplets_positive.npy", positives)
np.save("triplets/triplets_negative.npy", negatives)


Embedding-Shape: (6450, 1024)
Shape anchors: (6450, 1024)
Shape positives: (6450, 1024)
Shape negatives: (6450, 1024)
