# Setup

In [None]:
%pip install torch transformers sentence-transformers datasets

# Loading raw Dataset

In [None]:
import json

def load_json_data(filepath):
    """Load and parse a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON (a message is printed in both cases).
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{filepath}'.")
        return None

# Parse the raw export; `nodes` is None when the file is missing or is not valid JSON.
nodes = load_json_data("---")
# Filled by a later cell with {"query", "document"} records derived from `nodes`.
dataset = []

exploration

In [None]:
# Inspect the first node to get a feel for the raw schema.
first_node = nodes[0]
node_id = first_node.get("id")  # not named `id`, which would shadow the builtin
descr = first_node.get("Description")

print(node_id)
print(descr)

# Print every non-empty "text" entry found inside the node's nested dict fields.
for field, value in first_node.items():
    if isinstance(value, dict) and "text" in value and len(value["text"]) > 0:
        print(f"{field} -> text: {value['text']}")

In [None]:
# Start from an empty list so re-running this cell does not append every
# record a second time.
dataset = []

for node in nodes:
    node_id = node.get("id")  # not named `id`, which would shadow the builtin

    # The description is always recorded, even when the key is absent (None).
    dataset.append({
        "query": node_id,
        "document": node.get("Description")
    })

    # Each non-empty "text" entry of a nested dict field becomes one more document.
    for value in node.values():
        if isinstance(value, dict) and "text" in value and len(value["text"]) > 0:
            dataset.append({
                "query": node_id,
                "document": value["text"]
            })

Build the (query, document) DataFrame and clean the document text

In [None]:
import pandas as pd

# Turn the collected records into a frame and normalise the document text:
# remove parenthesised and bracketed spans, then collapse newlines and
# whitespace runs into single spaces.
df = pd.DataFrame(dataset)

cleanup_steps = [
    (r'\(.*?\)', ''),
    (r'\[.*?\]', ''),
    (r'\n', ' '),
    (r'\s+', ' '),
]
for pattern, replacement in cleanup_steps:
    df["document"] = df["document"].str.replace(pattern, replacement, regex=True)

df.head(10)

# Storing the augmented dataset

In [None]:
# Persist the frame as a JSON array with one object per row.
df.to_json("---", orient="records")

# Loading the dataset from drive

In [None]:
from google.colab import drive
# Colab only: make Google Drive available under /content/drive.
drive.mount('/content/drive')

import pandas as pd

# Presumably the file written by the earlier to_json cell — confirm the path.
original = pd.read_json("---")

In [None]:
import pandas as pd
import random

# Show a handful of randomly chosen rows.
# NOTE(review): this samples `df`, not the `original` frame loaded just above;
# on a fresh kernel that starts at the loading section `df` is not defined yet.
num_entries = 5
random_entries = df.iloc[random.sample(range(len(df)), num_entries)]

random_entries

# Data cleaning

In [None]:
# Work on a copy so `original` stays untouched by the cleaning steps below.
df = original.copy()
# Non-null count per column.
df.count()

normalize & remove empty docs

In [None]:
# Trim and lowercase every document, then keep only rows with non-empty text.
df["document"] = df["document"].str.strip().str.lower()

has_text = df["document"].str.len() > 0
df = df[has_text]
df.count()

duplicates

In [None]:
# Every row whose document text occurs more than once, shortest documents first.
is_duplicated = df.duplicated(subset=['document'], keep=False)
duplicate_docs = df[is_duplicated].sort_values('document', key=lambda col: col.str.len())
duplicate_docs

remove duplicates entirely if under a size threshold

In [None]:
# Duplicated documents of at most `threshold` characters are removed from `df`
# entirely (all copies, not just the extra ones).
threshold = 40

is_short = duplicate_docs["document"].str.len() <= threshold
shorter = duplicate_docs[is_short]

df = df[~df["document"].isin(shorter["document"])]
df.count()

In [None]:
# Keep only the duplicated documents above the length threshold. The drop is
# not in-place and ignores labels that are already gone, so the cell can be
# re-run without raising KeyError.
duplicate_docs = duplicate_docs.drop(shorter.index, errors="ignore")
duplicate_docs

In [None]:
# Rows of `df` whose document text still occurs more than once.
df[df.duplicated(subset=['document'], keep=False)]

Rank the queries by the number of documents associated with each one.

In [None]:
# Number of documents per query, largest first.
query_counts = df.groupby("query")["document"].count()
query_counts = query_counts.sort_values(ascending=False)

# Attach the per-query count to every row. Assigning the column instead of
# joining keeps the cell re-runnable: a second run overwrites
# "document_count" rather than colliding with the column added by the first.
df = df.assign(document_count=df.groupby("query")["document"].transform("count"))
df = df.sort_values("document_count", ascending=False)

df.info()

In [None]:
# Rows belonging to queries that own at least one of the remaining duplicated documents.
long_ctx = df[df['query'].isin(duplicate_docs["query"])]
# One line per such query with its document count, largest first.
long_ctx[["query", "document_count"]].drop_duplicates().sort_values("document_count", ascending=False)

In [None]:
# Keep the first occurrence of every document (the frame was sorted by
# document_count beforehand), then keep only the two data columns, ordered by query.
df = (
    df.drop_duplicates(subset=['document'], keep='first')
    [["query", "document"]]
    .sort_values("query")
)
df.info()

In [None]:
import re
def has_html(text):
    """Return True when `text` contains something shaped like a tag: '<', one or more non-'>' characters, '>'."""
    return re.search(r'<[^>]+>', text) is not None

def remove_html(text):
    """Return `text` with every '<...>' tag-like span removed; the text between tags is kept."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

# Flag documents that contain tag-like markup and show them.
df["has_html"] = df["document"].map(has_html)
df[df["has_html"]]

In [None]:
# Snapshot of the rows flagged as containing markup (taken before they are cleaned).
dirty = df[df["has_html"]]
dirty

In [None]:
# Strip the markup from every row whose document matches one of the flagged texts.
dirty_indices = df[df["document"].isin(dirty["document"])].index
df.loc[dirty_indices, "document"] = df.loc[dirty_indices, "document"].apply(remove_html)

# Show the rows that were just cleaned. Selecting by index is required here:
# filtering on the old tagged text again would no longer match the cleaned rows.
df.loc[dirty_indices]

In [None]:
# Remove the helper flag; errors="ignore" keeps the cell re-runnable once the column is gone.
df = df.drop(columns=["has_html"], errors="ignore")

In [None]:
# Display the cleaned frame.
df

In [None]:
# Persist the cleaned frame as a JSON array with one object per row.
df.to_json("---", orient="records")

# Data augmentation

Creating additional queries from multi-word queries:
- changing the order of name and surname
- splitting names into separate queries
- nicknames
- splitting composite words
- introducing random character-level typos

In [None]:
import random

def introduce_typo(word):
    """Return `word` with one random character-level edit applied.

    The edit type (substitution, insertion, deletion, transposition of two
    adjacent characters) and the position are drawn with the `random` module.
    Every character of the input, spaces included, is a candidate position.

    Args:
        word (str): Text to corrupt; may contain several words.

    Returns:
        str: The edited text. Empty input is returned unchanged, and a
        one-character input is left unchanged when deletion or transposition
        is drawn.
    """
    # Nothing to corrupt; this also avoids random.randint(0, -1) raising ValueError.
    if not word:
        return word

    alphabet = "abcdefghijklmnopqrstuvwxyz"
    typo_type = random.choice(["substitution", "insertion", "deletion", "transposition"])
    idx = random.randint(0, len(word) - 1)

    if typo_type == "substitution":
        word = word[:idx] + random.choice(alphabet) + word[idx+1:]

    elif typo_type == "insertion":
        word = word[:idx] + random.choice(alphabet) + word[idx:]

    elif typo_type == "deletion" and len(word) > 1:
        word = word[:idx] + word[idx+1:]

    elif typo_type == "transposition" and len(word) > 1:
        # Swap the chosen character with its right neighbour. When the last
        # position is drawn, swap the final two characters instead of
        # silently returning the input unchanged.
        left = min(idx, len(word) - 2)
        word = word[:left] + word[left+1] + word[left] + word[left+2:]

    return word

# Quick demonstration: five independently corrupted variants of one query.
query = "Monogatari series"
for attempt in range(5):
    print(introduce_typo(query))

### loading

In [None]:
from google.colab import drive
# Colab only: make Google Drive available under /content/drive.
drive.mount('/content/drive')

import pandas as pd

# The cells below read the columns "anchor" and "positive" (and drop "negative") from this frame.
cleaned = pd.read_json("---")

In [None]:
import pandas as pd

# Same load as the previous cell, without the Colab drive mount.
cleaned = pd.read_json("---")

In [None]:
# Anchors that contain a space, a hyphen or an opening parenthesis.
anchors = cleaned["anchor"]
space_queries = cleaned[anchors.str.contains(" ")]
composite = cleaned[anchors.str.contains("-")]
parenthesis = cleaned[anchors.str.contains("\\(")]

# Stack the three selections and keep the first row seen for every anchor.
merged = pd.concat([space_queries, composite, parenthesis]).drop_duplicates(subset=['anchor'])
merged.info()

Keep, for each anchor, the row with the longest `positive` text.

In [None]:
# `merged` already holds a single row per anchor, so picking the longest
# positive among its own rows would change nothing. Compare instead all rows
# of `cleaned` that share one of those anchors.
candidates = cleaned[cleaned['anchor'].isin(merged['anchor'])]
positive_lengths = candidates['positive'].str.len()

# Index label of the longest positive per anchor; groupby returns the anchors sorted.
idx = positive_lengths.groupby(candidates['anchor']).idxmax()
merged = candidates.loc[idx]
merged

find examples with subtitles

In [None]:
# Rows whose anchor contains a colon, one whitespace character and at least one
# word character run (optionally followed by a second one), one row per anchor.
# Inspection only: `subtitle` is reassigned by the next cell.
subtitle = cleaned[cleaned["anchor"].str.contains(':\\s\\w+\\s*\\w*', regex=True)]
subtitle = subtitle.drop_duplicates(subset=['anchor'])
subtitle

In [None]:
# Extract the text after ": " (one or two word-character runs) as a one-column
# frame named "anchor"; rows without a match are dropped. The index of
# `cleaned` is preserved.
subtitle = (
    cleaned['anchor']
    .str.extract(r':\s(\w+\s*\w*)')
    .rename(columns={0: 'anchor'})
    .dropna()
)
subtitle

merge both

In [None]:
# Align the extracted subtitles with `merged` on the row index.
merged = merged.join(subtitle[['anchor']], how='left', rsuffix='_other')

# Where a subtitle exists it replaces the full anchor; the helper column is then removed.
if {'anchor', 'anchor_other'}.issubset(merged.columns):
    merged['anchor'] = merged['anchor_other'].fillna(merged['anchor'])
    merged = merged.drop(columns=['anchor_other'])

Clean the anchors: replace hyphens with spaces, remove parenthesised text, and drop anchors that contain digits.

In [None]:
# Replace hyphens with spaces and remove parenthesised parts, then normalise
# the whitespace those edits leave behind ("Foo (bar)" would otherwise become
# "Foo " and never match "Foo" when de-duplicating).
merged = merged.assign(
    anchor=merged['anchor']
    .str.replace('-', ' ', regex=False)
    .str.replace(r'\([^)]*\)', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Drop anchors containing digits, and anchors that became empty (e.g. an
# anchor that consisted only of a parenthesised part).
has_digit = merged['anchor'].str.contains(r'\d')
is_empty = merged['anchor'].str.len() == 0
merged = merged[~has_digit & ~is_empty]

In [None]:
# One row per anchor, in alphabetical order.
merged = merged.drop_duplicates(subset=['anchor']).sort_values('anchor')
merged

find examples that are probably names

In [None]:
# Anchors made of exactly two whitespace-separated words.
# Copied explicitly because a later cell adds a column to this frame; without
# the copy that assignment targets a slice of `cleaned`.
two_words = cleaned[cleaned['anchor'].apply(lambda x: len(x.split()) == 2)].copy()
two_words

switch order of words

In [None]:
# Remember the unmodified anchor in an "original" column. assign() returns a
# new frame, so nothing is written into a slice of `cleaned`.
two_words = two_words.assign(original=two_words["anchor"])
two_words

In [None]:
# Drop the 'negative' column; errors='ignore' keeps the cell re-runnable.
# NOTE(review): rows derived from this frame will have no negative once they
# are concatenated with frames that do — confirm this is intended for triplet training.
two_words = two_words.drop(columns=['negative'], errors='ignore')
two_words

In [None]:
# Variant with the two words in reversed order ("a b" -> "b a").
two_words_switched = two_words.assign(
    anchor=two_words['anchor'].str.split().str[::-1].str.join(' ')
)
two_words_switched

In [None]:
# Variant that keeps everything after the first word ("a b" -> "b").
two_words_drop_first = two_words.assign(
    anchor=two_words['anchor'].str.split().str[1:].str.join(' ')
)
two_words_drop_first

In [None]:
# Variant that keeps everything before the last word ("a b" -> "a").
two_words_drop_second = two_words.assign(
    anchor=two_words['anchor'].str.split().str[:-1].str.join(' ')
)
two_words_drop_second

In [None]:
# Stack the original two-word rows with their three variants under a fresh 0..n-1 index.
two_words_augmented = pd.concat(
    [two_words, two_words_switched, two_words_drop_first, two_words_drop_second],
    ignore_index=True,
)
two_words_augmented

In [None]:
# Number of rows that repeat an earlier (anchor, positive) pair.
two_words_augmented.duplicated(subset=['anchor', "positive"]).sum()

In [None]:
# Persist the two-word variants as a JSON array with one object per row.
two_words_augmented.to_json("---", orient="records")

merge with other oversampled examples

In [None]:
# Reload the two-word variants from disk (presumably the file saved above — confirm the path).
two_words_augmented = pd.read_json("---")
two_words_augmented

In [None]:
# Append the two-word variants to the other rewritten anchors under a fresh index.
merged = pd.concat([merged, two_words_augmented], ignore_index=True)
merged

In [None]:
# Eyeball 30 random rows. NOTE(review): the sort has no visible effect because
# sample() draws rows at random afterwards; sample(30) raises when the frame has fewer than 30 rows.
merged.sort_values('anchor').sample(30)

In [None]:
# `augmented` starts as a re-indexed copy of `merged`.
# NOTE(review): nothing is merged with `cleaned` here — confirm that is intended.
augmented = merged.copy().reset_index(drop=True)
augmented

In [None]:
# Corrupt every anchor with one random typo, working on a copy of `augmented`.
typos = augmented.copy()
typos['anchor'] = typos['anchor'].map(introduce_typo)
typos

In [None]:
# `augmented` now holds only the typo-corrupted rows.
# NOTE(review): the clean augmented anchors are not kept alongside them — confirm this is intended.
augmented = typos.copy().reset_index(drop=True)
augmented

In [None]:
# Save the augmented rows as a JSON array with one object per row.
augmented.to_json("---", orient="records")

In [None]:
# Second save of the same frame (presumably to a different location — the path is not visible here).
augmented.to_json("---", orient="records")

# Training

In [None]:
from google.colab import drive
import pandas as pd
# Colab only: make Google Drive available under /content/drive.
drive.mount('/content/drive')

augmented = pd.read_json("---")
cleaned = pd.read_json("---")

# Training data = cleaned rows followed by augmented rows, under a fresh index.
data = pd.concat([cleaned, augmented], ignore_index=True)

In [None]:
import pandas as pd

# Same load as the previous cell, without the Colab drive mount.
augmented = pd.read_json("---")
cleaned = pd.read_json("---")

# Training data = cleaned rows followed by augmented rows, under a fresh index.
data = pd.concat([cleaned, augmented], ignore_index=True)

In [None]:
# Rows that are exact repeats of an earlier row (all columns equal).
data[data.duplicated()]

In [None]:
# Remove exact duplicate rows, keeping the first occurrence.
data = data.drop_duplicates()
data

In [None]:
# Lowercase the three text columns.
for column in ('anchor', 'positive', 'negative'):
    data[column] = data[column].str.lower()

# Rows that differed only by letter case are identical now; remove them so
# they are not counted twice in training.
data = data.drop_duplicates()

In [None]:
# Persist the training frame as a JSON array with one object per row.
data.to_json("---", orient="records")

In [None]:
import pandas as pd

# Reload the training frame (presumably the file saved above — confirm the path).
data = pd.read_json("---")

In [None]:
from sklearn.model_selection import train_test_split

def load_and_split(data, validation_size=0.2, random_state=42):
    """
    Split a triplet DataFrame into training and validation triplets.

    Rows with a missing anchor, positive or negative are removed first (the
    number of removed rows is printed), because such a row is not a usable
    triplet.

    Args:
        data (pd.DataFrame): Frame with 'anchor', 'positive' and 'negative' columns.
        validation_size (float): Proportion of data to use for validation.
        random_state (int): Random state for reproducible splitting.

    Returns:
        tuple: (train_triplets, val_triplets); each one is a tuple of three
        equally long lists (anchors, positives, negatives).
    """
    triplet_columns = ['anchor', 'positive', 'negative']

    complete_df = data.dropna(subset=triplet_columns).reset_index(drop=True)
    dropped = len(data) - len(complete_df)
    if dropped:
        print(f"Dropped {dropped} rows with a missing anchor, positive or negative.")

    train_df, val_df = train_test_split(complete_df, test_size=validation_size, random_state=random_state)

    def create_triplets_from_dataframe(df):
        # One list per triplet column, in (anchor, positive, negative) order.
        return tuple(df[column].tolist() for column in triplet_columns)

    train_triplets = create_triplets_from_dataframe(train_df)
    val_triplets = create_triplets_from_dataframe(val_df)

    return train_triplets, val_triplets

# Split the data and unpack both halves into their three parallel lists.
train_triplets, val_triplets = load_and_split(data)
(train_anchors, train_positives, train_negatives) = train_triplets
(val_anchors, val_positives, val_negatives) = val_triplets

print(f"Number of training triplets: {len(train_anchors)}")
print(f"Number of validation triplets: {len(val_anchors)}")

# First training triplet: anchor, positive, negative.
for first_value in (train_anchors[0], train_positives[0], train_negatives[0]):
    print(first_value)

In [None]:
%pip install scikit-learn sentence-transformers torch datasets wandb einops

In [None]:
import wandb
from sentence_transformers import SentenceTransformer, losses, InputExample, util
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
import os

# Log in to Weights & Biases.
wandb.login()

In [None]:
def evaluate_triplets(model, anchors, positives, negatives):
    """Print and return the mean anchor-positive and anchor-negative cosine similarity.

    The three sequences are encoded in three batched `model.encode` calls
    instead of one call per text.

    Args:
        model: Object whose `encode(list_of_texts, convert_to_tensor=True)`
            returns one embedding row per text (e.g. a SentenceTransformer).
        anchors, positives, negatives: Parallel sequences of texts; extra items
            in the longer sequences are ignored.

    Returns:
        tuple[float, float] | None: (mean positive similarity, mean negative
        similarity), or None when there is nothing to evaluate.
    """
    anchors, positives, negatives = list(anchors), list(positives), list(negatives)
    count = min(len(anchors), len(positives), len(negatives))
    if count == 0:
        # Avoids dividing by zero on an empty validation set.
        print("No triplets to evaluate.")
        return None

    anchor_emb = model.encode(anchors[:count], convert_to_tensor=True)
    positive_emb = model.encode(positives[:count], convert_to_tensor=True)
    negative_emb = model.encode(negatives[:count], convert_to_tensor=True)

    # Row-wise cosine similarity between each anchor and its own positive / negative.
    positive_similarities = torch.nn.functional.cosine_similarity(anchor_emb, positive_emb, dim=1)
    negative_similarities = torch.nn.functional.cosine_similarity(anchor_emb, negative_emb, dim=1)

    mean_positive = positive_similarities.mean().item()
    mean_negative = negative_similarities.mean().item()

    print(f"Mean positive similarity: {mean_positive}")
    print(f"Mean negative similarity: {mean_negative}")
    return mean_positive, mean_negative

def finetune_triplet_model_old(model_name, train_triplets, val_triplets=None,
                           epochs=1, batch_size=16, save_path='fine-tuned-triplet-model'):
    """
    Fine-tune a SentenceTransformer with triplet loss in a single fit() call
    (no per-epoch checkpoints), save it, and optionally evaluate it.

    Args:
        model_name (str): Name of the pre-trained model.
        train_triplets (tuple): Tuple of (anchors, positives, negatives) for training.
        val_triplets (tuple, optional): Tuple of (anchors, positives, negatives) for validation.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        save_path (str): Path to save the fine-tuned model.
    """
    model = SentenceTransformer(model_name)

    # One InputExample per (anchor, positive, negative) triplet.
    train_anchors, train_positives, train_negatives = train_triplets
    train_examples = [
        InputExample(texts=[anchor, positive, negative])
        for anchor, positive, negative in zip(train_anchors, train_positives, train_negatives)
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.TripletLoss(model=model)

    # Warm up over the first 10% of the batches.
    warmup_steps = len(train_dataloader) // 10
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
    )

    model.save(save_path)
    print(f"Fine-tuned model saved to {save_path}")

    if val_triplets:
        evaluate_triplets(model, *val_triplets)

In [None]:
def evaluate_triplets(model, anchors, positives, negatives):
    """Print and return the mean anchor-positive and anchor-negative cosine similarity.

    The three sequences are encoded in three batched `model.encode` calls
    instead of one call per text.

    Args:
        model: Object whose `encode(list_of_texts, convert_to_tensor=True)`
            returns one embedding row per text (e.g. a SentenceTransformer).
        anchors, positives, negatives: Parallel sequences of texts; extra items
            in the longer sequences are ignored.

    Returns:
        tuple[float, float] | None: (mean positive similarity, mean negative
        similarity), or None when there is nothing to evaluate.
    """
    anchors, positives, negatives = list(anchors), list(positives), list(negatives)
    count = min(len(anchors), len(positives), len(negatives))
    if count == 0:
        # Avoids dividing by zero on an empty validation set.
        print("No triplets to evaluate.")
        return None

    anchor_emb = model.encode(anchors[:count], convert_to_tensor=True)
    positive_emb = model.encode(positives[:count], convert_to_tensor=True)
    negative_emb = model.encode(negatives[:count], convert_to_tensor=True)

    # Row-wise cosine similarity between each anchor and its own positive / negative.
    positive_similarities = torch.nn.functional.cosine_similarity(anchor_emb, positive_emb, dim=1)
    negative_similarities = torch.nn.functional.cosine_similarity(anchor_emb, negative_emb, dim=1)

    mean_positive = positive_similarities.mean().item()
    mean_negative = negative_similarities.mean().item()

    print(f"Mean positive similarity: {mean_positive}")
    print(f"Mean negative similarity: {mean_negative}")
    return mean_positive, mean_negative

def finetune_triplet_model(model_name, train_triplets, val_triplets=None,
                           epochs=1, batch_size=16, save_path='fine-tuned', device=None):
    """
    Fine-tune a SentenceTransformer with triplet loss, saving a checkpoint after every epoch.

    When `save_path` already contains `checkpoint_epoch_<n>` directories,
    training resumes from the one with the highest number.

    Args:
        model_name (str): Name of the pre-trained model.
        train_triplets (tuple): Tuple of (anchors, positives, negatives) for training.
        val_triplets (tuple, optional): Tuple of (anchors, positives, negatives) for validation.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        save_path (str): Path to save the fine-tuned model.
        device (str, optional): Device to train on. Defaults to 'cuda' when
            available, otherwise 'cpu'.
    """
    checkpoint_prefix = 'checkpoint_epoch_'

    # Determine the starting epoch from the checkpoints already on disk.
    # Entries whose suffix is not a plain number are ignored instead of
    # crashing int().
    start_epoch = 0
    if os.path.isdir(save_path):
        epoch_numbers = [
            int(name[len(checkpoint_prefix):])
            for name in os.listdir(save_path)
            if name.startswith(checkpoint_prefix) and name[len(checkpoint_prefix):].isdigit()
        ]
        if epoch_numbers:
            start_epoch = max(epoch_numbers)
            print(f"Resuming training from epoch {start_epoch + 1}")

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Resume from the checkpoint directory itself: the root of save_path only
    # receives a model after the last epoch, so it holds nothing loadable when
    # an earlier run was interrupted.
    if start_epoch > 0:
        model_source = os.path.join(save_path, f'{checkpoint_prefix}{start_epoch}')
    else:
        model_source = model_name
    model = SentenceTransformer(model_source)
    model.to(device)

    train_anchors, train_positives, train_negatives = train_triplets

    train_data = list(zip(train_anchors, train_positives, train_negatives))
    train_examples = [InputExample(texts=list(triplet)) for triplet in train_data]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.TripletLoss(model=model)

    for epoch in range(start_epoch, epochs):
        # NOTE(review): each fit() call presumably starts a fresh optimizer and
        # warm-up schedule, so warm-up is repeated every epoch — confirm this is wanted.
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=1,  # Train for only 1 epoch at a time
            warmup_steps=len(train_dataloader) // 10,  # 10% of data for warm-up
        )

        # Save checkpoint after each epoch
        checkpoint_path = os.path.join(save_path, f'{checkpoint_prefix}{epoch + 1}')
        model.save(checkpoint_path)
        print(f"Checkpoint saved to {checkpoint_path}")

    # Save the final model
    model.save(save_path)
    print(f"Fine-tuned model saved to {save_path}")

    if val_triplets:
        val_anchors, val_positives, val_negatives = val_triplets
        evaluate_triplets(model, val_anchors, val_positives, val_negatives)

In [None]:
# Fine-tune MiniLM for 4 epochs with batch size 8.
# NOTE(review): save_path equals the model name, so once this directory exists
# SentenceTransformer('all-MiniLM-L6-v2') may resolve to the local folder
# instead of the hub model — confirm.
finetune_triplet_model('all-MiniLM-L6-v2', train_triplets, val_triplets, epochs=4, batch_size=8, save_path='all-MiniLM-L6-v2')

# Load the checkpoint written after the fourth epoch and encode two sample names.
loaded_model = SentenceTransformer('all-MiniLM-L6-v2/checkpoint_epoch_4/')
embeddings = loaded_model.encode(["Koyomi Araragi", "Araragi"])
print(embeddings)

In [None]:
# find the cosine similarity between two sentences
def cosine_similarity(model, sentence1, sentence2):
    """Encode both sentences with `model` and return `util.cos_sim` of the two embeddings."""
    first_embedding, second_embedding = model.encode([sentence1, sentence2])
    return util.cos_sim(first_embedding, second_embedding)

In [None]:
# Baseline: the pre-trained MiniLM, loaded by its full hub id.
# trust_remote_code is not passed: it permits executing code shipped with the
# model repository, which this model does not need.
miniLM = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")
evaluate_triplets(miniLM, val_anchors, val_positives, val_negatives)

In [None]:
# Fine-tuned MiniLM (checkpoint after epoch 4) on the same validation triplets.
finetuned_miniLM = SentenceTransformer("all-MiniLM-L6-v2/checkpoint_epoch_4/", device="cuda")
evaluate_triplets(finetuned_miniLM, val_anchors, val_positives, val_negatives)

In [None]:
# Baseline: the pre-trained MPNet, loaded by its full hub id.
# trust_remote_code is not passed: it permits executing code shipped with the
# model repository, which this model does not need.
mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device="cuda")
evaluate_triplets(mpnet, val_anchors, val_positives, val_negatives)

In [None]:
# Fine-tune MPNet for 4 epochs with batch size 2; checkpoints go under 'all-mpnet-base-v2/'.
finetune_triplet_model('all-mpnet-base-v2', train_triplets, val_triplets, epochs=4, batch_size=2, save_path='all-mpnet-base-v2')

In [None]:
# Fine-tuned MPNet (checkpoint after epoch 4) on the same validation triplets.
finetuned_mpnet = SentenceTransformer("all-mpnet-base-v2/checkpoint_epoch_4", device="cuda")
evaluate_triplets(finetuned_mpnet, val_anchors, val_positives, val_negatives)

In [None]:
# Spot check (fine-tuned MPNet): two-word name vs. its second word alone.
cosine_similarity(finetuned_mpnet, "Koyomi Araragi", "Araragi")

In [None]:
# Spot check (fine-tuned MPNet): a different two-word string that starts with the same word.
cosine_similarity(finetuned_mpnet, "Araragi Matriarch", "Araragi")

In [None]:
# Same spot check as for MPNet, using the fine-tuned MiniLM.
cosine_similarity(finetuned_miniLM, "Koyomi Araragi", "Araragi")

In [None]:
# Spot check (fine-tuned MiniLM): the shared word comes first in the two-word string.
cosine_similarity(finetuned_miniLM, "Araragi Karen", "Araragi")

In [None]:
# Embedding of a single string; used for the request payload in the next cell.
vec = finetuned_mpnet.encode("Koyomi Araragi")

In [None]:
import requests

# Request body pairing an id with its embedding; tolist() makes the vector JSON-serialisable.
payload = {
  'id': "Koyomi Araragi",
  'vec': vec.tolist()
}

# The upload itself is disabled here; the loop cells below post to the same endpoint.
# response = requests.post('http://localhost:5173/api/db', json=payload)

In [None]:
# Fetch all keys from the local API. The timeout prevents an unreachable
# server from blocking the kernel forever, and HTTP errors are raised instead
# of being parsed as data.
keys = requests.get('http://localhost:5173/api/data/all', timeout=30)
keys.raise_for_status()
keys.json()

In [None]:
# Upload one embedding per key. The response body is parsed once and all keys
# are encoded in a single batch instead of one encode() call per key.
all_keys = list(keys.json())
key_vectors = finetuned_mpnet.encode(all_keys)

for key, key_vector in zip(all_keys, key_vectors):
    vec = key_vector.tolist()
    # Bounded wait so a stalled server cannot hang the loop.
    response = requests.post('http://localhost:5173/api/db', json={'id': key, 'vec': vec}, timeout=30)
    print(f'key: {key}, response: {response.text}')

In [None]:
# Query the vector store with each key's own embedding (similarity threshold 0.8).
# The response body is parsed once and all keys are encoded in a single batch.
all_keys = list(keys.json())
key_vectors = finetuned_mpnet.encode(all_keys)

for key, key_vector in zip(all_keys, key_vectors):
    vec = key_vector.tolist()
    response = requests.post('http://localhost:5173/api/db/query', json={'vec': vec, 'threshold': 0.8}, timeout=30)
    # Fail with the HTTP status rather than with a JSON decoding error.
    response.raise_for_status()
    print(f'key: {key}, response: {response.json()}', end='\n\n')

In [None]:
# Query the vector store with a free-text sentence instead of a key.
# (The f-prefix was dropped: the string has no placeholders.)
vec = finetuned_mpnet.encode("Koyomi seems to assume that something worse is hidden in the darkness").tolist()
response = requests.post('http://localhost:5173/api/db/query', json={'vec': vec, 'threshold': 0.8}, timeout=30)
# Fail with the HTTP status rather than with a JSON decoding error.
response.raise_for_status()
print(response.json())

In [None]:
# The last query response converted to a list (the keys, if the body is a JSON object).
list(response.json())

In [None]:
import pandas as pd

# Scored pairs; the training cell below reads the fields 'id1', 'id2' and 'score' from each record.
similarities = pd.read_json("---")
similarities

In [None]:
# Ten random records as plain dicts, to check the record layout.
similarities.sample(10).to_dict(orient='records')

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, evaluation, losses
from torch.utils.data import DataLoader

import random

# 1. Load and prepare data
data = similarities.to_dict(orient='records')

# Scores are cast to float: CosineSimilarityLoss compares them with a cosine similarity.
examples = [InputExample(texts=[item['id1'], item['id2']], label=float(item['score'])) for item in data]

# 2. Hold out 10% of the pairs for evaluation. Evaluating on the very pairs
#    the model is trained on would only measure memorisation. The shuffle is
#    seeded so the split is reproducible.
random.Random(42).shuffle(examples)
eval_size = max(1, len(examples) // 10)
eval_examples = examples[:eval_size]
train_examples = examples[eval_size:]

# 3. Create evaluator (held-out pairs only)
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples, name='similarity-eval')

# 4. Create Model
model = SentenceTransformer('all-MiniLM-L6-v2/checkpoint_epoch_4/', device='cuda')

# 5. Create Loss Function
train_loss = losses.CosineSimilarityLoss(model)

# 6. Create DataLoader (training pairs only)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# 7. Train Model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    evaluator=evaluator,
    evaluation_steps=1000
)

model.save("similarity-model_all-MiniLM-L6-v2")

# Example of using the model
embeddings = model.encode(["Nisemonogatari", "Rainy Devil"])
print(embeddings)