# Fine tuning - cosine similarity loss

In [1]:
!pip install -U sentence-transformers --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
!wget "https://raw.githubusercontent.com/CKeibel/language-alignment/main/translations.csv"

--2023-12-10 14:57:20--  https://raw.githubusercontent.com/CKeibel/language-alignment/main/translations.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1061386 (1.0M) [text/plain]
Saving to: ‘translations.csv’


2023-12-10 14:57:21 (25.4 MB/s) - ‘translations.csv’ saved [1061386/1061386]



## Dataset

In [3]:
import pandas as pd

# Read the downloaded translation pairs; the cells below use the
# "overview" (English) and "ger" (German) columns.
df = pd.read_csv(filepath_or_buffer="translations.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,overview,len,ger
0,93837,So Undercover,When the FBI hires her to go undercover at a c...,381,"Als das FBI sie anheuert, um undercover in ein..."
1,8193,Napoleon Dynamite,A listless and alienated teenager decides to h...,184,Ein lustloser und entfremdeter Teenager beschl...
2,8195,Ronin,A briefcase with undisclosed contents -- sough...,337,Ein Aktenkoffer mit unbekanntem Inhalt - gesuc...
3,5,Four Rooms,It's Ted the Bellhop's first night on the job....,237,"Es ist die erste Nacht von Ted, dem Pagen... u..."
4,8202,Æon Flux,"400 years into the future, disease has wiped o...",311,400 Jahre in der Zukunft hat eine Krankheit de...


In [4]:
from sentence_transformers import InputExample

# Pair every English overview with its German translation as one training
# example with a target similarity of 1.0. Zipping the two columns avoids
# the per-row Series that DataFrame.iterrows() builds.
# NOTE(review): every pair is labelled 1.0 and no dissimilar pairs are added;
# presumably a cosine-similarity objective can then be satisfied by mapping
# all inputs to nearly the same vector — confirm this is intended.
dataset = [
    InputExample(texts=[overview_en, overview_de], label=1.0)
    for overview_en, overview_de in zip(df["overview"], df["ger"])
]

In [5]:
from sentence_transformers import SentencesDataset, SentenceTransformer

# Pretrained checkpoint that the following cells fine-tune in place.
model = SentenceTransformer("distilbert-base-nli-mean-tokens")

# The list of InputExample objects can be handed to a torch DataLoader as-is;
# the SentencesDataset wrapper is documented as deprecated by
# sentence-transformers, so it is no longer applied. The `train_dataset` name
# is kept because the DataLoader cell below reads it.
train_dataset = dataset

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [6]:
from torch.utils.data import DataLoader
from sentence_transformers import losses

# Training objective: cosine-similarity loss bound to the model being tuned.
train_loss = losses.CosineSimilarityLoss(model=model)

# Shuffled mini-batches of 16 examples.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [7]:
import torch

# Seed torch's global RNG right before training so the shuffle order and
# dropout are repeatable across "Restart & Run All" (some GPU kernels may
# still introduce small run-to-run differences).
torch.manual_seed(42)

# Fine-tune `model` in place: 5 passes over the data, with the learning rate
# warmed up over the first 100 optimizer steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=100,
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/96 [00:00<?, ?it/s]

Iteration:   0%|          | 0/96 [00:00<?, ?it/s]

Iteration:   0%|          | 0/96 [00:00<?, ?it/s]

Iteration:   0%|          | 0/96 [00:00<?, ?it/s]

Iteration:   0%|          | 0/96 [00:00<?, ?it/s]

## Evaluation

In [14]:
import numpy as np
from numpy.linalg import norm


def cosine_similarity(v_en, v_ger) -> float:
    """Return the cosine similarity of two embedding vectors.

    Args:
        v_en: Embedding of the English text (array-like).
        v_ger: Embedding of the German text (array-like, same length).

    Returns:
        dot(v_en, v_ger) / (|v_en| * |v_ger|). If either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case
        instead of dividing by zero and yielding NaN, which would poison any
        mean computed over many pairs.
    """
    denominator = norm(v_en) * norm(v_ger)
    if denominator == 0:
        return 0.0
    return np.dot(v_en, v_ger) / denominator

def calculate_similarity(en: str, ger: str, model) -> float:
    """Embed an English and a German text with `model` and compare them.

    Args:
        en: English text.
        ger: German text.
        model: Object exposing `encode(text)` that returns an embedding vector.

    Returns:
        Cosine similarity between the two embeddings.
    """
    # Encode the English text first, then the German one.
    embeddings = [model.encode(text) for text in (en, ger)]
    return cosine_similarity(*embeddings)

In [15]:
from tqdm import tqdm

# Fine tuned
# Mean cosine similarity between each English overview and its German
# translation, embedded with the fine-tuned `model`.
# NOTE(review): `df` is the same frame the training examples were built from,
# so this is a training-set score over matching pairs only — confirm whether
# a held-out split and mismatched pairs should be scored as well.
similarities = [
    calculate_similarity(en=overview_en, ger=overview_de, model=model)
    for overview_en, overview_de in tqdm(
        zip(df["overview"], df["ger"]), total=len(df)
    )
]
np.mean(similarities)

100%|██████████| 1525/1525 [00:29<00:00, 51.35it/s]


0.99418426

In [17]:
# Original
# Baseline: reload the untouched pretrained checkpoint and score the same
# English/German pairs for comparison with the fine-tuned model.
original_model = SentenceTransformer("distilbert-base-nli-mean-tokens")

similarities = [
    calculate_similarity(en=overview_en, ger=overview_de, model=original_model)
    for overview_en, overview_de in tqdm(
        zip(df["overview"], df["ger"]), total=len(df)
    )
]
np.mean(similarities)

100%|██████████| 1525/1525 [00:30<00:00, 50.45it/s]


0.48263708