# Embedding Optimization
We will apply embedding optimization techniques to improve the quality of our embeddings for better recommendation performance. While creating the model we had a lot of options to choose from. Here we will explore some of those options and see how they affect the model performance.

In [8]:
from helpers.notebook_pipelines.yes_tuned_bow_model import run_evaluation_multi
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from helpers.functs.StudentProfile import StudentProfile
from sklearn.metrics.pairwise import cosine_similarity
from typing import Iterable, Set, Tuple, List
from IPython.display import display
import pandas as pd
import numpy as np
import torch
import ast

In [9]:
# Our mocked student profiles
def _mock_student(current_study, interests, learning_goals):
    """Create one mocked StudentProfile.

    Only the study, the interests and the learning goals differ between the
    mocked students; the remaining preferences are the same for all of them.
    Fresh list objects are built on every call so profiles never share state.
    """
    return StudentProfile(
        current_study=current_study,
        interests=interests,
        wanted_study_credit_range=(15, 30),
        location_preference=["Den Bosch", "Breda", "Tilburg"],
        learning_goals=learning_goals,
        level_preference=["NLQF5", "NLQF6"],
        preferred_language="NL",
        preferred_start_range="any",
    )


student1 = _mock_student(
    "Kunst & Onderzoek",
    interests=[
        "Tekenen",
        "Animatie",
        "Kunst",
        "Drama",
        "Ik hou ervan om mensen te vermaken. Dit doe ik het liefst door bijvoorbeeld te dansen, te zingen of toneelspelen. In het algemeen vind ik het fijn om creatief bezig te zijn. Ik ben graag onder de mensen.",
    ],
    learning_goals=[
        "Kritisch denken",
        "Sociale vaardigheden",
        "Zelfverzekerdheid",
        "Ik wil o.a. leren hoe ik betere illustraties kan maken zowel analoog als digitaal.",
    ],
)

student2 = _mock_student(
    "Informatica",
    interests=[
        "Programmeren",
        "AI",
        "Coderen",
        "Techniek",
        "Software",
        "Ik heb werken met computers en techniek van jongs af aan al interessant gevonden. Kunstmatige intelligentie is nu ook zeker iets dat me interesseert, zeker met de sterke opkomst hiervan. Zo kun je denken aan machine learning, deep learning, etc. Ik wil me vooral bezighouden met software.",
    ],
    learning_goals=[
        "Kritisch denken",
        "Technische Vaardigheden",
        "Eén van de dingen die ik wil leren is het toepassen van mijn kennis in de praktijk",
    ],
)

student3 = _mock_student(
    "Psychologie",
    interests=[
        "Mensen",
        "Emoties",
        "Gedrag",
        "Psychologie",
        "Ik wil graag weten waarom mensen bepaalde dingen doen; ik wil mensen hun gedrag kunnen begrijpen. Naast hun gedrag wil ik ook leren over, persoonlijkheden en emoties. Graag wil ik mensen hun welzijn kunnen bevorderen met het gebruik van psychologische kennis.",
    ],
    learning_goals=[
        "Kritisch denken",
        "Sociale Vaardigheden",
        "Ik wil leren hoe ik mensen en hun gedrag beter kan analyseren",
    ],
)

student4 = _mock_student(
    "Economics",
    interests=[
        "Business",
        "Entrepreneurship",
        "Strategies",
        "Branding",
        "I am a real entrepreneur. I am especially interested in how innovation, strategy, and future developments influence economic growth and organizational success",
    ],
    learning_goals=[
        "Forward thinking",
        "Social skill",
        "I want to better understand the impact of modern technologies on business models",
    ],
)

student5 = _mock_student(
    "Verpleegkunde, Mens en Techniek",
    interests=[
        "Zorg",
        "Gezondheid",
        "Medisch",
        "Verpleegkunde",
        "Ik ben erg geïnteresseerd in verpleegkunde en hoop ook later in een ziekenhuis omgeving te kunnen werken. Daarom wil ik ook meer praktische ervaring op doen passend bij mijn studie verpleegkunde met de module die ik kies.",
    ],
    learning_goals=[
        "Sociale vaardigheden",
        "Persoonlijke ontwikkeling",
        "Ik leren hoe ik beter kan anticiperen op bepaalde zorgbehoeften",
    ],
)

# Order matters: downstream cells label the profiles "student1".."student5" by position.
students = [student1, student2, student3, student4, student5]

In [10]:
def _load_module_corpus(csv_path, text_cols):
    """Load one cleaned module dataset and build a text document per module.

    csv_path: path of the cleaned CSV file to read
    text_cols: columns whose values are concatenated into the module text

    Returns (dataframe, texts, ids) where ``texts[i]`` is the space-joined,
    whitespace-normalised content of ``text_cols`` for row ``i`` and ``ids[i]``
    is that row's value in the dataset 'id' column.
    """
    frame = pd.read_csv(csv_path)
    # safely join selected columns into text (handle NaNs and non-str types)
    texts = (
        frame[text_cols]
        .fillna('')
        .astype(str)
        .agg(' '.join, axis=1)
        .str.replace(r'\s+', ' ', regex=True)
        .tolist()
    )
    # use the dataset 'id' column so we report the real module ids (not dataframe positional indices)
    ids = frame['id'].tolist()
    return frame, texts, ids


cols = ['name', 'description', 'learningoutcomes', 'module_tags']

softNLP_df, softNLP_module_text, soft_module_ids = _load_module_corpus(
    "../Data/Cleaned/cleaned_dataset_soft-NLP.csv", cols
)
hardNLP_df, hardNLP_module_text, hard_module_ids = _load_module_corpus(
    "../Data/Cleaned/cleaned_dataset_hard-NLP.csv", cols
)

# keep texts and their original ids together so we compute top-k per corpus
nlp_options = {
    "soft": (softNLP_module_text, soft_module_ids),
    "hard": (hardNLP_module_text, hard_module_ids)
}

In [11]:
# Sentence-embedding checkpoints under comparison, keyed by the short label
# that is used for them in the results tables.
model_options = {
    label: SentenceTransformer(checkpoint)
    for label, checkpoint in (
        ("Model1", "all-MiniLM-L6-v2"),
        ("Model2", "paraphrase-multilingual-MiniLM-L12-v2"),
        ("Model3", "distiluse-base-multilingual-cased-v2"),
    )
}

In [12]:
# Number of highest-scoring modules kept per student; the same value is used as k for Precision@k.
top_k: int = 5

## Variations
We will create several variations of our embedding models by changing parameters such as:
- Model architecture
- Preprocessing techniques
- Fine-tuning strategies

In [13]:
# Run the encoders on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Top-k retrieval for every (student, corpus, model) combination.
# Layout: results[student_name][nlp_name][model_name] -> [(module_id, cosine_score), ...]

# Text form of each profile, computed once per student and keyed "student1", "student2", ...
student_queries = {
    f"student{s_idx}": student.to_text()
    for s_idx, student in enumerate(students, start=1)
}

# Pre-create the nested dicts so the key order stays student -> corpus -> model.
results = {
    student_name: {nlp_name: {} for nlp_name in nlp_options}
    for student_name in student_queries
}

for nlp_name, (module_texts, module_ids) in nlp_options.items():
    for model_name, model in model_options.items():
        # The module embeddings depend only on (corpus, model), so encode them
        # once here instead of re-encoding the whole corpus for every student.
        module_vectors = model.encode(module_texts, convert_to_tensor=True, device=device)

        for student_name, query_text in student_queries.items():
            # Encode student profile
            query_vector = model.encode([query_text], convert_to_tensor=True, device=device)
            # Cosine similarity between the single query and every module
            scores = util.cos_sim(query_vector, module_vectors)[0].cpu().numpy()
            # Positions of the top-k scores, best first
            top_indices = scores.argsort()[::-1][:top_k]
            # Map positions back to the dataset module ids carried in module_ids
            results[student_name][nlp_name][model_name] = [
                (module_ids[i], float(scores[i])) for i in top_indices
            ]

In [15]:
# Print the raw top-k (module id, score) pairs, grouped by student and corpus.
for student_label, per_corpus in results.items():
    print(f"\n===== {student_label} =====")
    for corpus_label, per_model in per_corpus.items():
        print(f"\n-- {corpus_label} --")
        for model_label in per_model:
            print(f"{model_label}: {per_model[model_label]}")


===== student1 =====

-- soft --
Model1: [(191, 0.6879173517227173), (391, 0.6761540174484253), (392, 0.6016901135444641), (233, 0.6006746888160706), (386, 0.5994068384170532)]
Model2: [(191, 0.7134237289428711), (375, 0.6478080749511719), (388, 0.6065810918807983), (379, 0.6056426763534546), (391, 0.6036763191223145)]
Model3: [(191, 0.5802501440048218), (233, 0.46715909242630005), (391, 0.45081523060798645), (377, 0.4406433701515198), (387, 0.3975970447063446)]

-- hard --
Model1: [(392, 0.5892460942268372), (391, 0.576168417930603), (191, 0.5637333989143372), (393, 0.5525187253952026), (379, 0.5389760136604309)]
Model2: [(389, 0.6092264652252197), (391, 0.5959000587463379), (379, 0.5894417762756348), (386, 0.555540919303894), (233, 0.553278386592865)]
Model3: [(191, 0.44343459606170654), (391, 0.43817785382270813), (389, 0.3870799243450165), (377, 0.3850248456001282), (379, 0.3626529574394226)]

===== student2 =====

-- soft --
Model1: [(333, 0.7170849442481995), (366, 0.69605636596

## Precision@k Evaluation
We will evaluate the performance of each variation using the precision@k metric to determine which configuration yields the best results.

In [16]:
# Ground-truth relevant modules per student, keyed by the same "student<N>" labels used in `results`.
# The values are compared against the module ids in the top-k matches, so they must be
# values from the dataset 'id' column (not dataframe positions).
# NOTE(review): how these relevant modules were selected is not shown here — confirm the labelling source.
# student5 has a single relevant module, so with top_k = 5 its Precision@k cannot exceed 0.20.
ground_truth = {
    "student1": [388, 392, 191, 385, 386, 379, 389, 377, 391, 233],
    "student2": [304, 305, 312, 317, 318, 322, 321, 334, 336, 340, 333],
    "student3": [159, 290, 397, 180, 177, 208, 173, 193, 357],
    "student4": [229, 272, 280, 279, 235, 380],
    "student5": [160]
}

In [17]:
def precision_at_k(top_matches, k, relevant_set):
    """
    Precision@k: the fraction of the first k ranked matches that are relevant.

    top_matches: ranked list of (module_id, score) pairs, best match first
    k: cut-off rank, must be a positive integer
    relevant_set: set of ground truth module ids

    Returns hits / k as a float. The denominator is always k, so a ranking
    that holds fewer than k matches is scored as if the missing slots were
    irrelevant.

    Raises ValueError if k is not positive: k == 0 would divide by zero and a
    negative k would silently slice from the end and yield a negative score.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    hits = sum(1 for module_id, _ in top_matches[:k] if module_id in relevant_set)
    return hits / k

# Compute Precision@k for all students
# Layout mirrors `results`: precision_results[student][corpus][model] -> float
precision_results = {}

for student_name, student_results in results.items():
    # Look the student up in ground_truth, preferring the lowercased label and
    # falling back to the label as-is. A student without ground truth gets an
    # empty relevant set, i.e. a precision of 0 for every configuration.
    lowered_name = student_name.lower()
    gt_key = lowered_name if lowered_name in ground_truth else student_name
    relevant_set = set(ground_truth.get(gt_key, []))

    precision_results[student_name] = {
        nlp_name: {
            model_name: precision_at_k(top_matches, top_k, relevant_set)
            for model_name, top_matches in module_results.items()
        }
        for nlp_name, module_results in student_results.items()
    }

# Display: one section per student, one sub-section per corpus, one line per model
for student_label, per_corpus in precision_results.items():
    print(f"\n===== {student_label} =====")
    for corpus_label, per_model in per_corpus.items():
        print(f"\n-- {corpus_label.upper()} NLP --")
        for model_label in per_model:
            score = per_model[model_label]
            print(f"{model_label}: Precision@{top_k} = {score:.2f}")


===== student1 =====

-- SOFT NLP --
Model1: Precision@5 = 1.00
Model2: Precision@5 = 0.80
Model3: Precision@5 = 0.80

-- HARD NLP --
Model1: Precision@5 = 0.80
Model2: Precision@5 = 1.00
Model3: Precision@5 = 1.00

===== student2 =====

-- SOFT NLP --
Model1: Precision@5 = 0.40
Model2: Precision@5 = 0.60
Model3: Precision@5 = 0.60

-- HARD NLP --
Model1: Precision@5 = 0.40
Model2: Precision@5 = 0.40
Model3: Precision@5 = 0.60

===== student3 =====

-- SOFT NLP --
Model1: Precision@5 = 0.40
Model2: Precision@5 = 0.40
Model3: Precision@5 = 0.60

-- HARD NLP --
Model1: Precision@5 = 0.60
Model2: Precision@5 = 0.80
Model3: Precision@5 = 0.60

===== student4 =====

-- SOFT NLP --
Model1: Precision@5 = 0.60
Model2: Precision@5 = 0.20
Model3: Precision@5 = 0.40

-- HARD NLP --
Model1: Precision@5 = 0.00
Model2: Precision@5 = 0.20
Model3: Precision@5 = 0.40

===== student5 =====

-- SOFT NLP --
Model1: Precision@5 = 0.00
Model2: Precision@5 = 0.20
Model3: Precision@5 = 0.20

-- HARD NLP --
M