# Embedding Optimization
We will apply embedding optimization techniques to improve the quality of our embeddings for better recommendation performance. While creating the model we had a lot of options to choose from. Here we will explore some of those options and see how they affect the model performance.

In [None]:
from helpers.notebook_pipelines.yes_tuned_bow_model import run_evaluation_multi
from helpers.functs.StudentProfile import StudentProfile
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from helpers.functs.NLP import soft_nlp, hard_nlp
from typing import Iterable, Set, Tuple
from IPython.display import display
import pandas as pd
import numpy as np
import ast

# Load both pre-cleaned variants of the dataset (the file names distinguish "hard" and "soft" NLP).
# The CSVs are read in the same order as before: hard-NLP first, then soft-NLP.
hardNLP_df, softNLP_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/Cleaned/cleaned_dataset_hard-NLP.csv',
        '../Data/Cleaned/cleaned_dataset_soft-NLP.csv',
    )
)

In [None]:
# Our mocked student profiles

# Profile 1: interests centred on art, drama and performing.
# The last entry is a free-text description; the others are short keywords.
student1_interests = [
    "Tekening",
    "Animatie",
    "Kunst",
    "Artistiek",
    "Drama",
    "Het vermaken van mensen. Via zingen, dansen, toneel. Graag op het podium. Mensen betrekken bij kunst. Veel vrijheid en ruimte voor creativiteit.",
]

student1 = StudentProfile(
    current_study="Kunst & Onderzoek",
    interests=student1_interests,
    learning_goals=["Carrière groei", "Sociale vaardigheden", "Zelfverzekerheid", "Vermaken"],
    # Practical constraints (same values for every mocked profile in this cell)
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
    preferred_start_range="any",
)

# Profile 2: interests centred on programming, software and AI.
# The last entry is a free-text description; the others are short keywords.
student2_interests = [
    "Programmeren",
    "AI",
    "Coderen",
    "Techniek",
    "Software",
    "Werken met computers en techniek heb ik altijd interessant gevonden. Met de opkomst van kunstmatige intelligentie wil is dit ook iets waar ik me in wil gaan verdiepen. Zoals machine learning, reinforcement learning, etc.",
]

student2 = StudentProfile(
    current_study="Informatica",
    interests=student2_interests,
    learning_goals=["Carrière groei", "Multitasken", "Kritisch denken", "Technische Vaardigheden"],
    # Practical constraints (same values for every mocked profile in this cell)
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
    preferred_start_range="any",
)

# Profile 3: interests centred on people, behaviour and psychology.
# The last entry is a free-text description; the others are short keywords.
student3_interests = [
    "Mensen",
    "Emoties",
    "Gedrag",
    "Psychologie",
    "Waarom mensen bepaalde dingen doen. Hun gedrag, persoonlijkheid, emoties, etc. Ook het toepassen van psychologie om het welzijn van mensen te bevorderen.",
]

student3 = StudentProfile(
    current_study="Psychologie",
    interests=student3_interests,
    learning_goals=["Carrière groei", "Multitasken", "Communicatie vaardigheden", "Sociale Vaardigheden"],
    # Practical constraints (same values for every mocked profile in this cell)
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
    preferred_start_range="any",
)

# Ground-truth relevant modules per student.
# One list of module numbers per mocked profile, assembled in student1, student2, student3 order.
# NOTE(review): the numbers look like values of the dataset's `id` column — confirm.
relevant_modules_student1 = [388, 392, 191, 385, 386, 379, 389, 377, 233]
relevant_modules_student2 = [304, 305, 312, 317, 318, 322, 321, 334, 336, 340]
relevant_modules_student3 = [159, 290, 397, 180, 177]

matching_models_list = [
    relevant_modules_student1,
    relevant_modules_student2,
    relevant_modules_student3,
]

# All mocked profiles, in the same order as the per-student entries of matching_models_list.
# NOTE(review): presumably the two lists are consumed in parallel by the evaluation — confirm.
students = [
    student1,
    student2,
    student3,
]

In [5]:
# Combine relevant text columns into one text document per module.

# Bind the source frame explicitly: nothing in this notebook defines `df`, so the cell only
# worked with a leftover kernel variable and raised NameError on Restart & Run All.
# NOTE(review): the soft-NLP variant is assumed because the saved preview of this cell keeps
# casing and stop words — confirm this is the dataset the embeddings should be built from.
df = softNLP_df


def _tags_to_text(tags) -> str:
    """Return a module's tags as a single space-separated string.

    Args:
        tags: The raw ``module_tags`` cell. May be a list/tuple of tags, the
            stringified list a CSV round-trip produces (e.g. ``"['a', 'b']"``),
            a plain text string, or a missing value (NaN/None).

    Returns:
        The tags joined by single spaces; a plain (non-list) string is returned
        stripped; missing or unsupported values give ``""``.
    """
    if isinstance(tags, str):
        raw = tags.strip()
        if not raw:
            return ""
        try:
            # Columns read back from CSV hold the repr of the list, not a list object.
            tags = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the cell as free-text tags.
            return raw
    if isinstance(tags, (list, tuple)):
        return " ".join(str(tag) for tag in tags)
    if isinstance(tags, str):
        # A quoted single tag such as "'design'" parses to a plain string.
        return tags
    return ""


big_string = (
    df["name"].fillna("") + " " +
    df["description"].fillna("") + " " +
    df["learningoutcomes"].fillna("") + " " +
    df["module_tags"].apply(_tags_to_text)
)

# Keep only the module id next to its combined text.
stringified_df = pd.DataFrame({
    "id": df["id"],
    "text": big_string
})

stringified_df.head()

Unnamed: 0,id,text
0,159,Kennismaking met Psychologie In deze module le...
1,160,Learning and working abroad Studenten kiezen b...
2,161,Proactieve zorgplanning Het Jeroen Bosch zieke...
3,162,Rouw en verlies In deze module wordt stil gest...
4,163,Acuut complexe zorg In deze module kunnen stud...


## Model Variations
We will create several variations of our embedding models by changing parameters such as:
- Embedding dimensions
- Context window size
- Minimum word frequency
- Training epochs

In [None]:
# Name of the sentence-transformers checkpoint used as the embedding model.
# NOTE(review): the mocked profiles are written in Dutch — confirm this checkpoint handles Dutch well.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)