# Model Training Sentence Embedding

In [1]:
from helpers.functs.StudentProfile import StudentProfile
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from helpers.functs.NLP import soft_nlp, hard_nlp
from typing import Iterable, Set, Tuple
import pandas as pd
import numpy as np
import ast

# Soft-NLP-cleaned module dataset; the hard filters further down operate on this frame.
cleaned_dataset_path = '../Data/Cleaned/cleaned_dataset_soft-NLP.csv'
df = pd.read_csv(cleaned_dataset_path)

# Raw (uncleaned) dataset, kept so user-facing output can show the original,
# human-readable module names instead of NLP-processed text.
raw_dataset_path = '../Data/Raw/Uitgebreide_VKM_dataset.csv'
raw_df = pd.read_csv(raw_dataset_path)

Some of the first steps of preparing the data will be the same as we did in the training of the BOW model. This is why we won't explain all those steps again in this notebook.

## 0. Mocking a student profile (Copy of 3.1)

In [2]:
# Mock student profile that serves as the query for the recommender in this notebook.
student = StudentProfile(
    current_study= "Kunst & Onderzoek",
    # Free-text interests; entries may be single keywords or whole sentences.
    interests=[
        "Tekening",
        "Animatie",
        "Kunst",
        "Artistiek",
        "Het vermaken van mensen. Via zingen, dansen, toneel. Graag op het podium. "
    ],
    # Unpacked as (min, max) study credits by the filtering cell below.
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    learning_goals=["Carrière groei", "Sociale vaardigheden", "Zelfverzekerheid", "Vermaken"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
)

# Hand-picked module ids considered relevant for this student; passed to
# compute_precision_at_k as the ground truth for precision@k.
matching_models = [388, 392, 191, 385, 386, 379, 389, 377, 233]

Creating a filtered dataset. Copy of the dataset used for comparison later

In [3]:
# Build the filtered module set on a copy, so `df` keeps the full, unfiltered dataset.
# The filtered set is not used by TF-IDF, because scoring against fewer modules
# would bias the results (fewer modules compared -> higher scores are easier to get).
filtered_df = df.copy()

# Helper to normalize the list-like location strings such as "['Den Bosch', 'Tilburg']"
def normalize_locations(series):
    """Turn every cell of a location column into a list of normalized names.

    Each cell is converted to a list of stripped, lower-cased strings:

    * a string holding a Python list/tuple literal ("['Den Bosch', 'Tilburg']")
      yields one entry per element;
    * an actual list/tuple object is used as-is, element by element;
    * any other value (e.g. the plain text "Den Bosch") yields a one-element list;
    * a missing cell (None or NaN) yields an empty list, so it can never match a
      location preference.

    Parameters
    ----------
    series : pd.Series
        Column with location values.

    Returns
    -------
    pd.Series
        Series with the same index whose values are lists of normalized strings.
    """
    def _clean(item):
        return str(item).strip().lower()

    def _to_list(val):
        # Already a real collection: no string round-trip needed.
        if isinstance(val, (list, tuple)):
            return [_clean(x) for x in val]
        # Missing cell (`val != val` is only true for NaN): report "no locations"
        # instead of inventing a location literally called 'none' or 'nan'.
        if val is None or (isinstance(val, float) and val != val):
            return []
        try:
            parsed = ast.literal_eval(str(val))
        except Exception:
            # Deliberately best-effort: plain text such as "Den Bosch" is not a
            # valid Python literal and is treated as a single location name.
            return [_clean(val)]
        if isinstance(parsed, (list, tuple)):
            return [_clean(x) for x in parsed]
        return [_clean(parsed)]

    return series.apply(_to_list)

# --- 1. Study credits range ---
# Keep modules whose credits fall inside the student's (min, max) range, bounds included.
credit_range = getattr(student, "wanted_study_credit_range", None)
if credit_range is not None:
    min_cred, max_cred = credit_range
    filtered_df = filtered_df[filtered_df["studycredit"].between(min_cred, max_cred)]

# --- 2. Location preference ---
# Keep modules that are offered at one or more of the preferred locations.
preferred_locations = getattr(student, "location_preference", None)
if preferred_locations:
    all_locs_filtered = normalize_locations(filtered_df["location"])
    loc_prefs_norm = [str(pref).strip().lower() for pref in preferred_locations]
    loc_mask = all_locs_filtered.apply(
        lambda module_locs: not set(module_locs).isdisjoint(loc_prefs_norm)
    )
    filtered_df = filtered_df[loc_mask]

# --- 3. Language of the module vs preferred language of the student ---
# Not implemented: fairly complicated to include, and of little use for TF-IDF, which
# cannot link interests written in a different language than the modules.
# NOTE(review): that reasoning is about TF-IDF; this notebook uses a multilingual
# sentence-embedding model, so this decision may be worth revisiting.

# --- 4. Level preference (e.g. NLQF levels) ---
preferred_levels = getattr(student, "level_preference", None)
if preferred_levels:
    level_prefs = [str(lvl).strip().lower() for lvl in preferred_levels]
    module_levels = filtered_df["level"].astype(str).str.lower()
    filtered_df = filtered_df[module_levels.isin(level_prefs)]

# --- 5. Availability > 0 ---
filtered_df = filtered_df[filtered_df["available_spots"].gt(0)]

print(f"Original number of modules: {len(df)}")
print(f"Number of modules after filtering: {len(filtered_df)}")

Original number of modules: 211
Number of modules after filtering: 211


## 1. Combining Relevant Text Columns of Modules Dataset

In [4]:
# # Combine relevant text columns 
# big_string = (
#     df["name"].fillna("") + " " +
#     df["description"].fillna("") + " " +
#     df["learningoutcomes"].fillna("") + " " +
#     df["module_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
# )

# stringified_df = pd.DataFrame({
#     "id": df["id"],
#     "text": big_string
# })

# stringified_df.head()

## 2. Vectorizing dataset
This time we'll be using sentence embedding for our vectorization. We selected SBERT with the multilingual model paraphrase-multilingual-mpnet-base-v2 to handle both Dutch and English inputs and content. This model provides strong semantic understanding, allowing it to capture the meaning of student profiles and course descriptions very well. It also embeds both languages into the same vector space, enabling accurate cross-language comparisons.

In [5]:
# # Loading sentence model
# model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [6]:
# # Encode big_df text with sentence embeddings
# big_df_embeddings = model.encode(stringified_df["text"].tolist(), show_progress_bar=True)
# big_df_embeddings = np.array(big_df_embeddings)
# big_df_embeddings.shape

In [7]:
# sentence_embedded_dataframe = pd.DataFrame({
#     "id": stringified_df["id"],
#     "sentence_embedding_vector": list(big_df_embeddings)  
# })
# sentence_embedded_dataframe.head()


## 3. Exporting Dataset
For TF-IDF we simply re-ran the whole notebook every time, since it only takes 0.8s. Sentence embedding needs more computation and takes around 5 seconds, so we decided to export the result so that it doesn't have to be recalculated every time.


In [8]:
# output_csv = '../Data/Vectorized/sentence_embedded_dataframe.csv'
# sentence_embedded_dataframe.to_csv(output_csv, index=False)
# print('Saved dataframe to', output_csv)


## 3.1 Importing the Exported Dataset
After performing sentence embedding the final shape was (211, 768)

In [9]:
# Load the pre-computed module embeddings written by the export step above.
embedded_modules_path = '../Data/Vectorized/sentence_embedded_dataframe.csv'
embedded_modules = pd.read_csv(embedded_modules_path)
embedded_modules.head()

Unnamed: 0,id,sentence_embedding_vector
0,159,[-2.54321918e-02 -9.37866718e-02 -1.48883555e-...
1,160,[-1.20328344e-01 -1.31259754e-01 -9.02268570e-...
2,161,[-6.43099099e-02 -1.01500481e-01 -1.21317105e-...
3,162,[ 4.75589335e-02 -1.63877644e-02 -1.77945886e-...
4,163,[-6.94717616e-02 -2.24891558e-01 -1.35854846e-...


## 4. Stringifying Student input and then Soft NLP

In [10]:
# Flatten the student profile into one text string.
studentInterests = student.to_text()
# Clean the text with soft_nlp before embedding it.
# NOTE(review): presumably the same cleaning that produced
# cleaned_dataset_soft-NLP.csv, so student and module text match - confirm.
student_softNLP = soft_nlp(studentInterests)

# Show the cleaned text that will be embedded.
student_softNLP

'tekening animatie kunst artistiek het vermaken van mensen via zingen dansen toneel graag op het podium carrière groei sociale vaardigheden zelfverzekerheid vermaken kunst onderzoek'

## 5. Vectorization Student Input
Instead of using TF-IDF like we did earlier, now we will use 'paraphrase-multilingual-mpnet-base-v2' on the student input.

In [11]:
# Loading sentence model (same model name as in the commented-out module-embedding
# cell above, so student and module vectors live in the same vector space).
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Encode the cleaned student text into a single sentence-embedding vector
embedded_student_input = model.encode(student_softNLP, show_progress_bar=True)
print(embedded_student_input.shape)
embedded_student_input

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)


array([-2.15667509e-03,  8.62051323e-02, -1.45371249e-02, -2.82322355e-02,
        9.05091017e-02,  5.97568601e-03,  2.64076274e-02, -3.64896879e-02,
       -5.46742827e-02,  1.10205524e-01, -5.98061271e-02,  2.75395848e-02,
        2.21808478e-02,  1.09577157e-01,  4.17559706e-02, -2.58725643e-01,
       -5.47130182e-02,  1.03192590e-01,  4.91523817e-02, -1.40687050e-02,
        1.13768382e-02, -6.16270155e-02, -9.92785674e-03,  6.74840137e-02,
       -1.52886987e-01, -1.33879811e-01,  5.68890646e-02,  1.70402918e-02,
        1.55644491e-01, -3.60830897e-03, -3.00376602e-02,  7.01488554e-02,
        1.45415291e-02,  1.12187073e-01,  2.64441613e-02,  4.55813967e-02,
        8.01191106e-03, -1.41162192e-02,  4.62133400e-02, -7.60136265e-03,
        7.33392835e-02,  1.67597681e-01, -4.08982933e-02,  5.87481223e-02,
       -9.86605808e-02,  1.11482762e-01,  3.88785563e-02, -8.37358385e-02,
        1.81266204e-01, -7.15325028e-03,  4.04718928e-02, -4.21043038e-02,
       -6.09739982e-02,  

## 6. Running cosine similarity on both matrices
We have now vectorized both the student input and our modules data. Now we are able to run cosine similarity between them and look at the recommendations.

In [12]:



def _parse_embedding_vector(value) -> np.ndarray:
    """Convert one stored embedding cell into a 1D float array.

    Strings are expected to look like a printed array or list, e.g.
    "[0.1 0.2 0.3]" or "[0.1, 0.2, 0.3]" (line breaks between numbers are fine).
    Any other value is passed to ``np.asarray``.

    Raises
    ------
    ValueError
        If a string contains a token that is not a number, instead of silently
        returning a truncated vector.
    """
    if isinstance(value, str):
        # Drop surrounding brackets; accept both space- and comma-separated numbers.
        tokens = value.strip().strip("[]").replace(",", " ").split()
        return np.array(tokens, dtype=float)
    return np.asarray(value, dtype=float)


def recommend_with_sentence_embeddings(
    embedded_modules: pd.DataFrame,
    embedded_student_input,
    filtered_df: pd.DataFrame,
    raw_df: pd.DataFrame,
    top_n: int = 5,
) -> pd.DataFrame:
    """Generate top-N module recommendations using sentence-embedding cosine similarity.

    Parameters
    ----------
    embedded_modules : pd.DataFrame
        DataFrame loaded from the sentence-embedding export. Must contain
        columns 'id' (module id) and 'sentence_embedding_vector' (array-like
        or its string form, space- or comma-separated).
    embedded_student_input : array-like
        Vector representation of the current student (1D or 2D). If 2D, only
        the first row is scored.
    filtered_df : pd.DataFrame
        Filtered modules DataFrame used to restrict candidate set by rules
        such as credits, location, level, availability, etc. Must contain 'id'.
    raw_df : pd.DataFrame
        Original (uncleaned) modules DataFrame used to look up human-readable
        module names by 'id'. Must contain 'id' and ideally 'name'.
    top_n : int, optional
        Maximum number of recommendations to return, by default 5. Must not be
        negative.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: 'rank', 'module_id', 'module_name', 'score',
        ordered by descending score. 'module_name' is "" when no name is known.

    Raises
    ------
    KeyError
        If a required column is missing from one of the DataFrames.
    ValueError
        If ``top_n`` is negative, ``embedded_modules`` is empty, a stored vector
        cannot be parsed, or no module remains after filtering.
    """

    if "sentence_embedding_vector" not in embedded_modules.columns:
        raise KeyError("embedded_modules must contain 'sentence_embedding_vector' column.")
    if "id" not in embedded_modules.columns:
        raise KeyError("embedded_modules must contain 'id' column.")
    if "id" not in filtered_df.columns:
        raise KeyError("filtered_df must contain 'id' column.")
    if "id" not in raw_df.columns:
        raise KeyError("raw_df must contain 'id' column.")
    # A negative value would silently slice from the end ("all but the last k").
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}.")

    # Parse stored vectors (stringified in the CSV export) without mutating the input.
    vectors = [
        _parse_embedding_vector(cell)
        for cell in embedded_modules["sentence_embedding_vector"]
    ]
    if not vectors:
        raise ValueError("embedded_modules is empty; cannot compute recommendations.")

    # Build module matrix (n_modules, dim)
    module_matrix = np.stack(vectors)

    # Ensure student embedding is a 2D array (1, dim); accepts lists, tuples, Series, ...
    student_vec = np.asarray(embedded_student_input)
    if student_vec.ndim == 1:
        student_vec = student_vec.reshape(1, -1)

    # Global cosine similarity between student and all modules
    scores_global = cosine_similarity(student_vec, module_matrix)[0]

    # Restrict to candidate set defined by filtered_df ids
    candidate_ids = set(filtered_df["id"].tolist())
    candidate_mask = embedded_modules["id"].isin(candidate_ids).to_numpy()
    idx_candidates = np.flatnonzero(candidate_mask)

    if idx_candidates.size == 0:
        raise ValueError("No modules remain after filtering; cannot compute recommendations.")

    scores_candidates = scores_global[idx_candidates]

    # Select top-N among candidates; a stable sort keeps ties in dataset order.
    order = np.argsort(-scores_candidates, kind="stable")[:top_n]
    top_idx = idx_candidates[order]

    # Map to module ids
    module_ids = embedded_modules["id"].to_numpy()[top_idx]

    # Look up module names in raw_df through a single id -> name mapping
    # (first occurrence wins when an id appears more than once).
    if "name" in raw_df.columns:
        first_rows = raw_df.drop_duplicates(subset="id", keep="first")
        name_by_id = dict(zip(first_rows["id"].tolist(), first_rows["name"].tolist()))
    else:
        name_by_id = {}

    module_names = []
    for mid in module_ids:
        name = name_by_id.get(mid, "")
        # Missing names (NaN/None) are reported as an empty string.
        module_names.append("" if pd.isna(name) else name)

    return pd.DataFrame(
        {
            "rank": list(range(1, len(top_idx) + 1)),
            "module_id": module_ids,
            "module_name": module_names,
            "score": scores_candidates[order],
        }
    )


In [13]:
# Compute and show the top-5 recommendations for the mocked student.
# Candidates are restricted to the ids present in `filtered_df`; the
# human-readable module names are looked up in `raw_df`.
recs = recommend_with_sentence_embeddings(
    embedded_modules=embedded_modules,
    embedded_student_input=embedded_student_input,
    filtered_df=filtered_df,
    raw_df=raw_df,
    top_n=5,
)

print("Top 5 recommendations for the current student:")
display(recs)

Top 5 recommendations for the current student:


Unnamed: 0,rank,module_id,module_name,score
0,1,394,Avans Innovative Studio Junior,0.54178
1,2,391,Art & Humanity (nieuwe naam formuleren),0.538507
2,3,389,Performance in Art,0.51343
3,4,379,Creative AI,0.505214
4,5,375,Animatie / Storytelling,0.498969


## 7. Precision@k


In [14]:
def compute_precision_at_k(
    recs: pd.DataFrame,
    matching_models: Iterable[int],
    k: int = 5,
    id_column: str = "module_id",
) -> Tuple[float, int, int, Set[int], list]:
    """Compute precision@k for a ranked recommendation table.

    Parameters
    ----------
    recs : pd.DataFrame
        Recommendations in rank order (best first). Must contain ``id_column``.
    matching_models : Iterable[int]
        Ids of the modules considered relevant (the ground truth).
    k : int, optional
        Number of top recommendations to evaluate, by default 5. Must not be
        negative.
    id_column : str, optional
        Name of the column in ``recs`` holding the module ids.

    Returns
    -------
    tuple
        ``(precision_at_k, hits, k_used, relevant_ids, top_k_ids)`` where
        ``k_used`` is ``k`` capped at the number of available recommendations,
        ``hits`` is the number of the first ``k_used`` ids that are relevant and
        ``precision_at_k`` is ``hits / k_used`` (``0.0`` when ``k_used`` is 0).

    Raises
    ------
    KeyError
        If ``id_column`` is not a column of ``recs``.
    ValueError
        If ``k`` is negative.
    """

    if id_column not in recs.columns:
        raise KeyError(f"Column '{id_column}' not found in recommendations DataFrame.")
    # A negative k would slice from the end and yield a negative k_used.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}.")

    relevant_ids: Set[int] = set(matching_models)
    recommended_ids = recs[id_column].tolist()

    # Never evaluate more positions than there are recommendations.
    k_used = min(k, len(recommended_ids))
    top_k_ids = recommended_ids[:k_used]

    hits = sum(mid in relevant_ids for mid in top_k_ids)
    precision_at_k = hits / k_used if k_used > 0 else 0.0

    return precision_at_k, hits, k_used, relevant_ids, top_k_ids

In [15]:
# Score the top-5 recommendations against the hand-made ground-truth list.
precision_at_k, hits, k_used, relevant_ids, top_k_ids = compute_precision_at_k(
    recs=recs,
    matching_models=matching_models,  # hand-picked relevant module ids (ground truth)
    k=5,
    id_column="module_id",
)

# Report the ground truth, what was recommended, and the resulting precision.
print(f"Relevant module IDs (ground truth): {sorted(relevant_ids)}")
print(f"Top-{k_used} recommended IDs: {top_k_ids}")
print(f"Hits in top-{k_used}: {hits}")
print(f"precision@{k_used}: {precision_at_k:.3f}")

Relevant module IDs (ground truth): [191, 233, 377, 379, 385, 386, 388, 389, 392]
Top-5 recommended IDs: [394, 391, 389, 379, 375]
Hits in top-5: 2
precision@5: 0.400


## 8. Motivation
The model needs to be able to motivate its choices. We will add a new column in which the model explains why each module was recommended.

In [16]:
# NOTE(review): project-local import kept in this cell; consider moving it to the
# import cell at the top of the notebook.
from helpers.functs.motivation_se import add_motivation_column_se

student_profile_text = student.to_text()  # human-readable version of the profile

# Add a motivation text to every recommendation (shown as the 'motivation_full'
# column in the output below).
recs_with_motivation = add_motivation_column_se(
    recs=recs,
    student_profile_text=student_profile_text,
    preferred_language=getattr(student, "preferred_language", "NL"),
    raw_df=raw_df,           # for module name/description
    model=model,             # the SentenceTransformer that is already loaded
)

recs = recs_with_motivation

# Show the full motivation text without truncation. option_context restores the
# previous column width afterwards, even if display() raises.
with pd.option_context("display.max_colwidth", None):
    display(recs)

Unnamed: 0,rank,module_id,module_name,score,motivation_full
0,1,394,Avans Innovative Studio Junior,0.54178,"Avans Innovative Studio Junior is een sterke match met jouw interesses. Je profiel benadrukt vooral ""Carrière groei Sociale vaardigheden Zelfverzekerheid Vermaken Kunst & Onderzoek"", wat hier goed bij past."
1,2,391,Art & Humanity (nieuwe naam formuleren),0.538507,"Art & Humanity (nieuwe naam formuleren) is een sterke match met jouw interesses. Je profiel benadrukt vooral ""Tekening Animatie Kunst Artistiek Het vermaken van mensen"", wat hier goed bij past."
2,3,389,Performance in Art,0.51343,"Op basis van jouw antwoorden is een sterke match met jouw interesses. Vooral omdat je aangeeft: ""Via zingen, dansen, toneel""."
3,4,379,Creative AI,0.505214,"Creative AI is een sterke match met jouw interesses. Vooral omdat je aangeeft: ""Tekening Animatie Kunst Artistiek Het vermaken van mensen""."
4,5,375,Animatie / Storytelling,0.498969,"Animatie / Storytelling sluit goed aan bij jouw profiel. Vooral omdat je aangeeft: ""Tekening Animatie Kunst Artistiek Het vermaken van mensen""."
