# Model Training Sentence Embedding

In [1]:
from helpers.functs.StudentProfile import StudentProfile
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from helpers.functs.NLP import soft_nlp, hard_nlp
from typing import Iterable, Set, Tuple
import pandas as pd
import numpy as np
import ast

# Load dataset
df = pd.read_csv('../Data/Cleaned/cleaned_dataset_soft-NLP.csv')

# Loading uncleaned dataset for feedback names, etc. that have not seen NLP for user friendliness
raw_df = pd.read_csv('../Data/Raw/Uitgebreide_VKM_dataset.csv')

  from .autonotebook import tqdm as notebook_tqdm


Some of the first steps of preparing the data will be the same as we did in the training of the BOW model. This is why we won't explain all those steps again in this notebook.

## 0. Mocking a student profile (Copy of 3.1)

In [2]:
student = StudentProfile(
    current_study= "Kunst & Onderzoek",
    interests=[
        "Tekening",
        "Animatie",
        "Kunst",
        "Artistiek",
        "Het vermaken van mensen. Via zingen, dansen, toneel. Graag op het podium. "
    ],
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    learning_goals=["Carrière groei", "Sociale vaardigheden", "Zelfverzekerheid", "Vermaken"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
)

matching_models = [388, 392, 191, 385, 386, 379, 389, 377, 233]

Creating a filtered dataset. Copy of the dataset used for comparison later

In [3]:
# Create filtered module and save. The filtered one won't be used by TF-IDF because that would create bias. (Smaller amount of modules compared > easier higher scores)
filtered_df = df.copy()

# Helper to normalize the list-like location strings such as "['Den Bosch', 'Tilburg']"
def normalize_locations(series):
    def _to_list(val):
        try:
            parsed = ast.literal_eval(str(val))
            if isinstance(parsed, list):
                return [str(x).strip().lower() for x in parsed]
            return [str(parsed).strip().lower()]
        except Exception:
            return [str(val).strip().lower()]
    return series.apply(_to_list)

# --- 1. Study credits range ---
if hasattr(student, "wanted_study_credit_range") and student.wanted_study_credit_range is not None:
    min_cred, max_cred = student.wanted_study_credit_range
    filtered_df = filtered_df[(filtered_df["studycredit"] >= min_cred) & (filtered_df["studycredit"] <= max_cred)]

# --- 2. Location preference ---
if hasattr(student, "location_preference") and student.location_preference:
    all_locs_filtered = normalize_locations(filtered_df["location"])
    loc_prefs_norm = [str(x).strip().lower() for x in student.location_preference]
    loc_mask = all_locs_filtered.apply(lambda lst: any(x in loc_prefs_norm for x in lst))
    filtered_df = filtered_df[loc_mask]

# --- 3. Language of the module vs preferred language of the student ---
# Pretty complicated to include and won't be of any use anyways since tf-idf won't be able to link interests written in difference language than de modules

# --- 4. Level preference (e.g. NLQF levels) ---
if hasattr(student, "level_preference") and student.level_preference:
    level_prefs = [str(x).strip().lower() for x in student.level_preference]
    filtered_df = filtered_df[filtered_df["level"].astype(str).str.lower().isin(level_prefs)]

# --- 5. Availability > 0 ---
filtered_df = filtered_df[filtered_df["available_spots"] > 0]

print(f"Original number of modules: {len(df)}")
print(f"Number of modules after filtering: {len(filtered_df)}")

Original number of modules: 211
Number of modules after filtering: 211


# 1. Combining Relevant Text Columns of Modules Dataset

In [4]:
# Combine relevant text columns 
big_string = (
    df["name"].fillna("") + " " +
    df["description"].fillna("") + " " +
    df["learningoutcomes"].fillna("") + " " +
    df["module_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
)

stringified_df = pd.DataFrame({
    "id": df["id"],
    "text": big_string
})

stringified_df.head()

Unnamed: 0,id,text
0,159,Kennismaking met Psychologie In deze module le...
1,160,Learning and working abroad Studenten kiezen b...
2,161,Proactieve zorgplanning Het Jeroen Bosch zieke...
3,162,Rouw en verlies In deze module wordt stil gest...
4,163,Acuut complexe zorg In deze module kunnen stud...


## 2. Vectorizing dataset
This time we'll be using sentence embedding for our vectorization. We selected SBERT with the multilingual model 'all-MiniLM-L6-v2' to handle both Dutch and English inputs and content. This model provides strong semantic understanding, allowing it to capture the meaning of student profiles and course descriptions very well. It also embeds both languages into the same vector space, enabling accurate cross-language comparisons.

In [5]:
# Loading sentence model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode big_df text with sentence embeddings
big_df_embeddings = model.encode(stringified_df["text"].tolist(), show_progress_bar=True)
big_df_embeddings = np.array(big_df_embeddings)
print(big_df_embeddings.shape)

Batches: 100%|██████████| 7/7 [00:07<00:00,  1.09s/it]


(211, 384)

In [7]:
sentence_embedded_dataframe = pd.DataFrame({
    "id": stringified_df["id"],
    "sentence_embedding_vector": list(big_df_embeddings)  
})
sentence_embedded_dataframe.head()

Unnamed: 0,id,sentence_embedding_vector
0,159,"[-0.04414652, 0.10954968, -0.01705466, -0.0051..."
1,160,"[0.024690889, 0.087618224, 0.012296621, -0.065..."
2,161,"[-0.057714682, 0.07327973, -0.051997118, -0.04..."
3,162,"[-0.0074989814, 0.058300607, 0.012031324, -0.0..."
4,163,"[-0.062754795, 0.058302496, -0.033168983, -0.0..."


## 3. Exporting Dataset
For tf-idf we just ran the whole notebook everytime since it only takes 0.8s. However now more computations are needed and it takes around 5 seconds so we decided to export it so it doesn't have to be calculated everytime. 


In [8]:
proccessed_path = '../Data/Processed/'

In [9]:
# Save the sentence embedded dataframe to Pickle and CSV
sentence_embedded_dataframe.to_pickle(proccessed_path + 'sentence_embedded_dataframe.pkl')
sentence_embedded_dataframe.to_csv(proccessed_path + 'sentence_embedded_dataframe.csv', index=False)
print('Saved dataframe with embeddings to', proccessed_path)

Saved dataframe with embeddings to ../Data/Processed/


## 3.1 Importing the Exported Dataset
After performing sentence embedding the final shape was (211, 384)

In [10]:
# Loading previously saved sentence embedded dataframe
embedded_modules = pd.read_csv(proccessed_path + 'sentence_embedded_dataframe.csv')

# Converting stringified numpy arrays back to actual numpy arrays
embedded_modules["sentence_embedding_vector"] = embedded_modules["sentence_embedding_vector"].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x)
)
embedded_modules.head()

Unnamed: 0,id,sentence_embedding_vector
0,159,"[-0.0441465192, 0.109549679, -0.0170546602, -0..."
1,160,"[0.0246908888, 0.0876182243, 0.0122966208, -0...."
2,161,"[-0.0577146821, 0.073279731, -0.0519971177, -0..."
3,162,"[-0.00749898143, 0.0583006069, 0.0120313242, -..."
4,163,"[-0.062754795, 0.0583024956, -0.0331689827, -0..."


# 4. Stringifying Student input and then Soft NLP

In [11]:
# Making the student interests, goals, and preferences into a single string
studentInterests = student.to_text()

# Applying soft NLP preprocessing
student_softNLP = soft_nlp(studentInterests)
print(student_softNLP)

Tekening Animatie Kunst Artistiek Het vermaken van mensen. Via zingen, dansen, toneel. Graag op het podium.  Carrière groei Sociale vaardigheden Zelfverzekerheid Vermaken Kunst & Onderzoek


## 5. Vectorization Student Input
Instead of using TF-IDF like we did earlier, now we will use 'all-MiniLM-L6-v2' on the student input.

In [None]:
# Encode student input with sentence embeddings
embedded_student_input = model.encode(student_softNLP, show_progress_bar=True)
print(embedded_student_input.shape)

Batches: 100%|██████████| 1/1 [00:00<00:00, 32.87it/s]

(384,)





## 6. Running cosine similarity on both matrixes
We have now vectorized both the student input and our modules data. Now we are able to run cosine similarity between them and look at the recommendations.

In [13]:
# Build a matrix of all module embeddings (n_modules, embedding_dim). This makes it easier to compute similarity with student embeddings!
module_matrix = np.stack(embedded_modules["sentence_embedding_vector"].values)

# Ensure student embedding is a 2D array (1, dim)
student_vec = embedded_student_input
if isinstance(student_vec, list):
    student_vec = np.array(student_vec)
if student_vec.ndim == 1:
    student_vec = student_vec.reshape(1, -1)

# Cosine similarity between student and all modules
scores = cosine_similarity(student_vec, module_matrix)[0]

# Only consider modules that pass filtering (e.g., location, credits, level)
candidate_ids = set(filtered_df["id"].tolist())
candidate_mask = embedded_modules["id"].isin(candidate_ids)
scores_candidates = scores[candidate_mask.values]
idx_candidates = np.where(candidate_mask.values)[0]

# Sort candidates by similarity score (descending) and pick top-5
order = np.argsort(-scores_candidates)[:5]
top_idx = idx_candidates[order]

# Map to module ids
module_ids = embedded_modules.iloc[top_idx]["id"].values

# Lookup module names for display in raw dataframe
module_names = []
for mid in module_ids:
    row_match = raw_df[raw_df["id"] == mid]
    if not row_match.empty and "name" in row_match.columns:
        module_names.append(row_match.iloc[0]["name"])
    else:
        module_names.append("")

# Create final recommendation dataframe
recs = pd.DataFrame({
    "rank": list(range(1, len(top_idx) + 1)),
    "module_id": module_ids,
    "module_name": module_names,
    "score": scores_candidates[order],
})

recs = recs.reset_index(drop=True)

# Display recommendations
print("Top 5 recommendations for the current student:")
recs

Top 5 recommendations for the current student:


Unnamed: 0,rank,module_id,module_name,score
0,1,191,De Kracht van de kunsten,0.641133
1,2,379,Creative AI,0.63715
2,3,389,Performance in Art,0.625273
3,4,391,Art & Humanity (nieuwe naam formuleren),0.605084
4,5,377,Art & Activisme,0.592945


## 7. Precision@k


In [None]:
def compute_precision_at_k(
    recs: pd.DataFrame,
    matching_models: Iterable[int],
    k: int = 5,
    id_column: str = "module_id",
) -> Tuple[float, int, int, Set[int], list]:


    if id_column not in recs.columns:
        raise KeyError(f"Column '{id_column}' not found in recommendations DataFrame.")

    relevant_ids: Set[int] = set(matching_models)
    recommended_ids = recs[id_column].tolist()

    k_used = min(k, len(recommended_ids))
    top_k_ids = recommended_ids[:k_used]

    hits = sum(1 for mid in top_k_ids if mid in relevant_ids)
    precision_at_k = hits / k_used if k_used > 0 else 0.0

    return precision_at_k, hits, k_used, relevant_ids, top_k_ids

In [15]:
precision_at_k, hits, k_used, relevant_ids, top_k_ids = compute_precision_at_k(
    recs=recs,
    matching_models=matching_models,  # your self-made ground truth
    k=5,
    id_column="module_id",
)

print(f"Relevant module IDs (ground truth): {sorted(relevant_ids)}")
print(f"Top-{k_used} recommended IDs: {top_k_ids}")
print(f"Hits in top-{k_used}: {hits}")
print(f"precision@{k_used}: {precision_at_k:.3f}")

Relevant module IDs (ground truth): [191, 233, 377, 379, 385, 386, 388, 389, 392]
Top-5 recommended IDs: [191, 379, 389, 391, 377]
Hits in top-5: 4
precision@5: 0.800


## 8. Motivation
The model needs to be able to motivate it's choices. We will add a new column where the model is gonna motivate itself.

In [16]:
from helpers.functs.motivation_se import add_motivation_column_se

student_profile_text = student.to_text()  # mens-vriendelijke versie

recs_with_motivation = add_motivation_column_se(
    recs=recs,
    student_profile_text=student_profile_text,
    preferred_language=getattr(student, "preferred_language", "NL"),
    raw_df=raw_df,           # voor module-naam/omschrijving
    model=model,             # je al geladen SentenceTransformer
)

recs = recs_with_motivation

old_width = pd.get_option("display.max_colwidth")
pd.set_option("display.max_colwidth", None)

display(recs)

pd.set_option("display.max_colwidth", old_width)

Unnamed: 0,rank,module_id,module_name,score,motivation_full
0,1,191,De Kracht van de kunsten,0.641133,"Op basis van jouw antwoorden past heel sterk bij wat jij leuk vindt. Vooral omdat je aangeeft: ""Tekening Animatie Kunst Artistiek Het vermaken van mensen""."
1,2,379,Creative AI,0.63715,"Op basis van jouw antwoorden past heel sterk bij wat jij leuk vindt. Vooral omdat je aangeeft: ""Tekening Animatie Kunst Artistiek Het vermaken van mensen""."
2,3,389,Performance in Art,0.625273,"Op basis van jouw antwoorden is een bijna perfecte match met jouw interesses. Dit sluit aan bij wat je vertelt: ""Tekening Animatie Kunst Artistiek Het vermaken van mensen""."
3,4,391,Art & Humanity (nieuwe naam formuleren),0.605084,"Op basis van jouw antwoorden sluit extreem goed aan bij jouw profiel. Je profiel benadrukt vooral ""Carrière groei Sociale vaardigheden Zelfverzekerheid Vermaken Kunst & Onderzoek"", wat hier goed bij past."
4,5,377,Art & Activisme,0.592945,"Art & Activisme is een sterke match met jouw interesses. Dit sluit aan bij wat je vertelt: ""Carrière groei Sociale vaardigheden Zelfverzekerheid Vermaken Kunst & Onderzoek""."
