# Model Training Sentence Embedding

In [None]:
from helpers.functs.StudentProfile import StudentProfile
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import ast

# Cleaned module dataset (the "hard-NLP" preprocessed variant).
df = pd.read_csv(
    '../Data/Cleaned/cleaned_dataset_hard-NLP.csv'
)

# Raw, uncleaned dataset: keeps names and similar fields untouched by the NLP
# preprocessing so user-facing feedback can show them in readable form.
raw_df = pd.read_csv(
    '../Data/Raw/Uitgebreide_VKM_dataset.csv'
)

Some of the first steps of preparing the data will be the same as we did in the training of the BOW model. This is why we won't explain all those steps again in this notebook.

## 0. Mocking a student profile (Copy of 3.1)

In [28]:
# Mock student profile used as the query for the recommender in this notebook.
student = StudentProfile(
    current_study="Kunst & Onderzoek",
    # Free-text interests: single keywords mixed with one full sentence.
    interests=[
        "Tekening",
        "Animatie",
        "Kunst",
        "Artistiek",
        "Het vermaken van mensen. Via zingen, dansen, toneel. Graag op het podium. "
    ],
    # (min, max) study credits; unpacked in that order by the filtering cell.
    wanted_study_credit_range=(15, 30),
    location_preference=["Den Bosch", "Breda", "Tilburg"],
    # "Carrière" was previously stored as mojibake ("CarriÃ¨re", UTF-8 bytes decoded
    # as cp1252); restored so the goal text is valid Dutch.
    learning_goals=["Carrière groei", "Sociale vaardigheden", "Zelfverzekerheid", "Vermaken"],
    level_preference=["NLQF5", "NLQF6"],
    preferred_language="NL",
)

# NOTE(review): presumably the ids of the modules expected to match this profile
# (used for evaluation later) — confirm against the cells that consume it.
matching_models = [388, 392, 191, 385, 386, 379, 389, 377, 233]

Creating a filtered dataset. Copy of the dataset used for comparison later

In [29]:
# Work on an independent copy so `df` keeps every module. Per the original note, the
# filtered frame is not fed to TF-IDF: comparing against fewer modules would bias the
# results (higher scores become easier to reach).
filtered_df = df.copy(deep=True)

# Helper to normalize the list-like location strings such as "['Den Bosch', 'Tilburg']"
def normalize_locations(series):
    """Turn every cell of ``series`` into a list of lower-cased, stripped location names.

    Parameters
    ----------
    series : pandas.Series
        Cells may hold a stringified Python container (``"['Den Bosch', 'Tilburg']"``),
        a plain location name (``"Breda"``), or a missing value (``None`` / NaN).

    Returns
    -------
    pandas.Series
        Same index as ``series``; every value is a ``list`` of ``str``.
        Missing cells yield an empty list.
    """
    def _to_list(val):
        # Missing cells carry no location. Without this guard they were stringified
        # into the fake location 'nan' / 'none'. (`val != val` is the NaN check.)
        if val is None or (isinstance(val, float) and val != val):
            return []
        try:
            parsed = ast.literal_eval(str(val))
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (e.g. the bare name "Breda"): treat the whole
            # cell as a single location. These are the errors literal_eval documents.
            return [str(val).strip().lower()]
        # Any literal container is split into its elements, not only lists.
        if isinstance(parsed, (list, tuple, set)):
            return [str(item).strip().lower() for item in parsed]
        # Scalar literal (number, quoted string, ...): single-element list.
        return [str(parsed).strip().lower()]
    return series.apply(_to_list)

# --- 1. Study credits range ---
# Keep modules whose credits fall inside the student's (min, max) range, bounds included.
credit_range = getattr(student, "wanted_study_credit_range", None)
if credit_range is not None:
    lowest, highest = credit_range
    within_range = filtered_df["studycredit"].between(lowest, highest)
    filtered_df = filtered_df[within_range]

# --- 2. Location preference ---
# A module is kept when at least one of its locations is among the preferred ones
# (compared case-insensitively, surrounding whitespace ignored).
preferred_locations = getattr(student, "location_preference", None)
if preferred_locations:
    wanted_locations = [str(loc).strip().lower() for loc in preferred_locations]
    module_locations = normalize_locations(filtered_df["location"])
    location_hit = module_locations.apply(
        lambda locs: any(loc in wanted_locations for loc in locs)
    )
    filtered_df = filtered_df[location_hit]

# --- 3. Language of the module vs preferred language of the student ---
# Deliberately not implemented: the original note calls it complicated and of no use,
# since tf-idf cannot link interests to modules written in a different language.

# --- 4. Level preference (e.g. NLQF levels) ---
preferred_levels = getattr(student, "level_preference", None)
if preferred_levels:
    wanted_levels = [str(lvl).strip().lower() for lvl in preferred_levels]
    module_levels = filtered_df["level"].astype(str).str.lower()
    filtered_df = filtered_df[module_levels.isin(wanted_levels)]

# --- 5. Availability > 0 ---
has_free_spot = filtered_df["available_spots"].gt(0)
filtered_df = filtered_df[has_free_spot]

print(f"Original number of modules: {len(df)}")
print(f"Number of modules after filtering: {len(filtered_df)}")

Original number of modules: 211
Number of modules after filtering: 211


## 1. Combining Relevant Text Columns of Modules Dataset

In [None]:
# NOTE: this whole cell is disabled (every line is commented out); the table recorded
# below it presumably stems from an earlier run.
# NOTE(review): `module_tags` is only joined when the value is a Python list. A column
# loaded with `pd.read_csv` normally holds strings, in which case the tags would be
# replaced by "" — confirm before re-enabling.
# # Combine relevant text columns 
# big_string = (
#     df["name"].fillna("") + " " +
#     df["description"].fillna("") + " " +
#     df["learningoutcomes"].fillna("") + " " +
#     df["module_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
# )

# stringified_df = pd.DataFrame({
#     "id": df["id"],
#     "text": big_string
# })

# stringified_df.head()

Unnamed: 0,id,text
0,159,kennismak psychologi modul ler gedrag jezelf a...
1,160,learn work abroad student kiez binn stam oplei...
2,161,proactiev zorgplann jeroen bosch ziekenhuis gr...
3,162,rouw verlies modul stil gestan rouw verlies va...
4,163,acuut complex zorg modul student verdiep acut ...


## 2. Vectorizing dataset
This time we'll use sentence embeddings for our vectorization. We selected SBERT with the multilingual model paraphrase-multilingual-mpnet-base-v2 (which produces 768-dimensional vectors) to handle both Dutch and English inputs and content. This model provides strong semantic understanding, allowing it to capture the meaning of student profiles and course descriptions well. It also embeds both languages into the same vector space, enabling accurate cross-language comparisons.

In [None]:
# NOTE: disabled cell, kept for reference. Re-enable it (together with the encode and
# export cells) to regenerate the embeddings.
# # Loading sentence model
# model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [None]:
# NOTE: disabled cell; it depends on `model` and `stringified_df` from the (also
# disabled) cells above.
# NOTE(review): `np.array(...)` is presumably redundant if `model.encode` already
# returns a NumPy array — confirm against the sentence-transformers version in use.
# # Encode big_df text with sentence embeddings
# big_df_embeddings = model.encode(stringified_df["text"].tolist(), show_progress_bar=True)
# big_df_embeddings = np.array(big_df_embeddings)
# big_df_embeddings.shape

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

(211, 768)

In [None]:
# NOTE: disabled cell. Pairs each module id with its embedding; `list(...)` splits the
# embedding matrix along its first axis so that each cell holds one row.
# sentence_embedded_dataframe = pd.DataFrame({
#     "id": stringified_df["id"],
#     "sentence_embedding_vector": list(big_df_embeddings)  
# })
# sentence_embedded_dataframe.head()


Unnamed: 0,id,sentence_embedding_vector
0,159,"[-0.025432192, -0.09378667, -0.0148883555, -0...."
1,160,"[-0.120328344, -0.13125975, -0.009022686, 0.02..."
2,161,"[-0.06430991, -0.10150048, -0.012131711, -0.05..."
3,162,"[0.047558933, -0.016387764, -0.017794589, -0.0..."
4,163,"[-0.06947176, -0.22489156, -0.013585485, -0.03..."


## 3. Exporting Dataset
For TF-IDF we simply reran the whole notebook every time, since it only takes 0.8 seconds. Sentence embedding needs more computation and takes around 5 seconds, so we export the result so that it doesn't have to be recalculated on every run.


In [None]:
# NOTE: disabled cell; writes the id/embedding frame to disk so later runs can load it.
# NOTE(review): if each `sentence_embedding_vector` cell is a NumPy array, `to_csv`
# presumably stores its string form (space-separated, possibly with line breaks), which
# `ast.literal_eval` cannot parse back — verify how the file is read, or convert the
# vectors with `.tolist()` / use a binary format.
# output_csv = '../Data/Vectorized/sentence_embedded_dataframe.csv'
# sentence_embedded_dataframe.to_csv(output_csv, index=False)
# print('Saved dataframe to', output_csv)


Saved dataframe to ../Data/Vectorized/sentence_embedded_dataframe.csv
