In [1]:
# ! pip install numpy==1.24.4 pandas==2.1.4 scikit-learn==1.2.2 jupyter


In [2]:
import os

In [3]:
import numpy as np
import pandas as pd
import sklearn


In [4]:
# import
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error


In [None]:
# Load the training table from "eligible.csv" (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer="eligible.csv")

In [6]:
# Turn the ";"-separated `interests` and `field_tags` strings into multi-hot
# vectors that share a single tag vocabulary.
def _split_tags(raw):
    """Split a ";"-separated tag string into stripped, non-empty tags.

    Non-string values (for example NaN coming from an empty CSV cell) yield an
    empty list instead of raising AttributeError on ``.split``.
    """
    if not isinstance(raw, str):
        return []
    return [tag.strip() for tag in raw.split(";") if tag.strip()]


df["interests_list"] = df["interests"].apply(_split_tags)
df["field_tags_list"] = df["field_tags"].apply(_split_tags)

# Fit the vocabulary on BOTH columns. Fitting on the interests alone makes
# `transform` drop (with only a warning) every program tag that never occurs
# as a student interest, so those programs would lose part of their encoding.
# When both columns already use the same tags the vocabulary is unchanged.
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([df["interests_list"], df["field_tags_list"]], ignore_index=True))
student_vecs = mlb.transform(df["interests_list"])
program_vecs = mlb.transform(df["field_tags_list"])


In [7]:
# Wrap the multi-hot arrays in DataFrames whose columns are the binarizer's
# classes, prefixed by the side they describe ("stu_" / "prog_").
student_df = pd.DataFrame(student_vecs, columns=mlb.classes_).add_prefix("stu_")
program_df = pd.DataFrame(program_vecs, columns=mlb.classes_).add_prefix("prog_")

# Place the indicator columns next to the original columns. `df` gets a fresh
# 0..n-1 index first so the three frames line up row by row.
df_full = pd.concat(
    objs=[df.reset_index(drop=True), student_df, program_df],
    axis="columns",
)


In [8]:
# Target column.
y = df_full["label_match"]

# Model inputs: everything except the identifiers, the raw/list forms of the
# tag columns, and the target itself.
X = df_full.drop(
    [
        "student_id",
        "program_id",
        "interests",
        "field_tags",
        "interests_list",
        "field_tags_list",
        "label_match",
    ],
    axis="columns",
)

In [9]:
# Columns that get dedicated preprocessing.
num_features = ["overall_ranking", "wight"]
cat_features = ["degree_goal", "degree_level"]

# Standardise the numeric columns, one-hot encode the categorical ones, and
# pass every remaining column of X (e.g. the stu_*/prog_* indicators) through
# unchanged.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ],
    remainder="passthrough",
)

# Preprocessing followed by a 5-nearest-neighbour regressor (Euclidean distance).
knn = Pipeline(
    [
        ("preprocess", preprocessor),
        ("model", KNeighborsRegressor(n_neighbors=5, metric="euclidean")),
    ]
)

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, predict the held-out rows, and report the error.
y_pred = knn.fit(X_train, y_train).predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))


MSE: 0.0006393419161502429


In [10]:
def recommend_for_new_student(student_id, interests, degree_goal, df_programs, df_train, model, top_k=5):
    """Score every program for one new student and return the ``top_k`` best.

    One candidate row is built per row of ``df_programs`` by attaching the
    student's attributes and rebuilding the ``stu_*`` / ``prog_*`` indicator
    columns listed in ``model.feature_names_in_``.

    Parameters
    ----------
    student_id
        Identifier written to the ``student_id`` column of every candidate row.
    interests : str
        ";"-separated interest tags; whitespace around each tag is ignored.
    degree_goal
        Value written to the ``degree_goal`` column of every candidate row.
    df_programs : pandas.DataFrame
        Program table. Must contain ``program_id``, ``field_tags`` and
        ``degree_level``. It is copied, never modified.
    df_train
        Unused; kept so existing callers keep working.
    model
        Fitted estimator exposing ``feature_names_in_`` and ``predict``.
    top_k : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``program_id``, ``field_tags``, ``degree_level`` and
        ``predicted_score``, sorted by ``predicted_score`` descending and
        truncated to ``top_k`` rows.
    """
    import warnings

    def _split(raw):
        # Same tokenisation as the training cell: split on ";" and strip.
        # Non-string values (e.g. NaN) mean "no tags".
        return [part.strip() for part in raw.split(";")] if isinstance(raw, str) else []

    student_interests = _split(interests)
    interest_set = set(student_interests)

    new_student_data = df_programs.copy()
    new_student_data["student_id"] = student_id
    new_student_data["interests"] = interests
    new_student_data["degree_goal"] = degree_goal
    new_student_data["interests_list"] = [student_interests] * len(new_student_data)
    new_student_data["field_tags_list"] = new_student_data["field_tags"].apply(_split)

    all_feature_cols = list(model.feature_names_in_)
    program_tag_sets = [set(tags) for tags in new_student_data["field_tags_list"]]

    for col in all_feature_cols:
        if col.startswith("prog_"):
            # Exact tag membership. A substring test on the raw string would
            # make "Art" fire for "Artificial Intelligence".
            tag = col[len("prog_"):]
            new_student_data[col] = [int(tag in tags) for tags in program_tag_sets]
        elif col.startswith("stu_"):
            tag = col[len("stu_"):]
            new_student_data[col] = int(tag in interest_set)

    # Features the model expects but the candidate rows cannot supply are
    # filled with 0 so prediction still runs; warn because this can distort
    # the scores.
    missing = [col for col in all_feature_cols if col not in new_student_data.columns]
    if missing:
        warnings.warn(
            f"Feature columns missing from the candidate rows were filled with 0: {missing}",
            stacklevel=2,
        )
        for col in missing:
            new_student_data[col] = 0

    X_new = new_student_data[all_feature_cols]
    new_student_data["predicted_score"] = model.predict(X_new)

    ranked = new_student_data[["program_id", "field_tags", "degree_level", "predicted_score"]]
    return ranked.sort_values("predicted_score", ascending=False).head(top_k)


In [11]:
# Program catalogue to rank (path is relative to the notebook's working directory).
programs_df = pd.read_csv(filepath_or_buffer="programs.csv")

# Example query: top 5 programs for a new student interested in "Business"
# who is aiming for a Master degree.
recommend_for_new_student(
    1001,
    "Business",  # alternative input: "Information Technology;Renewable Energy"
    "Master",
    programs_df,
    df_full,
    knn,
    top_k=5,
)



Unnamed: 0,program_id,field_tags,degree_level,predicted_score
735,736,Business;Environmental Science;Architecture,Bachelor,0.412
79,80,Law;Business;Artificial Intelligence,Diploma,0.406
702,703,Environmental Science;Public Health;Business,Master,0.404
744,745,Engineering;Finance;Education,Master,0.402
709,710,Engineering;Marketing;Education,Master,0.402
