Libs & Files

In [1]:
import pandas as pd

In [2]:
students_df = pd.read_csv("students.csv")
projects_df = pd.read_csv("projects.csv")

In [3]:
students_df.head()

Unnamed: 0,student_id,name,year,gpa,skills_text,current_assignments,availability
0,S001,Somchai,4,3.45,"Python, Django, REST API, Machine Learning, Re...",1,high
1,S002,Suriya,3,3.8,"React, HTML, CSS, UX Design, Frontend Development",0,high
2,S003,Aom,2,3.2,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0,medium
3,S004,Nok,4,3.6,"Python, Django, Authentication, REST, Software...",4,low
4,S005,Kai,3,3.1,"Java, Spring Boot, SQL, Backend Development",2,medium


In [4]:
projects_df.head()

Unnamed: 0,project_id,title,description,required_skills_text,quota,min_year,min_score,difficulty
0,P001,NLP Chatbot,Build an intent classification chatbot for cus...,"Python, NLP, Text Classification, Embeddings",2,3,0.6,hard
1,P002,Django Web App,Develop a Django web application with authenti...,"Python, Django, REST API, Authentication",1,2,0.65,medium
2,P003,Frontend Dashboard,Redesign analytics dashboard UI,"React, Frontend Development, UX",2,2,0.55,easy
3,P004,Data Pipeline,Create ETL pipeline for analytics data,"Python, Data Engineering, SQL, ETL",1,3,0.6,medium
4,P005,MLOps Infrastructure,Set up CI/CD and deployment for ML models,"MLOps, Docker, Kubernetes, CI/CD",1,4,0.7,hard


Embedding Similar skill

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def embed_text(text: str):
    return model.encode(text, normalize_embeddings=True)

students_df["embedding"] = students_df["skills_text"].apply(embed_text)
projects_df["embedding"] = projects_df["required_skills_text"].apply(embed_text)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def cosine_sim(vec1, vec2):
    return float(cosine_similarity([vec1], [vec2])[0][0])

In [8]:
def compute_coverage(student_skills, project_skills, threshold=0.5):
    student_list = [s.strip() for s in student_skills.split(",")]
    project_list = [s.strip() for s in project_skills.split(",")]

    student_embs = [embed_text(s) for s in student_list]
    project_embs = [embed_text(p) for p in project_list]

    matched = 0
    for p_emb in project_embs:
        sims = [cosine_sim(s_emb, p_emb) for s_emb in student_embs]
        if max(sims) >= threshold:
            matched += 1

    return matched / len(project_list)


In [9]:
a = students_df['embedding'].loc[3]

In [10]:
b = projects_df['embedding'].loc[1]

In [11]:
score = cosine_sim(a, b)

In [12]:
score

0.8857206106185913

In [13]:
def rank_students_for_project(project_row, students_df):
    results = []

    for _, student in students_df.iterrows():
        score = cosine_sim(student["embedding"], project_row["embedding"])
        coverage = compute_coverage(
            student["skills_text"],
            project_row["required_skills_text"]
        )

        results.append({
            "student_id": student["student_id"],
            "student_name": student["name"],
            "student_skills_text": student["skills_text"],
            "score": round(score, 3),
            "coverage": round(coverage, 2),
            "current_assignments": student["current_assignments"]
        })

    return (
        pd.DataFrame(results)
        .sort_values("score", ascending=False)
        .reset_index(drop=True)
    )


In [14]:
project = projects_df.loc[projects_df["project_id"] == "P001"].iloc[0]
matches = rank_students_for_project(project, students_df)

matches

Unnamed: 0,student_id,student_name,student_skills_text,score,coverage,current_assignments
0,S006,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.917,1.0,1
1,S001,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.432,0.5,1
2,S003,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.38,0.25,0
3,S008,May,"Data Analysis, Pandas, Visualization, Statistics",0.34,0.0,0
4,S004,Nok,"Python, Django, Authentication, REST, Software...",0.302,0.25,4
5,S002,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.121,0.0,0
6,S007,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.062,0.0,3
7,S005,Kai,"Java, Spring Boot, SQL, Backend Development",0.031,0.0,2


In [15]:
project

project_id                                                           P001
title                                                         NLP Chatbot
description             Build an intent classification chatbot for cus...
required_skills_text         Python, NLP, Text Classification, Embeddings
quota                                                                   2
min_year                                                                3
min_score                                                             0.6
difficulty                                                           hard
embedding               [-0.0489649, -0.088755906, 0.019178802, 0.0129...
Name: 0, dtype: object

### Constraints & Result

In [16]:
MAX_PROJECTS = 5

In [17]:
def constraints(match_row, project_row):
    if match_row["score"] < project_row["min_score"]:
        return "REJECT", ["Semantic match score below threshold"]

    if match_row["coverage"] < 0.5:
        return "UPSKILL", ["Insufficient skill coverage"]

    return "ACCEPT", ["Score and coverage meet project requirements"]


In [18]:
def progressive_match(project_row, ranked_df, initial_k=5, step_k=3):
    quota = project_row["quota"]
    evaluated = []
    accepted_count = 0

    k = initial_k
    seen = set()

    while accepted_count < quota and k <= len(ranked_df):
        batch = ranked_df.iloc[:k]

        for _, row in batch.iterrows():
            if row["student_id"] in seen:
                continue
            seen.add(row["student_id"])

            # Hard constraint (capacity)
            if row["current_assignments"] >= MAX_PROJECTS:
                decision, rationale = "REJECT", ["Student reached max project limit"]
            else:
                decision, rationale = constraints(row, project_row)

            evaluated.append({
                **row.to_dict(),
                "decision": decision,
                "rationale": "; ".join(rationale)
            })

            if decision == "ACCEPT":
                accepted_count += 1
                if accepted_count == quota:
                    break

        k += step_k

    return pd.DataFrame(evaluated)


In [19]:
decisions = []

In [20]:
def enforce_quota(decision_df, project_row):
    quota = project_row["quota"]

    accepted = (
        decision_df[decision_df["decision"] == "ACCEPT"]
        .sort_values("score", ascending=False)
        .head(quota)
    )

    waitlist = (
        decision_df[decision_df["decision"] == "ACCEPT"]
        .sort_values("score", ascending=False)
        .iloc[quota:]
    )

    final_df = decision_df.copy()
    final_df.loc[waitlist.index, "decision"] = "WAITLIST"
    final_df.loc[waitlist.index, "rationale"] = "Quota exceeded; added to waitlist"

    return final_df

In [21]:
ranked_df = rank_students_for_project(project, students_df)

decision_df = progressive_match(
    project_row=project,
    ranked_df=ranked_df,
    initial_k=5,
    step_k=3
)

decision_df = enforce_quota(decision_df, project)
decision_df

Unnamed: 0,student_id,student_name,student_skills_text,score,coverage,current_assignments,decision,rationale
0,S006,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.917,1.0,1,ACCEPT,Score and coverage meet project requirements
1,S001,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.432,0.5,1,REJECT,Semantic match score below threshold
2,S003,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.38,0.25,0,REJECT,Semantic match score below threshold
3,S008,May,"Data Analysis, Pandas, Visualization, Statistics",0.34,0.0,0,REJECT,Semantic match score below threshold
4,S004,Nok,"Python, Django, Authentication, REST, Software...",0.302,0.25,4,REJECT,Semantic match score below threshold
5,S002,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.121,0.0,0,REJECT,Semantic match score below threshold
6,S007,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.062,0.0,3,REJECT,Semantic match score below threshold
7,S005,Kai,"Java, Spring Boot, SQL, Backend Development",0.031,0.0,2,REJECT,Semantic match score below threshold


In [22]:
def apply_acceptance(decision_df, students_df):
    updated_students = students_df.copy()

    accepted_ids = decision_df[
        decision_df["decision"] == "ACCEPT"
    ]["student_id"].tolist()

    updated_students.loc[
        updated_students["student_id"].isin(accepted_ids),
        "current_assignments"
    ] += 1

    return updated_students

In [23]:
students_df = apply_acceptance(decision_df, students_df)
students_df[["student_id", "current_assignments"]]

Unnamed: 0,student_id,current_assignments
0,S001,1
1,S002,0
2,S003,0
3,S004,4
4,S005,2
5,S006,2
6,S007,3
7,S008,0


In [24]:
def generate_alerts(decision_df):
    alerts = []

    for _, row in decision_df.iterrows():
        if row["decision"] in ["WAITLIST", "REJECT", "UPSKILL"]:
            alerts.append(
                f"[{row['decision']}] {row['student_id']}: {row['rationale']}"
            )
    return alerts

alerts = generate_alerts(decision_df)
for a in alerts:
    print(a)

[REJECT] S001: Semantic match score below threshold
[REJECT] S003: Semantic match score below threshold
[REJECT] S008: Semantic match score below threshold
[REJECT] S004: Semantic match score below threshold
[REJECT] S002: Semantic match score below threshold
[REJECT] S007: Semantic match score below threshold
[REJECT] S005: Semantic match score below threshold


In [25]:
from datetime import datetime
import os

RUN_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
RUN_DIR = f"logs/run_{RUN_ID}"

os.makedirs(RUN_DIR, exist_ok=True)


def persist_decisions(decision_df):
    decision_df.to_csv(
        f"{RUN_DIR}/decisions.csv",
        index=False
    )
    
persist_decisions(decision_df)



def persist_accepted(decision_df, project_row):
    accepted = decision_df[decision_df["decision"] == "ACCEPT"].copy()

    accepted["project_id"] = project_row["project_id"]
    accepted["project_title"] = project_row["title"]
    accepted["run_id"] = RUN_ID

    cols = [
        "run_id",
        "project_id",
        "project_title",
        "student_id",
        "student_name",
        "score"
    ]

    accepted[cols].to_csv(
        f"{RUN_DIR}/accepted.csv",
        index=False
    )
    
persist_accepted(decision_df, project)

def persist_alerts(alerts):
    with open(f"{RUN_DIR}/alerts.txt", "w", encoding="utf-8") as f:
        for alert in alerts:
            f.write(alert + "\n")
            
alerts = generate_alerts(decision_df)
persist_alerts(alerts)


In [26]:
from typing import List, Dict


def build_agent_summary(project_row, decision_df):
    accepted = decision_df[decision_df["decision"] == "ACCEPT"]
    rejected = decision_df[decision_df["decision"] == "REJECT"]
    upskill = decision_df[decision_df["decision"] == "UPSKILL"]


    summary = {
        "project_id": project_row["project_id"],
        "project_title": project_row["title"],
        "quota": project_row["quota"],
        "accepted_count": len(accepted),
        "accepted_students": accepted[["student_id", "student_name", "score"]].to_dict("records"),
        "rejected_students": rejected[["student_id", "rationale"]].to_dict("records"),
        "upskill_candidates": upskill[["student_id", "rationale"]].to_dict("records"),
    }
    return summary


In [27]:
agent_summary = build_agent_summary(project, decision_df)
agent_summary

{'project_id': 'P001',
 'project_title': 'NLP Chatbot',
 'quota': np.int64(2),
 'accepted_count': 1,
 'accepted_students': [{'student_id': 'S006',
   'student_name': 'Pim',
   'score': 0.917}],
 'rejected_students': [{'student_id': 'S001',
   'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S003', 'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S008', 'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S004', 'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S002', 'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S007', 'rationale': 'Semantic match score below threshold'},
  {'student_id': 'S005', 'rationale': 'Semantic match score below threshold'}],
 'upskill_candidates': []}

## LLM

In [28]:
def build_explanation_context(match_row, project_row):
    return {
        "project_title": project_row["title"],
        "project_description": project_row["description"],
        "project_required_skills": project_row["required_skills_text"],

        "student_name": match_row["student_name"],
        "student_skills": match_row["student_skills_text"],

        "semantic_score": round(match_row["score"], 3),
        "skill_coverage": round(match_row["coverage"], 2),

        "decision": match_row["decision"],
        "rule_rationale": match_row["rationale"],
    }


ollama local

In [None]:
import dspy

llm = dspy.LM(
    model="ollama/llama3.1:8b",
    temperature=0.0,
    max_tokens=200
)

dspy.settings.configure(lm=llm)

อธิบายเหตุผลการ matching

In [30]:
class ExplainDecision(dspy.Signature):
    """
    Explain why the matching agent made a decision for a student-project pair.
    The explanation must strictly follow the given facts.
    """
    context = dspy.InputField()
    explanation = dspy.OutputField(desc="2-3 bullet points")

In [31]:
class DecisionExplainer(dspy.Module):
    def __init__(self):
        super().__init__()
        self.explain = dspy.ChainOfThought(ExplainDecision)

    def forward(self, context):
        return self.explain(context=context)

In [32]:
explainer = DecisionExplainer()
explanations = []

for _, row in decision_df.iterrows():
    if row["decision"] == "ACCEPT":
        continue 

    context = build_explanation_context(row, project)
    result = explainer(context)

    explanations.append({
        "student_id": row["student_id"],
        "decision": row["decision"],
        "explanation": result.explanation
    })

In [33]:
def persist_llm_alerts(explanations):
    path = f"{RUN_DIR}/alerts.txt"
    with open(path, "w", encoding="utf-8") as f:
        for e in explanations:
            f.write(f"[{e['decision']}] Student {e['student_id']}\n")
            f.write(e["explanation"].strip() + "\n\n")
    return path


In [34]:
alert_path = persist_llm_alerts(explanations)
print("LLM alerts saved to:", alert_path)

LLM alerts saved to: logs/run_2025-12-23_20-30-08/alerts.txt


In [35]:
class ValidateDecision(dspy.Signature):
    """
    Validate whether the decision made by the matching agent
    is consistent with the provided scores and context.
    """
    context = dspy.InputField(desc="Structured decision context")
    verdict = dspy.OutputField(
        desc="One of: OK, SUSPICIOUS"
    )
    comment = dspy.OutputField(
        desc="Short reason explaining the verdict (1-2 sentences)"
    )

In [36]:
class DecisionValidator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.validate = dspy.ChainOfThought(ValidateDecision)

    def forward(self, context):
        return self.validate(context=context)


In [37]:
validator = DecisionValidator()


In [38]:
validation_results = []

for _, row in decision_df.iterrows():
    context = build_explanation_context(row, project)
    result = validator(context)

    validation_results.append({
        "student_id": row["student_id"],
        "decision": row["decision"],
        "verdict": result.verdict,
        "comment": result.comment
    })

validation_df = pd.DataFrame(validation_results)
validation_df


Unnamed: 0,student_id,decision,verdict,comment
0,S006,ACCEPT,OK,The high semantic score and complete skill cov...
1,S001,REJECT,REJECT,The student's skills do not closely align with...
2,S003,REJECT,REJECT,The student's skills do not sufficiently cover...
3,S008,REJECT,SUSPICIOUS,Semantic match score below threshold; decision...
4,S004,REJECT,REJECT,The student's skills do not sufficiently cover...
5,S002,REJECT,REJECT,The low semantic match score indicates that th...
6,S007,REJECT,REJECT,The low semantic match score indicates that th...
7,S005,REJECT,REJECT,The low semantic match score indicates that th...


In [39]:
def persist_validation_alerts(validation_df):
    suspicious = validation_df[validation_df["verdict"] == "SUSPICIOUS"]

    if suspicious.empty:
        return

    with open(f"{RUN_DIR}/validator_alerts.txt", "w", encoding="utf-8") as f:
        for _, row in suspicious.iterrows():
            f.write(
                f"[SUSPICIOUS] Student {row['student_id']} "
                f"(decision={row['decision']}): {row['comment']}\n"
            )
persist_validation_alerts(validation_df)

In [40]:
class RecommendUpskill(dspy.Signature):
    """
    Recommend skill improvements for a student based on
    the gap between student skills and project requirements.
    """
    student_skills = dspy.InputField(desc="Current student skills")
    required_skills = dspy.InputField(desc="Project required skills")
    recommendation = dspy.OutputField(
        desc="Concrete skill gap analysis and learning recommendation (2-3 bullet points)"
    )


In [41]:
class UpskillCoach(dspy.Module):
    def __init__(self):
        super().__init__()
        self.recommend = dspy.ChainOfThought(RecommendUpskill)

    def forward(self, student_skills, required_skills):
        return self.recommend(
            student_skills=student_skills,
            required_skills=required_skills
        )

In [42]:
upskill_coach = UpskillCoach()

In [43]:
upskill_results = []

for _, row in decision_df.iterrows():
    if row["decision"] not in ["UPSKILL", "REJECT"]:
        continue

    result = upskill_coach(
        student_skills=row["student_skills_text"],
        required_skills=project["required_skills_text"]
    )

    upskill_results.append({
        "student_id": row["student_id"],
        "decision": row["decision"],
        "recommendation": result.recommendation
    })

upskill_df = pd.DataFrame(upskill_results)

In [44]:
upskill_df

Unnamed: 0,student_id,decision,recommendation
0,S001,REJECT,['Improve knowledge of Natural Language Proces...
1,S003,REJECT,['Improve knowledge of Natural Language Proces...
2,S008,REJECT,* **Python Programming**: The student should...
3,S004,REJECT,* **NLP Fundamentals**: Study the basics of ...
4,S002,REJECT,* The student should learn the basics of Pytho...
5,S007,REJECT,* The student should focus on learning Python ...
6,S005,REJECT,* The student should learn Python as it is a f...


In [45]:
def persist_upskill_plans(upskill_df):
    if upskill_df.empty:
        return

    path = f"{RUN_DIR}/upskill_plans.csv"
    upskill_df.to_csv(path, index=False)
    return path


In [46]:
upskill_path = persist_upskill_plans(upskill_df)
print("Upskill plans saved to:", upskill_path)

Upskill plans saved to: logs/run_2025-12-23_20-30-08/upskill_plans.csv
