In [15]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Shared sentence-embedding model used by `embed` further down.
# NOTE(review): the original note here read "reuse from system" -- presumably this
# must stay the same model the surrounding system uses; confirm before switching.
# "all-MiniLM-L6-v2" yields 384-dimensional vectors (see the shape check below).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Alternative model, left disabled:
# embedding_model = SentenceTransformer("all-mpnet-base-v2")



In [16]:
# Load the student roster from the working directory. The preview below shows the
# columns: student_id, name, year, gpa, skills_text (comma-separated skills),
# current_assignments and max_capacity.
students_df = pd.read_csv("students.csv")
students_df.head()


Unnamed: 0,student_id,name,year,gpa,skills_text,current_assignments,max_capacity
0,S001,Somchai,4,3.45,"Python, Django, REST API, Machine Learning, Re...",1,3
1,S002,Suriya,3,3.8,"React, HTML, CSS, UX Design, Frontend Development",0,4
2,S003,Aom,2,3.2,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0,5
3,S004,Nok,4,3.6,"Python, Django, Authentication, REST, Software...",4,2
4,S005,Kai,3,3.1,"Java, Spring Boot, SQL, Backend Development",2,2


In [17]:
from dataclasses import dataclass
from typing import List

@dataclass
class ProjectRole:
    """One role inside a project specification."""
    # Role identifier (e.g. "datasci"); lower-cased when used as a map key
    # by project_spec_to_row.
    role: str
    # Stored per role in "role_quota_map"; presumably the number of students
    # wanted for this role -- confirm.
    quota: int
    # Free-text description of what the role does (e.g. "Modeling").
    responsibility: str
    # Skill names for the role; lower-cased by project_spec_to_row.
    required_skills: List[str]

@dataclass
class ProjectSpecification:
    """Structured description of a project and the roles it needs.

    Only ``roles`` is read by ``project_spec_to_row``; the remaining fields
    are carried as descriptive metadata in this notebook.
    """
    # Short free-text summary of the project.
    project_summary: str
    # Category tags, e.g. ["ML", "Data"].
    project_type: List[str]
    # Presumably the total team size across all roles -- not validated here.
    headcount: int
    # Presumably the planned duration in months -- not used in this notebook.
    duration_months: int
    # Roles to staff; each carries its own quota and required skills.
    roles: List[ProjectRole]
    # Free-text lists; empty in the example specification below.
    assumptions: List[str]
    risks: List[str]


In [None]:
# Example specification used by the rest of the notebook: a churn-prediction
# project with two roles ("datasci" and "dataen"), one seat each.
# Skill names are written in mixed case here; project_spec_to_row lower-cases them.
spec = ProjectSpecification(
    project_summary="Build ML system for churn prediction",
    project_type=["ML", "Data"],
    headcount=2,
    duration_months=3,
    roles=[
        ProjectRole(
            role="datasci",
            quota=1,
            responsibility="Modeling",
            required_skills=["python", "Machine Learning", "Django", "REST API"]
        ),
        ProjectRole(
            role="dataen",
            quota=1,
            responsibility="Pipeline",
            required_skills=["sql", "etl", "airflow"]
        ),
    ],
    assumptions=[],
    risks=[]
)

In [19]:
def project_spec_to_row(spec: "ProjectSpecification", min_score: float = 0.35) -> pd.Series:
    """Flatten a project specification into the lookup row used for matching.

    Parameters
    ----------
    spec : ProjectSpecification
        Only ``spec.roles`` is read; each role must expose ``role``,
        ``quota`` and ``required_skills``.
    min_score : float, default 0.35
        Value stored under ``"min_score"``. It used to be hard-coded; the
        default keeps the previous value.

    Returns
    -------
    pd.Series
        ``"role_skill_map"``: role name -> list of normalised skill names,
        ``"role_quota_map"``: role name -> quota,
        ``"min_score"``: the threshold passed in.

    Notes
    -----
    Role and skill names are stripped and lower-cased so that later string
    comparisons are not defeated by stray whitespace or capitalisation.
    If two roles normalise to the same name, the later one wins.
    """
    role_skill_map = {}
    role_quota_map = {}
    for project_role in spec.roles:
        role_key = project_role.role.strip().lower()
        role_skill_map[role_key] = [
            skill.strip().lower() for skill in project_role.required_skills
        ]
        role_quota_map[role_key] = project_role.quota

    return pd.Series({
        "role_skill_map": role_skill_map,
        "role_quota_map": role_quota_map,
        "min_score": min_score,
    })

# Flatten the example spec into the lookup row read by the ranking cells below.
project_row = project_spec_to_row(spec)

In [20]:
# Inspect the lower-cased skill list stored for each role.
project_row['role_skill_map']

{'datasci': ['python', 'machine learning', 'django', 'rest api'],
 'dataen': ['sql', 'etl', 'airflow']}

In [21]:
def embed(text):
    """Encode ``text`` with the notebook's shared ``embedding_model``.

    ``text`` is handed to ``embedding_model.encode`` unchanged, with
    ``normalize_embeddings=True`` requested, and the encoder's result is
    returned as-is.
    """
    vector = embedding_model.encode(text, normalize_embeddings=True)
    return vector

# Sanity check: one string embeds to a single 1-D vector (384 values with the model above).
embed("python, ml, pytorch").shape

(384,)

In [22]:
def rank_students_for_role(role, skills, students_df):
    """Rank students by embedding similarity between their skills and a role's skills.

    Parameters
    ----------
    role : str
        Role name. Not used in the computation; kept so existing callers
        keep working.
    skills : list of str
        Skills required for the role; joined with ", " and embedded as one text.
    students_df : pd.DataFrame
        Must contain ``name`` and ``skills_text`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``student``, ``skills``, ``score`` (cosine similarity rounded
        to 2 decimals), sorted by ``score`` descending. The index is each
        student's position in ``students_df``.

    Notes
    -----
    * An empty ``students_df`` returns an empty frame with the expected
      columns (sorting an empty, column-less frame used to raise KeyError).
    * Missing ``skills_text`` values are treated as "" so they can be embedded.
    * All students are embedded in one ``embed`` call instead of one call per
      row. Batched and single-text encodings can differ by float noise, which
      is far below the 2-decimal rounding applied here.
    """
    columns = ["student", "skills", "score"]
    if students_df.empty:
        return pd.DataFrame(columns=columns)

    role_emb = np.asarray(embed(", ".join(skills))).reshape(1, -1)

    skill_texts = students_df["skills_text"].fillna("").astype(str).tolist()
    student_embs = np.asarray(embed(skill_texts))  # one row per student

    similarities = cosine_similarity(student_embs, role_emb)[:, 0]

    ranked = pd.DataFrame({
        "student": students_df["name"].tolist(),
        "skills": skill_texts,
        # Python's round() on a float, exactly as before, to keep displayed values stable.
        "score": [round(float(value), 2) for value in similarities],
    }, columns=columns)

    return ranked.sort_values("score", ascending=False)


In [23]:
# Rank every student against the skills required for the "datasci" role.
rank_datasci = rank_students_for_role(
    role="datasci",
    skills=project_row["role_skill_map"]["datasci"],
    students_df=students_df,
)

rank_datasci.head(10)


Unnamed: 0,student,skills,score
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.9
3,Nok,"Python, Django, Authentication, REST, Software...",0.78
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.52
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.42
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.29
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.26
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.24
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.12


In [24]:
# Rank every student against the skills required for the "dataen" role.
rank_dataen = rank_students_for_role(
    role="dataen",
    skills=project_row["role_skill_map"]["dataen"],
    students_df=students_df,
)

rank_dataen.head(10)


Unnamed: 0,student,skills,score
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.55
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.31
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.21
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.18
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.18
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.11
3,Nok,"Python, Django, Authentication, REST, Software...",0.1
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.1


In [25]:
def compute_coverage(student_skills, required_skills):
    """Return the fraction of required skills found in a student's skill text.

    Parameters
    ----------
    student_skills : str
        Comma-separated skill list, e.g. "Python, Django, REST API".
        Any non-string value (such as NaN from a blank CSV cell) counts as
        having no skills.
    required_skills : iterable of str
        Skills required for the role. They are stripped and lower-cased here,
        so callers need not pre-normalise them; blank entries are ignored.

    Returns
    -------
    float
        Value in [0, 1]. A required skill counts as covered when it appears
        as a substring of any one comma-separated student skill, so "etl"
        is covered by "etl pipelines" but "rest api" is not covered by "rest".
        Returns 0.0 when there are no required skills (this used to raise
        ZeroDivisionError).
    """
    required = {
        skill.strip().lower()
        for skill in (required_skills or ())
        if isinstance(skill, str) and skill.strip()
    }
    if not required or not isinstance(student_skills, str):
        return 0.0

    student_tokens = [
        token.strip() for token in student_skills.lower().split(",") if token.strip()
    ]
    covered = sum(
        any(needed in token for token in student_tokens) for needed in required
    )
    return covered / len(required)

# Add, for each ranked student, the share of "datasci" skills they list explicitly.
rank_datasci["coverage"] = rank_datasci["skills"].map(
    lambda skills_text: compute_coverage(
        skills_text, project_row["role_skill_map"]["datasci"]
    )
)

rank_datasci.head(10)


Unnamed: 0,student,skills,score,coverage
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.9,1.0
3,Nok,"Python, Django, Authentication, REST, Software...",0.78,0.5
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.52,0.25
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.42,0.25
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.29,0.0
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.26,0.0
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.24,0.0
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.12,0.0


In [26]:
# `compute_coverage` is defined once, in the earlier coverage cell. The
# byte-identical copy that used to sit here was removed so a later edit to
# one definition cannot silently be shadowed by the other.
rank_dataen["coverage"] = rank_dataen["skills"].apply(
    lambda x: compute_coverage(x, project_row["role_skill_map"]["dataen"])
)

rank_dataen.head(10)

Unnamed: 0,student,skills,score,coverage
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.55,0.666667
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.31,0.333333
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.21,0.0
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.18,0.0
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.18,0.0
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.11,0.0
3,Nok,"Python, Django, Authentication, REST, Software...",0.1,0.0
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.1,0.0


In [27]:
def decide_row(row, min_score=0.35, min_coverage=0.5):
    """Classify one ranked student as "REJECT", "UPSKILL" or "ACCEPT".

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"score"`` and ``"coverage"`` (a DataFrame
        row or a plain dict).
    min_score : float, default 0.35
        Minimum similarity score (inclusive) to be considered at all.
    min_coverage : float, default 0.5
        Minimum skill coverage (inclusive) to be accepted without upskilling.
        Previously hard-coded to 0.5.

    Returns
    -------
    str
        "REJECT" when the score is below ``min_score``; otherwise "UPSKILL"
        when coverage is below ``min_coverage``; otherwise "ACCEPT".

    Notes
    -----
    The checks are written as ``not (value >= threshold)`` so a NaN score
    yields "REJECT" and a NaN coverage yields "UPSKILL". With the former
    ``value < threshold`` form, NaN failed both checks and was accepted.
    """
    if not (row["score"] >= min_score):
        return "REJECT"
    if not (row["coverage"] >= min_coverage):
        return "UPSKILL"
    return "ACCEPT"

# Use the threshold carried by the project row instead of relying on
# decide_row's default, so the two cannot drift apart. DataFrame.apply
# forwards extra keyword arguments to the function.
rank_datasci["decision"] = rank_datasci.apply(
    decide_row, axis=1, min_score=project_row["min_score"]
)
rank_datasci.head(10)


Unnamed: 0,student,skills,score,coverage,decision
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.9,1.0,ACCEPT
3,Nok,"Python, Django, Authentication, REST, Software...",0.78,0.5,ACCEPT
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.52,0.25,UPSKILL
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.42,0.25,UPSKILL
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.29,0.0,REJECT
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.26,0.0,REJECT
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.24,0.0,REJECT
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.12,0.0,REJECT


In [28]:
# `decide_row` is defined once, in the earlier decision cell. The
# byte-identical copy that used to sit here was removed to avoid one
# definition silently shadowing the other.
# The threshold comes from the project row rather than decide_row's default;
# DataFrame.apply forwards extra keyword arguments to the function.
rank_dataen["decision"] = rank_dataen.apply(
    decide_row, axis=1, min_score=project_row["min_score"]
)
rank_dataen.head(10)

Unnamed: 0,student,skills,score,coverage,decision
2,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.55,0.666667,ACCEPT
4,Kai,"Java, Spring Boot, SQL, Backend Development",0.31,0.333333,REJECT
6,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.21,0.0,REJECT
5,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.18,0.0,REJECT
7,May,"Data Analysis, Pandas, Visualization, Statistics",0.18,0.0,REJECT
0,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.11,0.0,REJECT
3,Nok,"Python, Django, Authentication, REST, Software...",0.1,0.0,REJECT
1,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.1,0.0,REJECT


In [29]:
# Select the example student explicitly. `row` was previously not defined in
# any cell (it only existed as leftover kernel state), so this cell raised
# NameError after Restart & Run All. "Aom" is the datasci row whose values
# match the output stored with this cell.
example_student = "Aom"
row = rank_datasci.loc[rank_datasci["student"] == example_student].iloc[0]

context = {
    # NOTE(review): "data" is not one of the role keys in role_skill_map
    # ("datasci" / "dataen") although the skills below are the datasci ones.
    # Confirm whether this label is intentional.
    "role": "data",
    "required_skills": project_row["role_skill_map"]["datasci"],
    "student_skills": row["skills"],
    "score": row["score"],
    "coverage": row["coverage"],
    "decision": row["decision"]
}

context


{'role': 'data',
 'required_skills': ['python', 'machine learning', 'django', 'rest api'],
 'student_skills': 'Python, Flask, Data Engineering, SQL, ETL Pipelines',
 'score': np.float64(0.52),
 'coverage': np.float64(0.25),
 'decision': 'UPSKILL'}