In [5]:
import re

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the student roster; the path is relative to the kernel's working directory.
df = pd.read_csv('students.csv')

In [7]:
# Show the loaded frame to check its columns and values.
df

Unnamed: 0,student_id,name,year,gpa,skills_text,current_assignments,max_capacity
0,S001,Somchai,4,3.45,"Python, Django, REST API, Machine Learning, Re...",1,3
1,S002,Suriya,3,3.8,"React, HTML, CSS, UX Design, Frontend Development",0,4
2,S003,Aom,2,3.2,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0,5
3,S004,Nok,4,3.6,"Python, Django, Authentication, REST, Software...",4,2
4,S005,Kai,3,3.1,"Java, Spring Boot, SQL, Backend Development",2,2
5,S006,Pim,4,3.75,"Python, NLP, Embeddings, Text Classification, ...",1,1
6,S007,Earth,3,3.0,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",3,5
7,S008,May,2,3.5,"Data Analysis, Pandas, Visualization, Statistics",0,6


In [8]:
# Instantiate the sentence-transformers model by name; this single instance is
# what gets wrapped by SkillEmbedderTool.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
class SkillEmbedderTool:
    """Tool 2: Creates semantic embeddings for skill texts.

    Thin wrapper around a model object that exposes
    ``encode(text_or_texts, normalize_embeddings=...)``.
    """

    def __init__(self, model):
        """Store the encoder.

        Args:
            model: Object with an ``encode`` method accepting either a single
                string or a list of strings plus ``normalize_embeddings``.
        """
        self.model = model

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a single text (skill/requirement description).

        Args:
            text: The text to encode.

        Returns:
            Whatever ``model.encode`` returns for one string, requested with
            ``normalize_embeddings=True``.
        """
        return self.model.encode(text, normalize_embeddings=True)

    def embed_dataframe(self, df: pd.DataFrame, text_col: str) -> pd.DataFrame:
        """Add an ``embedding`` column to a copy of ``df``.

        All texts are encoded in a single ``model.encode`` call so the model
        can batch them, instead of invoking the encoder once per row.

        Args:
            df: Input frame; it is not modified.
            text_col: Name of the column holding the texts to embed.

        Returns:
            A copy of ``df`` with an object-dtype ``embedding`` column holding
            one vector per row.

        Raises:
            KeyError: If ``text_col`` is not a column of ``df``.
        """
        result = df.copy()
        texts = result[text_col].tolist()

        if not texts:
            # Nothing to encode: skip the model call but keep the column
            # present so downstream code sees a consistent schema.
            result["embedding"] = pd.Series([], index=result.index, dtype=object)
            return result

        vectors = self.model.encode(texts, normalize_embeddings=True)

        # Wrap the per-row vectors in an object Series that shares result's
        # index, so each cell holds one whole vector and rows stay aligned.
        result["embedding"] = pd.Series(
            list(vectors), index=result.index, dtype=object
        )
        return result

# Shared embedder instance; StudentRankerTool.rank_students relies on this
# module-level name being defined.
skill_embedder = SkillEmbedderTool(embedding_model)

In [10]:
class StudentRankerTool:
    """
    TOOL 2: Ranks students by semantic skill match against project requirements.
    Uses cosine similarity on embeddings + coverage metrics.

    NOTE(review): SkillEmbedderTool's docstring is also labelled "Tool 2";
    one of the two numbers is probably stale.

    Args:
        embedder: Optional object exposing ``embed_text(text) -> vector``.
            When omitted, the module-level ``skill_embedder`` is looked up at
            ranking time, so ``StudentRankerTool()`` keeps working unchanged.
    """

    # Column order of the ranking frame; also used to build a well-formed
    # empty result when there is nobody to rank.
    RESULT_COLUMNS = (
        "student_id",
        "student_name",
        "student_skills_text",
        "score",
        "coverage",
        "current_assignments",
        "max_capacity",
    )

    def __init__(self, embedder=None):
        self.embedder = embedder

    def rank_students(
        self,
        project_row: pd.Series,
        students_df: pd.DataFrame,
        verbose: bool = True
    ) -> pd.DataFrame:
        """
        Rank students by semantic and coverage score against project requirements.

        Args:
            project_row: Project specification as pd.Series with 'required_skills_text'
            students_df: DataFrame of students with 'student_id', 'name',
                'skills_text', 'current_assignments' and 'max_capacity' columns
            verbose: When True (default) print one line per student with the
                id, rounded score, rounded coverage and skills text.

        Returns:
            DataFrame sorted by semantic score (descending). Scores are rounded
            to 3 decimals and coverage to 2; students whose rounded scores tie
            keep their relative order from ``students_df``. An empty
            ``students_df`` yields an empty frame with the result columns.
        """
        required_text = project_row["required_skills_text"]

        if students_df.empty:
            # sort_values("score") on a column-less frame would raise KeyError.
            return pd.DataFrame(columns=list(self.RESULT_COLUMNS))

        # Resolve the embedder lazily so the global fallback only has to exist
        # by the time ranking actually runs.
        embedder = self.embedder if self.embedder is not None else skill_embedder

        # Embed project
        project_emb = embedder.embed_text(required_text)

        # Embed all students, then score them in one cosine_similarity call
        # (rows = students, single column = the project).
        student_embs = [
            embedder.embed_text(text) for text in students_df["skills_text"]
        ]
        scores = cosine_similarity(student_embs, [project_emb])[:, 0]

        results = []

        for (_, student), raw_score in zip(students_df.iterrows(), scores):
            score = round(float(raw_score), 3)
            coverage = round(
                self.compute_coverage(student["skills_text"], required_text), 2
            )

            if verbose:
                print(
                    student["student_id"],
                    score,
                    coverage,
                    student["skills_text"]
                )

            results.append({
                "student_id": student["student_id"],
                "student_name": student["name"],
                "student_skills_text": student["skills_text"],
                "score": score,
                "coverage": coverage,
                "current_assignments": student["current_assignments"],
                "max_capacity": student["max_capacity"]
            })

        return (
            pd.DataFrame(results, columns=list(self.RESULT_COLUMNS))
            # Stable sort: rounding creates ties, and the default algorithm
            # gives no ordering guarantee among equal keys.
            .sort_values("score", ascending=False, kind="stable")
            .reset_index(drop=True)
        )

    @staticmethod
    def compute_coverage(student_skills: str, required_skills: str) -> float:
        """
        Compute fraction of required skills present in student skills.

        Both inputs are comma-separated lists, compared case-insensitively.
        A required skill counts as covered when it appears inside some student
        skill as a whole token or phrase ("python" is covered by
        "python programming"), but not as a fragment of a longer token
        ("java" is not covered by "javascript", "c" is not covered by "c++").

        Args:
            student_skills: Comma-separated skills the student has.
            required_skills: Comma-separated skills the project needs.

        Returns:
            Covered required skills divided by the number of distinct required
            skills; 1.0 when there are no required skills.
        """
        student_set = {s.strip().lower() for s in student_skills.split(",") if s.strip()}
        required_set = {s.strip().lower() for s in required_skills.split(",") if s.strip()}

        if not required_set:
            return 1.0

        def _is_covered(req: str) -> bool:
            # Lookarounds instead of \b so skills made of non-word characters
            # ("c++", "c#", ".net") still match themselves, while '+' and '#'
            # are treated as part of a token to keep "c" apart from "c++".
            pattern = re.compile(
                r"(?<![\w+#])" + re.escape(req) + r"(?![\w+#])"
            )
            return any(pattern.search(skill) for skill in student_set)

        matches = sum(1 for req in required_set if _is_covered(req))
        return matches / len(required_set)

# Default ranker instance (constructed with no arguments).
student_ranker = StudentRankerTool()

In [11]:
# Embed every student's skills_text into a new "embedding" column. df itself is
# left untouched because embed_dataframe works on a copy. Preview the first rows.
df_with_emb = skill_embedder.embed_dataframe(
    df,
    text_col="skills_text"
)
df_with_emb.head()

Unnamed: 0,student_id,name,year,gpa,skills_text,current_assignments,max_capacity,embedding
0,S001,Somchai,4,3.45,"Python, Django, REST API, Machine Learning, Re...",1,3,"[-0.09272324, -0.059916634, 0.008760903, 0.061..."
1,S002,Suriya,3,3.8,"React, HTML, CSS, UX Design, Frontend Development",0,4,"[-0.03667861, 0.01048227, 0.0058068405, -0.003..."
2,S003,Aom,2,3.2,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0,5,"[-0.0672544, -0.02639541, 0.001569687, 0.02973..."
3,S004,Nok,4,3.6,"Python, Django, Authentication, REST, Software...",4,2,"[-0.054426126, -0.0016615324, -0.0050594355, -..."
4,S005,Kai,3,3.1,"Java, Spring Boot, SQL, Backend Development",2,2,"[-0.0070108483, -0.07133077, -0.0392656, -0.02..."


In [14]:
# Single example project; "required_skills_text" is the comma-separated field
# that StudentRankerTool.rank_students reads from the project row.
projects_df = pd.DataFrame([
    {
        "project_id": 1,
        "required_skills_text": "python, machine learning, data analysis"
    }
])
projects_df

Unnamed: 0,project_id,required_skills_text
0,1,"python, machine learning, data analysis"


In [None]:
# Take the first (and only) project as a Series to hand to the ranker.
project_row = projects_df.iloc[0]

In [16]:
# Inspect the selected project row.
project_row

project_id                                                    1
required_skills_text    python, machine learning, data analysis
Name: 0, dtype: object

In [17]:
# NOTE(review): StudentRankerTool() is already instantiated right after the
# class definition; this re-creation is redundant unless the class cell was re-run.
student_ranker = StudentRankerTool()

In [18]:
# Rank every student in df against the selected project. The plain df is passed
# here (not df_with_emb), so rank_students embeds each skills_text itself; it
# also prints one line per student while scoring.
ranked_students = student_ranker.rank_students(
    project_row=project_row,
    students_df=df
)

S001 0.593 0.67 Python, Django, REST API, Machine Learning, Regression Modeling
S002 0.122 0.0 React, HTML, CSS, UX Design, Frontend Development
S003 0.486 0.33 Python, Flask, Data Engineering, SQL, ETL Pipelines
S004 0.412 0.33 Python, Django, Authentication, REST, Software Engineering
S005 0.063 0.0 Java, Spring Boot, SQL, Backend Development
S006 0.534 0.33 Python, NLP, Embeddings, Text Classification, Transformers
S007 0.122 0.0 DevOps, Docker, Kubernetes, CI/CD, Cloud Infrastructure
S008 0.583 0.33 Data Analysis, Pandas, Visualization, Statistics


In [19]:
# Display the ranking, highest semantic score first.
ranked_students

Unnamed: 0,student_id,student_name,student_skills_text,score,coverage,current_assignments,max_capacity
0,S001,Somchai,"Python, Django, REST API, Machine Learning, Re...",0.593,0.67,1,3
1,S008,May,"Data Analysis, Pandas, Visualization, Statistics",0.583,0.33,0,6
2,S006,Pim,"Python, NLP, Embeddings, Text Classification, ...",0.534,0.33,1,1
3,S003,Aom,"Python, Flask, Data Engineering, SQL, ETL Pipe...",0.486,0.33,0,5
4,S004,Nok,"Python, Django, Authentication, REST, Software...",0.412,0.33,4,2
5,S002,Suriya,"React, HTML, CSS, UX Design, Frontend Development",0.122,0.0,0,4
6,S007,Earth,"DevOps, Docker, Kubernetes, CI/CD, Cloud Infra...",0.122,0.0,3,5
7,S005,Kai,"Java, Spring Boot, SQL, Backend Development",0.063,0.0,2,2
