In [9]:
import pandas as pd
import numpy as np

df = pd.read_csv("skills_dataset.csv")

def collect_columns(prefix):
    cols = [c for c in df.columns if c.startswith(prefix)]
    return df[cols].fillna("").agg(", ".join, axis=1)

df["prerequisites_text"] = collect_columns("prerequisites/")
df["complementary_text"] = collect_columns("complementary_skills/")
df["industry_text"] = collect_columns("industry_usage/")


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Optional fine-tuning configuration. Set `do_finetune = True` to run epochs.
do_finetune = False
finetune_epochs = 30
finetune_batch_size = 16
save_finetuned_model = False
finetuned_model_path = "fine_tuned_sbert"

if do_finetune:
    from sentence_transformers import InputExample, losses
    from torch.utils.data import DataLoader
    # Prepare positive pairs: skill text <-> complementary/prerequisite context
    examples = []
    for _, r in df.iterrows():
        skill_text = f"Skill: {r['skill_name']} Category: {r['category']} Industry: {r['industry_text']}"
        pos = r['complementary_text'] if pd.notna(r['complementary_text']) and str(r['complementary_text']).strip() else r['prerequisites_text']
        if pd.isna(pos) or not str(pos).strip():
            continue
        pos_text = f"Context: {pos}"
        examples.append(InputExample(texts=[skill_text, pos_text]))

    if len(examples) == 0:
        print("No training examples found; skipping fine-tune")
    else:
        train_dataloader = DataLoader(examples, shuffle=True, batch_size=finetune_batch_size)
        train_loss = losses.MultipleNegativesRankingLoss(model)
        warmup_steps = max(100, int(len(train_dataloader) * finetune_epochs * 0.1))
        model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=finetune_epochs, warmup_steps=warmup_steps)
        if save_finetuned_model:
            model.save(finetuned_model_path)

def build_weighted_embedding(row):
    skill_emb = model.encode(
        f"Skill: {row['skill_name']}",
        normalize_embeddings=True
    )

    context_emb = model.encode(
        f"""
        Category: {row['category']}
        Industry Usage: {row['industry_text']}
        Complementary Skills: {row['complementary_text']}
        """,
        normalize_embeddings=True
    )

    market_emb = model.encode(
        f"""
        Job Demand Score: {row['job_demand_score']}
        Future Relevance Score: {row['future_relevance_score']}
        Market Trend: {row['market_trend']}
        """,
        normalize_embeddings=True
    )

    return (
        0.5 * skill_emb +
        0.3 * context_emb +
        0.2 * market_emb
    )

df["embedding"] = df.apply(build_weighted_embedding, axis=1)
embeddings = np.vstack(df["embedding"].values)


In [11]:
import faiss

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)


In [18]:

def filter_by_prerequisites(df, user_skills):
    # Keep this helper if you want to strictly filter out skills that require
    # prerequisites the user doesn't have. For recommendations based on
    # matching user skills we'd rather compute overlap than hard-filter.
    def prereq_ok(prereqs):
        if not prereqs:
            return True
        prereq_list = [p.strip().lower() for p in prereqs.split(",")]
        return any(skill.lower() in prereq_list for skill in user_skills)

    # Return boolean mask if strict filtering is needed elsewhere
    return df[df["prerequisites_text"].apply(prereq_ok)]


In [19]:
# User profile embeddings removed.
# Recommendations build a query embedding by averaging existing skill embeddings
# for the user's known skills (avoids encoding arbitrary user text).


In [20]:
def structured_score(row, similarity, match_score=0.0):
    # Combine semantic similarity, market signals and explicit skill matches.
    # We give extra weight to match_score so user skills influence ranking.
    sim_w = 0.45
    demand_w = 0.15
    future_w = 0.15
    match_w = 0.25
    return (
        sim_w * similarity +
        demand_w * (row["job_demand_score"] / 100) +
        future_w * (row["future_relevance_score"] / 100) +
        match_w * match_score
    )


In [21]:
def recommend_skills(
    user_skills,
    top_k=10
):

    # Build a query embedding by averaging embeddings of the user's known skills
    user_set = set([s.strip().lower() for s in user_skills if s.strip()])
    mask = df['skill_name'].fillna('').str.lower().isin(user_set)
    if mask.any():
        user_query_emb = np.vstack(df.loc[mask, 'embedding'].values).mean(axis=0)
    else:
        # fallback: use mean of all embeddings (less specific)
        user_query_emb = np.mean(embeddings, axis=0)

    # retrieve a larger candidate pool from FAISS and re-rank using structured signals
    scores, indices = index.search(
        user_query_emb.reshape(1, -1),
        k=200  # retrieve more, re-rank and dedupe below
    )

    candidates = df.iloc[indices[0]].copy().reset_index(drop=True)
    candidates["similarity"] = scores[0]

    # compute overlaps between user skills and candidate text fields
    user_set = set([s.strip().lower() for s in user_skills if s.strip()])

    def overlap_count(text):
        if not text or pd.isna(text):
            return 0
        toks = set([t.strip().lower() for t in text.split(",") if t.strip()])
        return len(user_set & toks)

    candidates["prereq_overlap"] = candidates["prerequisites_text"].fillna("").apply(overlap_count)
    candidates["comp_overlap"] = candidates["complementary_text"].fillna("").apply(overlap_count)
    candidates["skill_name_match"] = candidates["skill_name"].fillna("").apply(lambda s: 1 if s.strip().lower() in user_set else 0)

    denom = max(1, len(user_set))
    # weighted match score: favour exact skill-name matches, then prereqs, then complementary skills
    candidates["match_score"] = (
        0.6 * candidates["skill_name_match"] +
        0.3 * (candidates["prereq_overlap"] / denom) +
        0.1 * (candidates["comp_overlap"] / denom)
    )

    # apply structured scoring that includes the match_score
    candidates["final_score"] = candidates.apply(
        lambda r: structured_score(r, r["similarity"], r["match_score"]),
        axis=1
    )

    # dedupe by skill name and return top-k
    result = candidates.sort_values("final_score", ascending=False).drop_duplicates(subset=["skill_name"]).head(top_k).reset_index(drop=True)
    return result


In [25]:

user_skills = ["Python", "DSA"]

recommendations = recommend_skills( user_skills, top_k=20)
print(recommendations['skill_name'].tolist())

['Python', 'Python Programming', 'Django', 'Python Programming (for ML)', 'Machine Learning Frameworks (TensorFlow, PyTorch)', 'Artificial Intelligence (AI) & Machine Learning (ML)', 'Flask', 'Deep Learning Frameworks (TensorFlow, PyTorch)', 'Artificial Intelligence (AI) Programming', 'Data Structures & Algorithms (DSA)', 'Scikit-learn', 'Data Structures & Algorithms', 'Scripting Languages (Shell, Python)', 'Python for Finance', 'Scripting Languages (Python, Go, Ruby)', 'Regression Analysis', 'R (Statistical Programming Language)', 'Algorithms', 'Data Structures', 'TensorFlow']
