<a href="https://colab.research.google.com/github/Akarsh-Rajgit/Placemate/blob/main/hybrid_job_recomend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install --upgrade --force-reinstall "numpy<2.0" pandas scikit-learn scikit-surprise==1.1.4


Collecting numpy<2.0
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scikit-surprise==1.1.4
  Using cached scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl
Collecting joblib>=1.2.0 (from scikit-surprise==1.1.4)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting scipy>=1.6.0 (from scikit-surprise==1.1.4)
  Downloading scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdat

In [1]:
import os, string, pickle, json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from surprise import Dataset as SurpriseDataset
from surprise import Reader, SVD

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.replace("\n", " ").replace("\r", " ")
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(ch for ch in text if not ch.isdigit())
    return text


In [2]:
class ContentModels:
    def __init__(self, max_features=7000, ngram_range=(1, 2), stop_words="english"):
        self.vectorizer = TfidfVectorizer(max_features=max_features,
                                          ngram_range=ngram_range,
                                          stop_words=stop_words)
        self.nb = MultinomialNB()
        self.lr = LogisticRegression(max_iter=1000, random_state=42)
        self._fitted = False

    def fit(self, resumes, labels):
        resumes = resumes.apply(clean_text)
        X = self.vectorizer.fit_transform(resumes)
        self.nb.fit(X, labels)
        self.lr.fit(X, labels)
        self._fitted = True

    def evaluate(self, resumes, labels):
        X = self.vectorizer.transform(resumes.apply(clean_text))
        nb_acc = accuracy_score(labels, self.nb.predict(X))
        lr_acc = accuracy_score(labels, self.lr.predict(X))
        return {"nb_accuracy": nb_acc, "lr_accuracy": lr_acc}

    def content_scores(self, resume_text: str):
        X_new = self.vectorizer.transform([clean_text(resume_text)])
        nb_probs = self.nb.predict_proba(X_new)[0]
        lr_probs = self.lr.predict_proba(X_new)[0]
        labels = self.nb.classes_
        probs = (nb_probs + lr_probs) / 2.0
        return {label: float(p) for label, p in zip(labels, probs)}


In [3]:
class CFModelSVD:
    def __init__(self):
        self.reader = Reader(rating_scale=(0.0, 1.0))
        self.algo = None

    def fit(self, interactions_df):
        data = SurpriseDataset.load_from_df(interactions_df[['user_id','job_id','rating']], self.reader)
        trainset = data.build_full_trainset()
        self.algo = SVD()
        self.algo.fit(trainset)

    def predict_scores(self, user_id, items):
        scores = {}
        for job in items:
            est = self.algo.predict(str(user_id), str(job)).est
            scores[job] = float(est)
        return scores


In [4]:
class CFModelSVD:
    def __init__(self):
        self.reader = Reader(rating_scale=(0.0, 1.0))
        self.algo = None

    def fit(self, interactions_df):
        data = SurpriseDataset.load_from_df(interactions_df[['user_id','job_id','rating']], self.reader)
        trainset = data.build_full_trainset()
        self.algo = SVD()
        self.algo.fit(trainset)

    def predict_scores(self, user_id, items):
        scores = {}
        for job in items:
            est = self.algo.predict(str(user_id), str(job)).est
            scores[job] = float(est)
        return scores


In [5]:
class HybridRecommender:
    def __init__(self, content, cf=None, alpha=0.6):
        self.content = content
        self.cf = cf
        self.alpha = alpha

    def recommend(self, resume_text=None, user_id=None, job_labels=None, top_k=10):
        if job_labels is None:
            job_labels = list(self.content.nb.classes_)

        content_scores = {j: 0.0 for j in job_labels}
        if resume_text:
            content_scores = self.content.content_scores(resume_text)

        cf_scores = {j: 0.0 for j in job_labels}
        if self.cf and user_id is not None:
            cf_scores = self.cf.predict_scores(user_id, job_labels)

        final = {j: self.alpha * content_scores.get(j,0) + (1-self.alpha)*cf_scores.get(j,0) for j in job_labels}
        ranked = sorted(final.items(), key=lambda x:x[1], reverse=True)[:top_k]

        results = []
        for job,score in ranked:
            why = []
            if content_scores.get(job,0)>0: why.append("Skills/keywords matched resume")
            if cf_scores.get(job,0)>0: why.append("Similar users applied here")
            results.append({"job":job,"score":score,"why":why})
        return results


In [13]:
import random

job_categories = df['Category'].unique().tolist()

num_users = 50
interactions_list = []

action_weights = {
    'applied': 1.0,
    'saved': 0.7,
    'viewed': 0.5,
    'searched': 0.3,
}

for user_id in range(1, num_users+1):
    sampled_jobs = random.sample(job_categories, k=min(len(job_categories), random.randint(5, 10)))
    for job in sampled_jobs:
        action = random.choice(list(action_weights.keys()))
        rating = action_weights[action]
        interactions_list.append([user_id, job, rating])

interactions = pd.DataFrame(interactions_list, columns=['user_id', 'job_id', 'rating'])

print("✅ Dummy interactions dataset created.")
print(interactions.head())


✅ Dummy interactions dataset created.
   user_id              job_id  rating
0        1    Python Developer     0.3
1        1       SAP Developer     0.5
2        1  Operations Manager     1.0
3        1             Testing     0.7
4        1  Health and fitness     0.3


In [14]:
cf_model = CFModelSVD()
cf_model.fit(interactions)
print("✅ CF model trained on dummy data.")


✅ CF model trained on dummy data.


In [18]:
import datetime

try:
    interactions = pd.read_csv("interactions.csv")
except FileNotFoundError:
    interactions = pd.DataFrame(columns=["user_id", "job_id", "rating", "timestamp"])

ACTION_WEIGHTS = {
    "applied": 1.0,
    "saved": 0.7,
    "viewed": 0.5,
    "searched": 0.3,
}

def log_interaction(user_id, job_id, action):
    """Log a student action (applied, saved, viewed, searched)."""
    rating = ACTION_WEIGHTS.get(action, 0.3)
    timestamp = datetime.datetime.now().isoformat()

    new_entry = pd.DataFrame([[user_id, job_id, rating, timestamp]],
                             columns=["user_id", "job_id", "rating", "timestamp"])

    global interactions
    interactions = pd.concat([interactions, new_entry], ignore_index=True)

    interactions.to_csv("interactions.csv", index=False)

    print(f"✅ Logged: user {user_id} {action} job {job_id} (rating={rating})")


In [19]:
cf_model = CFModelSVD()
cf_model.fit(interactions)
print("✅ CF model retrained with new interactions.")


✅ CF model retrained with new interactions.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [20]:
log_interaction(user_id=42, job_id="Data Science", action="searched")

log_interaction(user_id=42, job_id="Machine Learning Engineer", action="applied")

# Retrain CF
cf_model.fit(interactions)

# Now recommend for user 42
resume_text = "Student skilled in Python, ML, and data analysis projects."
recs = hybrid.recommend(resume_text, user_id=42, top_k=5)

print("\n🎯 Updated Hybrid Recommendations for user 42:\n")
for r in recs:
    print(f"- {r['job']} | {r['score']:.4f} | Why: {', '.join(r['why'])}")


✅ Logged: user 42 searched job Data Science (rating=0.3)
✅ Logged: user 42 applied job Machine Learning Engineer (rating=1.0)

🎯 Updated Hybrid Recommendations for user 42:

- Testing | 0.3218 | Why: Skills/keywords matched resume, Similar users applied here
- Python Developer | 0.3154 | Why: Skills/keywords matched resume, Similar users applied here
- PMO | 0.3057 | Why: Skills/keywords matched resume, Similar users applied here
- Web Designing | 0.2943 | Why: Skills/keywords matched resume, Similar users applied here
- Data Science | 0.2919 | Why: Skills/keywords matched resume, Similar users applied here


  interactions = pd.concat([interactions, new_entry], ignore_index=True)


In [15]:
test_user = 10
job_labels = job_categories[:5]  # just pick first 5 categories for display

cf_scores = cf_model.predict_scores(test_user, job_labels)

print(f"📌 Collaborative predictions for user {test_user}:")
for job, score in cf_scores.items():
    print(f"- {job}: {score:.4f}")

📌 Collaborative predictions for user 10:
- Data Science: 0.5614
- HR: 0.6388
- Advocate: 0.6595
- Arts: 0.6629
- Web Designing: 0.6820


In [16]:
resume_text = """Experienced software engineer skilled in Python, Flask, and machine learning.
Worked on full stack projects, chatbots, and data science pipelines."""

hybrid = HybridRecommender(content_model, cf_model, alpha=0.6)

recs = hybrid.recommend(resume_text, user_id=10, top_k=5)

print("\n🎯 Hybrid Recommendations (with dummy CF):\n")
for r in recs:
    print(f"- {r['job']} | {r['score']:.4f} | Why: {', '.join(r['why'])}")



🎯 Hybrid Recommendations (with dummy CF):

- Data Science | 0.4252 | Why: Skills/keywords matched resume, Similar users applied here
- Testing | 0.3184 | Why: Skills/keywords matched resume, Similar users applied here
- PMO | 0.2988 | Why: Skills/keywords matched resume, Similar users applied here
- Python Developer | 0.2899 | Why: Skills/keywords matched resume, Similar users applied here
- Web Designing | 0.2879 | Why: Skills/keywords matched resume, Similar users applied here


In [11]:
! pip install docx2txt pdfplumber

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m86.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m108.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: docx2txt, pypdfium2, 

In [26]:
from google.colab import files
import docx2txt
import pdfplumber

# Upload a resume file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Extract text from DOCX or PDF
resume_text = ""
if filename.endswith(".docx"):
    resume_text = docx2txt.process(filename)
elif filename.endswith(".pdf"):
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            resume_text += page.extract_text() + "\n"
else:
    with open(filename, "r", encoding="utf-8") as f:
        resume_text = f.read()

print("✅ Resume text extracted successfully.")

# 🔹 Cold-start handling: if no user_id available, set to None
user_id = 4  # change to an int if you have logs (e.g., 10), else keep None

# Run hybrid recommender
recs = hybrid.recommend(resume_text, user_id=user_id, top_k=5)

print("\n🎯 Top Job Recommendations:\n")
for r in recs:
    print(f"- {r['job']} | score={r['score']:.4f} | why: {', '.join(r['why']) if r['why'] else 'Based on skills only'}")


Saving Sudhamani-M-J.pdf to Sudhamani-M-J.pdf
✅ Resume text extracted successfully.

🎯 Top Job Recommendations:

- Data Science | score=0.3663 | why: Skills/keywords matched resume, Similar users applied here
- Testing | score=0.3201 | why: Skills/keywords matched resume, Similar users applied here
- Java Developer | score=0.3012 | why: Skills/keywords matched resume, Similar users applied here
- PMO | score=0.3004 | why: Skills/keywords matched resume, Similar users applied here
- Web Designing | score=0.2914 | why: Skills/keywords matched resume, Similar users applied here
