In [1]:
!pip install --upgrade --force-reinstall "numpy<2.0" pandas scikit-learn scikit-surprise==1.1.4


Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scikit-surprise==1.1.4
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?

In [1]:
import os, string, pickle, json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from surprise import Dataset as SurpriseDataset
from surprise import Reader, SVD

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.replace("\n", " ").replace("\r", " ")
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(ch for ch in text if not ch.isdigit())
    return text


In [2]:
class ContentModels:
    def __init__(self, max_features=7000, ngram_range=(1, 2), stop_words="english"):
        self.vectorizer = TfidfVectorizer(max_features=max_features,
                                          ngram_range=ngram_range,
                                          stop_words=stop_words)
        self.nb = MultinomialNB()
        self.lr = LogisticRegression(max_iter=1000, random_state=42)
        self._fitted = False

    def fit(self, resumes, labels):
        resumes = resumes.apply(clean_text)
        X = self.vectorizer.fit_transform(resumes)
        self.nb.fit(X, labels)
        self.lr.fit(X, labels)
        self._fitted = True

    def evaluate(self, resumes, labels):
        X = self.vectorizer.transform(resumes.apply(clean_text))
        nb_acc = accuracy_score(labels, self.nb.predict(X))
        lr_acc = accuracy_score(labels, self.lr.predict(X))
        return {"nb_accuracy": nb_acc, "lr_accuracy": lr_acc}

    def content_scores(self, resume_text: str):
        X_new = self.vectorizer.transform([clean_text(resume_text)])
        nb_probs = self.nb.predict_proba(X_new)[0]
        lr_probs = self.lr.predict_proba(X_new)[0]
        labels = self.nb.classes_
        probs = (nb_probs + lr_probs) / 2.0
        return {label: float(p) for label, p in zip(labels, probs)}


In [4]:
class CFModelSVD:
    def __init__(self):
        self.reader = Reader(rating_scale=(0.0, 1.0))
        self.algo = None

    def fit(self, interactions_df):
        data = SurpriseDataset.load_from_df(interactions_df[['user_id','job_id','rating']], self.reader)
        trainset = data.build_full_trainset()
        self.algo = SVD()
        self.algo.fit(trainset)

    def predict_scores(self, user_id, items):
        scores = {}
        for job in items:
            est = self.algo.predict(str(user_id), str(job)).est
            scores[job] = float(est)
        return scores


In [5]:
class HybridRecommender:
    def __init__(self, content, cf=None, alpha=0.6):
        self.content = content
        self.cf = cf
        self.alpha = alpha

    def recommend(self, resume_text=None, user_id=None, job_labels=None, top_k=10):
        if job_labels is None:
            job_labels = list(self.content.nb.classes_)

        content_scores = {j: 0.0 for j in job_labels}
        if resume_text:
            content_scores = self.content.content_scores(resume_text)

        cf_scores = {j: 0.0 for j in job_labels}
        if self.cf and user_id is not None:
            cf_scores = self.cf.predict_scores(user_id, job_labels)

        final = {j: self.alpha * content_scores.get(j,0) + (1-self.alpha)*cf_scores.get(j,0) for j in job_labels}
        ranked = sorted(final.items(), key=lambda x:x[1], reverse=True)[:top_k]

        results = []
        for job,score in ranked:
            why = []
            if content_scores.get(job,0)>0: why.append("Skills/keywords matched resume")
            if cf_scores.get(job,0)>0: why.append("Similar users applied here")
            results.append({"job":job,"score":score,"why":why})
        return results


In [6]:
import pandas as pd

file_id = "1gIROkSSuRwqEqAk2o_IH60fUoiAL71PG"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url, encoding="utf-8")
print("✅ Dataset loaded:", df.shape)
print(df.head())


✅ Dataset loaded: (962, 2)
       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...


In [7]:
import random

job_categories = df['Category'].unique().tolist()

num_users = 50
interactions_list = []

action_weights = {
    'applied': 1.0,
    'saved': 0.7,
    'viewed': 0.5,
    'searched': 0.3,
}

for user_id in range(1, num_users+1):
    sampled_jobs = random.sample(job_categories, k=min(len(job_categories), random.randint(5, 10)))
    for job in sampled_jobs:
        action = random.choice(list(action_weights.keys()))
        rating = action_weights[action]
        interactions_list.append([user_id, job, rating])

interactions = pd.DataFrame(interactions_list, columns=['user_id', 'job_id', 'rating'])

print("✅ Dummy interactions dataset created.")
print(interactions.head())


✅ Dummy interactions dataset created.
   user_id              job_id  rating
0        1    Python Developer     0.7
1        1     DevOps Engineer     0.5
2        1      Civil Engineer     0.7
3        1              Hadoop     0.7
4        1  Operations Manager     0.7


In [8]:
cf_model = CFModelSVD()
cf_model.fit(interactions)
print("✅ CF model trained on dummy data.")


✅ CF model trained on dummy data.


In [9]:
import datetime

try:
    interactions = pd.read_csv("interactions.csv")
except FileNotFoundError:
    interactions = pd.DataFrame(columns=["user_id", "job_id", "rating", "timestamp"])

ACTION_WEIGHTS = {
    "applied": 1.0,
    "saved": 0.7,
    "viewed": 0.5,
    "searched": 0.3,
}

def log_interaction(user_id, job_id, action):
    """Log a student action (applied, saved, viewed, searched)."""
    rating = ACTION_WEIGHTS.get(action, 0.3)
    timestamp = datetime.datetime.now().isoformat()

    new_entry = pd.DataFrame([[user_id, job_id, rating, timestamp]],
                             columns=["user_id", "job_id", "rating", "timestamp"])

    global interactions
    interactions = pd.concat([interactions, new_entry], ignore_index=True)

    interactions.to_csv("interactions.csv", index=False)

    print(f"✅ Logged: user {user_id} {action} job {job_id} (rating={rating})")


In [10]:
cf_model = CFModelSVD()
cf_model.fit(interactions)
print("✅ CF model retrained with new interactions.")


✅ CF model retrained with new interactions.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [11]:
# ✅ Step 1: Define relevant technical categories
technical_jobs = [
    "Data Science", "Python Developer", "Java Developer", "AI", "Machine Learning Engineer",
    "Web Developer", "Software Engineer", "C++ Developer", "Testing", "DevOps", "Blockchain",
    "Database Administrator", "Full Stack Developer"
]

# ✅ Step 2: Filter dataset
print("Before filtering:", df['Category'].nunique(), "categories")
df = df[df['Category'].isin(technical_jobs)].reset_index(drop=True)
print("After filtering:", df['Category'].nunique(), "categories")
print("Remaining categories:\n", df['Category'].value_counts())


Before filtering: 25 categories
After filtering: 5 categories
Remaining categories:
 Category
Java Developer      84
Testing             70
Python Developer    48
Data Science        40
Blockchain          40
Name: count, dtype: int64


In [12]:
# --- Train content-based models again (if not already in memory) ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# X and y from your dataset
texts = df['Resume']          # resume text
labels = df['Category']       # job categories

# Vectorizer
vectorizer = TfidfVectorizer(max_features=7000, stop_words="english")
X = vectorizer.fit_transform(texts)

# Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X, labels)

print("✅ Content-based model trained.")


✅ Content-based model trained.


In [13]:
# Hybrid Recommendation Engine
class HybridRecommendationEngine:
    def __init__(self, content_model, cf_model, vectorizer):
        self.content_model = content_model
        self.cf_model = cf_model
        self.vectorizer = vectorizer

    def recommend(self, resume_text, user_id=None, top_k=5):
      # Content-based scores
      X_new = self.vectorizer.transform([resume_text])
      content_probs = self.content_model.predict_proba(X_new).mean(axis=0)
      content_scores = dict(zip(self.content_model.classes_, content_probs))

      # Collaborative scores
      cf_scores = {}
      if user_id is not None and hasattr(self.cf_model, "predict_scores"):
          cf_scores = self.cf_model.predict_scores(user_id, list(content_scores.keys()))

      # Dynamic weighting
      final_scores = {}
      for job, c_score in content_scores.items():
          # If resume signal is strong, trust content more (80-90%)
          if c_score > 0.15:
              alpha = 0.85
          else:
              alpha = 0.6   # if weak content signal, lean more on CF

          score = alpha * c_score + (1 - alpha) * cf_scores.get(job, 0)
          final_scores[job] = (score, c_score, cf_scores.get(job, 0))

      # Rank results
      ranked = sorted(final_scores.items(), key=lambda x: x[1][0], reverse=True)[:top_k]

      # Explain results
      results = []
      for job, (score, c_score, cf_score) in ranked:
          why = []
          if c_score > 0: why.append("Skills/keywords matched resume")
          if cf_score > 0: why.append("Similar users applied/searched this job")
          results.append({"job": job, "score": score, "why": why})
      return results




# ✅ Use already-trained models/vectorizer
hybrid = HybridRecommendationEngine(logreg_model, cf_model, vectorizer)


# --- Example test ---
log_interaction(user_id=42, job_id="Data Science", action="searched")
log_interaction(user_id=42, job_id="Machine Learning Engineer", action="applied")

# Retrain CF on updated logs
cf_model.fit(interactions)

# Get hybrid recommendations
resume_text = "Student skilled in Python, ML, and data analysis projects."
recs = hybrid.recommend(resume_text, user_id=42, top_k=5)

print("\n🎯 Updated Hybrid Recommendations for user 42:\n")
for r in recs:
    print(f"- {r['job']} | {r['score']:.4f} | Why: {', '.join(r['why'])}")

✅ Logged: user 42 searched job Data Science (rating=0.3)
✅ Logged: user 42 applied job Machine Learning Engineer (rating=1.0)

🎯 Updated Hybrid Recommendations for user 42:

- Data Science | 0.3986 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Blockchain | 0.3193 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Python Developer | 0.2873 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Java Developer | 0.2346 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Testing | 0.2299 | Why: Skills/keywords matched resume, Similar users applied/searched this job


  interactions = pd.concat([interactions, new_entry], ignore_index=True)


In [14]:
# ✅ Use already-trained models/vectorizer
hybrid = HybridRecommendationEngine(logreg_model, cf_model, vectorizer)


In [15]:
test_user = 10
job_labels = job_categories[:5]  # just pick first 5 categories for display

cf_scores = cf_model.predict_scores(test_user, job_labels)

print(f"📌 Collaborative predictions for user {test_user}:")
for job, score in cf_scores.items():
    print(f"- {job}: {score:.4f}")

📌 Collaborative predictions for user 10:
- Data Science: 0.6125
- HR: 0.6500
- Advocate: 0.6500
- Arts: 0.6500
- Web Designing: 0.6500


In [16]:
resume_text = """Experienced software engineer skilled in Python, Flask, and machine learning.
Worked on full stack projects, chatbots, and data science pipelines."""

# ✅ Use the HybridRecommendationEngine (the correct class in your notebook)
hybrid = HybridRecommendationEngine(logreg_model, cf_model, vectorizer)

recs = hybrid.recommend(resume_text, user_id=10, top_k=5)

print("\n🎯 Hybrid Recommendations (with CF):\n")
for r in recs:
    print(f"- {r['job']} | {r['score']:.4f} | Why: {', '.join(r['why'])}")



🎯 Hybrid Recommendations (with CF):

- Data Science | 0.5186 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Testing | 0.3424 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Java Developer | 0.3243 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Blockchain | 0.3192 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Python Developer | 0.2290 | Why: Skills/keywords matched resume, Similar users applied/searched this job


In [17]:
! pip install docx2txt pdfplumber

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━

In [21]:
from google.colab import files
import docx2txt
import pdfplumber

user_id = None
# Upload a resume file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Extract text from DOCX or PDF
resume_text = ""
if filename.endswith(".docx"):
    resume_text = docx2txt.process(filename)
elif filename.endswith(".pdf"):
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            resume_text += page.extract_text() + "\n"
else:
    with open(filename, "r", encoding="utf-8") as f:
        resume_text = f.read()

print("✅ Resume text extracted successfully.")

# 🔹 Cold-start handling: if no user_id available, set to None  # change to an int if you have logs (e.g., 10), else keep None

# Run hybrid recommender
recs = hybrid.recommend(resume_text, user_id=user_id, top_k=5)

# --- Debugging Predictions ---
resume_text = """Experienced software engineer skilled in Python, Flask, and machine learning.
Worked on full stack projects, chatbots, and data science pipelines."""

# Content-only predictions
X_new = vectorizer.transform([resume_text])
content_probs = logreg_model.predict_proba(X_new)[0]
top_content = sorted(
    zip(logreg_model.classes_, content_probs),
    key=lambda x: x[1],
    reverse=True
)[:5]

print("\n")

print("📌 Content-based predictions:")
for job, score in top_content:
    print(f"- {job}: {score:.4f}")

# Collaborative-only predictions
# Fixed: Pass job_labels as 'items' argument
cf_scores = cf_model.predict_scores(user_id=10, items=logreg_model.classes_)
print("\n📌 Collaborative predictions:")
for job, score in sorted(cf_scores.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(f"- {job}: {score:.4f}")

# Hybrid predictions
recs = hybrid.recommend(resume_text, user_id=10, top_k=5)
print("\n📌 Hybrid predictions:")
for r in recs:
    print(f"- {r['job']} | {r['score']:.4f} | Why: {', '.join(r['why'])}")


Saving Dr-Hemanth-S.pdf to Dr-Hemanth-S (1).pdf
✅ Resume text extracted successfully.


📌 Content-based predictions:
- Data Science: 0.5020
- Python Developer: 0.1548
- Testing: 0.1374
- Java Developer: 0.1071
- Blockchain: 0.0987

📌 Collaborative predictions:
- Blockchain: 0.6500
- Java Developer: 0.6500
- Python Developer: 0.6500
- Testing: 0.6500
- Data Science: 0.6125

📌 Hybrid predictions:
- Data Science | 0.5186 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Testing | 0.3424 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Java Developer | 0.3243 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Blockchain | 0.3192 | Why: Skills/keywords matched resume, Similar users applied/searched this job
- Python Developer | 0.2290 | Why: Skills/keywords matched resume, Similar users applied/searched this job
