<a href="https://colab.research.google.com/github/ChongyeWang/job/blob/main/Job_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# DATA CLEANING

import pandas as pd
import numpy as np
import re
from datetime import datetime

# Load dataset (update the filename if necessary)
df = pd.read_csv("drive/MyDrive/short_cleaned_postings.csv")

# 1. Remove duplicates
df.drop_duplicates(inplace=True)

# 2. Handle missing values
# Fill missing skill descriptions with job title + description if available
df["skills_desc"].fillna(df["title"].astype(str) + " " + df["description"].astype(str), inplace=True)
df.dropna(subset=["skills_desc"], inplace=True)  # Drop rows where skills_desc is still missing

# 3. Convert salary to numerical
def convert_salary(row):
    if row["pay_period"] == "HOURLY":
        return row["max_salary"] * 2080  # Convert hourly wage to annual
    return row["max_salary"]  # Assume yearly salary is already in the right format

df["normalized_salary"] = df.apply(lambda row: convert_salary(row) if not pd.isnull(row["max_salary"]) else np.nan, axis=1)

# 4. Convert dates from scientific notation to readable format
def convert_timestamp(ts):
    try:
        return datetime.utcfromtimestamp(float(ts) / 1e3).strftime('%Y-%m-%d')
    except:
        return np.nan

df["listed_time"] = df["listed_time"].apply(convert_timestamp)

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"[^a-z0-9 ]", "", text)  # Remove special characters
        return text
    return ""

# 5. Normalize text fields
df["title"] = df["title"].apply(clean_text)
df["description"] = df["description"].apply(clean_text)
df["skills_desc"] = df["skills_desc"].apply(clean_text)

# Save cleaned dataset
df.to_csv("short_cleaned_postings_1.csv", index=False)
print("Dataset cleaned and saved as short_cleaned_postings_1.csv")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["skills_desc"].fillna(df["title"].astype(str) + " " + df["description"].astype(str), inplace=True)


Dataset cleaned and saved as short_cleaned_postings_1.csv


In [None]:
# to fill the missing values in max_salary
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("drive/MyDrive/short_cleaned_postings.csv")

# Ensure max_salary is numeric
df['max_salary'] = pd.to_numeric(df['max_salary'], errors='coerce')

# Get the non-null salary values
existing_salaries = df['max_salary'].dropna().values

# Number of missing values
num_missing = df['max_salary'].isnull().sum()

# Randomly sample values to fill the missing ones
random_salaries = np.random.choice(existing_salaries, size=num_missing)

# Fill missing values
df.loc[df['max_salary'].isnull(), 'max_salary'] = random_salaries

# Optional: Save the updated max_salary column only
df[['max_salary']].to_csv("updated_max_salary_only.csv", index=False)

# Or if you want the full dataset updated
df.to_csv("short_cleaned_postings_filled.csv", index=False)

print("✅ Missing 'max_salary' values filled randomly and saved.")


✅ Missing 'max_salary' values filled randomly and saved.


# The Actual Recommendation Model - Sentence Transformer

In [5]:
import pandas as pd
# import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer, util
! pip install pdf2docx  # this is a library which is used to convert the pdf into word




# Load dataset
df = pd.read_csv("/content/drive/MyDrive/short_cleaned_postings.csv")

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')  # small and fast

# Encode job descriptions
job_embeddings = model.encode(df['description'].tolist(), convert_to_tensor=True)


# converting the pdf to docx
from pdf2docx import Converter

pdf_file = '/content/drive/MyDrive/Resume.pdf'
docx_file = 'Resume.docx'

cv = Converter(pdf_file)
cv.convert(docx_file)  # Entire document
cv.close()

''' Load and process resume from PDF
with fitz.open("Resume.pdf") as doc:
    resume_text = ""
    for page in doc:
        resume_text += page.get_text()'''
from docx import Document

doc = Document('Resume.docx')
resume_text = ""
for para in doc.paragraphs:
    resume_text += para.text + " "


resume_embedding = model.encode(resume_text, convert_to_tensor=True)

# Compute similarity
cosine_scores = util.cos_sim(resume_embedding, job_embeddings)[0].cpu().numpy()
df['similarity_score'] = cosine_scores

# Top 5 recommendations
top_matches = df.sort_values(by='similarity_score', ascending=False).head(5)
print(top_matches[['title', 'company_name', 'work_type', 'location', 'zip_code',
                   'max_salary', 'formatted_work_type', 'skills_desc',
                   'similarity_score', 'description']])


                             title        company_name  work_type  \
36     software support specialist  Recruitment Design  FULL_TIME   
486              frontend engineer              Prepto  FULL_TIME   
354  engineers  marketing  various         GlobalStake  FULL_TIME   
607   junior engineer  full remote           Joinrs US  FULL_TIME   
428          lead dotnet developer                   0  FULL_TIME   

            location  zip_code  max_salary formatted_work_type  \
36        McLean, VA   22101.0    65000.00           Full-time   
486  Santa Clara, CA   95050.0   125000.00           Full-time   
354    United States       NaN       16.00           Full-time   
607    United States       NaN    90000.00           Full-time   
428      Atlanta, GA   30303.0       20.98           Full-time   

                                           skills_desc  similarity_score  \
36   software support specialist are you driven by ...          0.588516   
486  frontend engineer the ideal can