In [1]:
import os
import re

import numpy as np
import pandas as pd
from ftfy import fix_text
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# English stop words from the NLTK corpus, held in a set so the membership
# tests done while filtering skills are constant-time.
stopw = {*stopwords.words('english')}

In [3]:
# Load dataset.
# The CSV location can be overridden through the JD_DATA_PATH environment
# variable so the notebook also runs on machines where the original absolute
# path does not exist; without the variable the original path is used.
JD_DATA_PATH = os.environ.get(
    'JD_DATA_PATH',
    r'D:/ML_Projects/Job_Reccomendation_System/src/data/jd_structured_data.csv',
)
jd_df = pd.read_csv(JD_DATA_PATH)
jd_df.head()

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD
0,Data Analyst,4.7,Matahari Department Store,Palopo,Mataram,1001 to 5000 employees,2002,Company - Public,Information Technology,Technology,-1.0,63.9,53.2,"Seeking a Data Analyst proficient in SQL, Data..."
1,Web Application Developer,3.1,Bukalapak,Pagar Alam,Lubuklinggau,5001 to 10000 employees,2006,Company - Public,E-commerce,Technology,,137.0,55.3,We are looking for a Web Application Developer...
2,Reinforcement Learning Specialist,3.7,Kalbe Farma,Jakarta,Jambi,1001 to 5000 employees,2009,Company - Private,Software,Business Services,-1.0,124.3,48.7,Join our team as a Reinforcement Learning Spec...
3,Business Intelligence Analyst,4.6,Bumi Resources,Sungai Penuh,Bukittinggi,10001+ employees,1999,Company - Private,Information Technology,Banking,-1.0,138.6,52.6,We need a Business Intelligence Analyst with s...
4,UI/UX Developer,4.3,Tokopedia,Sungai Penuh,Batu,1001 to 5000 employees,2004,Company - Public,Information Technology,Business Services,,55.1,86.0,Join us as a UI/UX Developer. Required skills:...


In [4]:
# Module-level holder for the current resume's skills. It starts empty, is
# rebound by set_skills() and is read by recommend_jobs().
skills = list()

In [5]:
def set_skills(resume_skills):
    """Store the skills extracted from a resume in the global ``skills``.

    Entries whose lowercase form is in ``stopw`` are dropped; the survivors
    are joined with single spaces and stored as a one-element list.

    Args:
        resume_skills: Iterable of skill strings. A falsy value (``None``,
            empty list, ...) clears the stored skills.

    Returns:
        None. The result is written to the module-level ``skills``.
    """
    global skills
    if not resume_skills:
        skills = []
        return

    kept = ' '.join(word for word in resume_skills if word.lower() not in stopw)
    # If every entry was a stop word the join yields ''. Storing [''] would be
    # a non-empty list, so recommend_jobs' "no skills" guard (which relies on
    # an empty list) would be bypassed; store [] instead.
    skills = [kept] if kept.strip() else []

# Feature Engineering

In [6]:
def ngrams(string, n=3):
    """Normalise ``string`` and split it into overlapping character n-grams.

    Used as the ``analyzer`` of the TF-IDF vectorizer, so it receives one
    document at a time.

    Args:
        string: Text to analyse.
        n: Length of each character n-gram (default 3).

    Returns:
        List of ``n``-character substrings of the normalised, space-padded
        text, in order of appearance. Empty when the padded text is shorter
        than ``n`` or when ``n`` is not positive.
    """
    cleaned = fix_text(string)  # ftfy text repair
    # Drop every non-ASCII character, then lowercase.
    cleaned = cleaned.encode("ascii", errors="ignore").decode().lower()

    # Brackets, dots, pipes and apostrophes are deleted outright.
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    cleaned = re.sub(rx, '', cleaned)

    # '&' is spelled out; commas and dashes become word separators.
    cleaned = cleaned.replace('&', 'and').replace(',', ' ').replace('-', ' ')

    cleaned = cleaned.title()  # capitalise the start of each word
    cleaned = re.sub(' +', ' ', cleaned).strip()  # collapse runs of spaces

    # Pad with one space on each side so word boundaries form n-grams too.
    padded = ' ' + cleaned + ' '
    # Commas, dashes and dots were already handled above, so in practice this
    # pass strips '/'.
    # NOTE(review): the `\sBD` alternative looks unreachable after
    # title-casing - confirm before removing.
    padded = re.sub(r'[,-./]|\sBD', r'', padded)

    # Zip the text with itself shifted by 0..n-1 to get a sliding window.
    shifted = [padded[offset:] for offset in range(n)]
    return [''.join(window) for window in zip(*shifted)]

In [8]:
# TF-IDF over character n-grams: `ngrams` is the analyzer, so each document
# is normalised and tokenised by that function.
vectorizer = TfidfVectorizer(
    analyzer=ngrams,
    lowercase=False,
    min_df=1,
)

# Job Recommender

In [9]:
def getNearestN(queryTFIDF_, nbrs):
    """Query a fitted neighbour index for the entries closest to the query.

    Args:
        queryTFIDF_: Query vector(s), passed unchanged to ``nbrs.kneighbors``.
        nbrs: Object exposing ``kneighbors``; in this notebook it is a fitted
            ``NearestNeighbors`` model.

    Returns:
        Tuple ``(distances, indices)`` as produced by ``nbrs.kneighbors``.
    """
    neighbour_distances, neighbour_indices = nbrs.kneighbors(queryTFIDF_)
    return (neighbour_distances, neighbour_indices)

In [10]:
def recommend_jobs(user_location):
    """Recommend the jobs in ``jd_df`` that best match the global ``skills``.

    The global ``vectorizer`` is (re)fitted on the skills document only, the
    job descriptions of the location-filtered postings are projected onto
    that vocabulary, and the postings nearest to the skills vector are
    returned.

    Args:
        user_location: Text that must occur in a posting's ``Location``.
            Matched literally (not as a regular expression) and
            case-insensitively; postings without a location never match.

    Returns:
        DataFrame with the columns ``Match confidence``, ``Job Title``,
        ``Company Name``, ``Location``, ``Industry``, ``Sector`` and
        ``Average Salary``, holding at most five distinct matches.
        ``Match confidence`` is the neighbour distance rounded to two
        decimals and the rows are sorted by it in ascending order, so
        smaller values come first. An empty DataFrame without columns is
        returned when no posting matches the location.

    Raises:
        ValueError: If ``skills`` is empty or consists only of stop words.
        TypeError: If ``user_location`` is not a string.
    """
    # Judge the skills token by token: the stored value may be one joined
    # string (possibly empty) rather than separate words, so testing whole
    # entries against the stop-word set would never trigger.
    skill_tokens = ' '.join(skills).split()
    if not skill_tokens or all(token.lower() in stopw for token in skill_tokens):
        raise ValueError("No valid skills extracted. Please check the resume content.")

    # Create a single document with skills for TF-IDF vectorization.
    # Side effect: this refits the shared module-level vectorizer.
    skills_doc = [' '.join(skills)]
    tfidf = vectorizer.fit_transform(skills_doc)

    if not isinstance(user_location, str):
        raise TypeError("user_location must be a string")

    # Filter jobs by location. regex=False keeps user input such as "(" or
    # "." from being interpreted as a regular expression.
    location_mask = jd_df['Location'].str.contains(
        user_location, case=False, na=False, regex=False
    )
    filtered_jobs = jd_df[location_mask]
    if filtered_jobs.empty:
        return pd.DataFrame()

    # Transform job descriptions with the vocabulary learned from the skills.
    jd_test = filtered_jobs['Processed_JD'].values.astype('U')
    jd_tfidf = vectorizer.transform(jd_test)

    # Fit NearestNeighbors on job descriptions (never ask for more
    # neighbours than there are candidate postings).
    nbrs = NearestNeighbors(n_neighbors=min(len(filtered_jobs), 5), n_jobs=-1).fit(jd_tfidf)

    # Find nearest neighbors to the skills vector.
    distances, indices = getNearestN(tfidf, nbrs)

    # Only one query row exists, hence the [0]. Select all matched postings
    # in one positional lookup instead of one lookup per field per row.
    report_columns = ['Job Title', 'Company Name', 'Location',
                      'Industry', 'Sector', 'Average Salary']
    matches_df = filtered_jobs.iloc[indices[0]][report_columns].reset_index(drop=True)
    matches_df.insert(0, 'Match confidence', np.round(distances[0], 2))

    return matches_df.drop_duplicates().sort_values('Match confidence').head(10)