In [10]:
import os
import re

import numpy as np
import pandas as pd
from ftfy import fix_text
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [11]:
# Load the NLTK English stop-word list once and keep it as a set, because both
# set_skills() and recommend_jobs() test words against it with `in`.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be
# installed locally (e.g. via nltk.download('stopwords')) — confirm for the
# target environment.
stopw = set(stopwords.words('english'))

In [12]:
# Load dataset.
# The CSV location can be overridden with the JD_DATA_PATH environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original absolute path is used, which keeps existing setups working.
JD_DATA_PATH = os.environ.get(
    'JD_DATA_PATH',
    r'D:/ML_Projects/Job_Reccomendation_System/src/data/jd_structured_data.csv',
)
jd_df = pd.read_csv(JD_DATA_PATH)
jd_df.head()

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD
0,Motion Graphics Designer,3.6,Astra International,Tegal,Jakarta,5001 to 10000 employees,2013,Company - Joint Venture,Automotive,Transportation,"Garuda Indonesia, Blue Bird Group",1569,29187,Join us as a Motion Graphics Designer. Require...
1,Agile Project Manager,3.9,Pertamina,Banjarbaru,Bandung,10001+ employees,1998,Company - Public,Oil and Gas,Energy,"PLN (Perusahaan Listrik Negara), Bumi Resources",8159,96970,We need an Agile Project Manager with skills i...
2,UX Designer,3.1,Sinar Mas Group,Dumai,Medan,1001 to 5000 employees,1998,Company - Cooperative,Conglomerate,Conglomerate,Lippo Group,7219,84394,"We need a UX Designer with skills in Adobe XD,..."
3,Motion Graphics Designer,3.3,Lippo Group,Balikpapan,Jakarta,5001 to 10000 employees,1992,Company - Subsidiary,Conglomerate,Conglomerate,Sinar Mas Group,7474,61052,Join us as a Motion Graphics Designer. Require...
4,Research Scientist,2.9,Djarum,Tangerang Selatan,Medan,1001 to 5000 employees,2008,Company - Limited Liability Partnership (LLP),Tobacco,Consumer Goods,"Sampoerna, Indofood, Gudang Garam",8218,99647,Join us as a Research Scientist. Required skil...


In [13]:
# Module-level holder for the resume skills: set_skills() rebinds it and
# recommend_jobs() reads it. It starts empty, and recommend_jobs() raises
# ValueError while it is empty.
skills = []

In [14]:
def set_skills(resume_skills):
    """Store the skills extracted from a resume in the module-level ``skills``.

    Stop words (entries whose lower-cased form is in ``stopw``) are dropped and
    the remaining entries are joined with single spaces, so ``skills`` ends up
    as a one-element list holding that string.

    Args:
        resume_skills: Iterable of skill strings, a single whitespace-separated
            string, or a falsy value (``None``, ``[]``, ``""``).

    Returns:
        None. The result is written to the global ``skills``:
        ``["skill1 skill2 ..."]`` when at least one entry survives the
        stop-word filter, otherwise ``[]``.
    """
    global skills

    if isinstance(resume_skills, str):
        # Iterating a bare string yields characters, which would both split the
        # text letter by letter and drop single-letter stop words; split it
        # into words instead.
        resume_skills = resume_skills.split()

    kept_words = [word for word in (resume_skills or []) if word.lower() not in stopw]

    # Store [] rather than [''] when nothing survives, so that the emptiness
    # check `not skills` in recommend_jobs() actually triggers.
    skills = [' '.join(kept_words)] if kept_words else []

# Feature Engineering

In [15]:
def ngrams(string, n=3):
    """Normalise ``string`` and return its overlapping character n-grams.

    The text is passed through ``fix_text``, stripped of non-ASCII characters,
    lower-cased, cleaned of a fixed set of punctuation, title-cased, collapsed
    to single spaces and padded with one space on each side before being cut
    into windows of ``n`` characters.

    Args:
        string: Raw text to analyse.
        n: Window length in characters (default 3).

    Returns:
        List of ``n``-character strings in order of appearance; empty when the
        padded text is shorter than ``n``.
    """
    text = fix_text(string)  # fix text
    # Drop anything that cannot be encoded as ASCII, then lower-case.
    text = text.encode("ascii", errors="ignore").decode().lower()

    # Remove brackets, dots, pipes and apostrophes in one regex pass.
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    removal_pattern = '[' + re.escape(''.join(chars_to_remove)) + ']'
    text = re.sub(removal_pattern, '', text)

    # Spell out ampersands; commas and hyphens become word separators.
    text = text.replace('&', 'and').replace(',', ' ').replace('-', ' ')

    text = text.title()  # capitalise the start of each word
    text = re.sub(' +', ' ', text).strip()  # collapse runs of spaces

    # Pad so that the first and last characters also appear in edge n-grams.
    padded = ' ' + text + ' '
    # Final clean-up: removes ',', '-', '.', '/' and whitespace followed by "BD".
    padded = re.sub(r'[,-./]|\sBD', r'', padded)

    # zip over n progressively shifted copies yields every window of n chars.
    shifted_views = (padded[offset:] for offset in range(n))
    return [''.join(window) for window in zip(*shifted_views)]

In [16]:
# Shared TF-IDF vectorizer. `ngrams` is passed as the analyzer, so every
# document is cleaned and split into character n-grams by that function.
# NOTE(review): casing is handled inside ngrams(); lowercase=False presumably
# has no further effect with a callable analyzer — confirm against the
# scikit-learn docs.
# recommend_jobs() re-fits this object on every call.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

# Job Recommender

In [17]:
def getNearestN(queryTFIDF_, nbrs):
    """Query a fitted neighbour model and return ``(distances, indices)``.

    Args:
        queryTFIDF_: Query passed unchanged to ``nbrs.kneighbors``.
        nbrs: Object exposing ``kneighbors(query)`` that returns a pair.

    Returns:
        The two values produced by ``nbrs.kneighbors``, as a tuple.
    """
    neighbor_distances, neighbor_indices = nbrs.kneighbors(queryTFIDF_)
    return neighbor_distances, neighbor_indices

In [18]:
def recommend_jobs(user_location):
    """Recommend jobs in ``user_location`` that are closest to the global ``skills``.

    The global ``vectorizer`` is re-fitted on the skills text, the postings in
    ``jd_df`` whose ``Location`` contains ``user_location`` are transformed
    with it, and the nearest postings (at most five) are returned.

    Args:
        user_location: Location text entered by the user. It is matched as a
            literal, case-insensitive substring of the ``Location`` column
            after stripping surrounding whitespace.

    Returns:
        A DataFrame with the columns ``Match confidence``, ``Job Title``,
        ``Company Name``, ``Location``, ``Industry``, ``Sector`` and
        ``Average Salary``, sorted by ``Match confidence`` ascending.
        ``Match confidence`` is the neighbour distance rounded to two
        decimals, so smaller values mean closer matches. An empty DataFrame
        (no columns) is returned when no posting matches the location.

    Raises:
        ValueError: If ``skills`` is empty or consists only of stop words.
    """
    # `skills` normally holds one space-joined string, so validate the
    # individual words rather than the list elements; otherwise [''] or a
    # string made only of stop words would slip through.
    skill_tokens = ' '.join(skills).split()
    if not skill_tokens or all(token.lower() in stopw for token in skill_tokens):
        raise ValueError("No valid skills extracted. Please check the resume content.")

    # Normalize the user location input for comparison
    normalized_user_location = user_location.strip().lower()

    # Create a single document with skills for TF-IDF vectorization.
    # Side effect: this re-fits the shared module-level vectorizer.
    skills_doc = [' '.join(skills)]
    tfidf = vectorizer.fit_transform(skills_doc)

    # Filter jobs by location. regex=False treats the user's text literally, so
    # input such as "(" or "." cannot raise re.error or over-match; both sides
    # are already lower-cased, which makes the comparison case-insensitive.
    job_locations = jd_df['Location'].str.strip().str.lower()
    location_mask = job_locations.str.contains(normalized_user_location, regex=False, na=False)
    filtered_jobs = jd_df[location_mask]
    if filtered_jobs.empty:
        return pd.DataFrame()

    # Transform job descriptions into the vocabulary learned from the skills
    jd_test = filtered_jobs['Processed_JD'].values.astype('U')
    jd_tfidf = vectorizer.transform(jd_test)

    # Fit NearestNeighbors on job descriptions; never ask for more neighbours
    # than there are candidate postings.
    nbrs = NearestNeighbors(n_neighbors=min(len(filtered_jobs), 5), n_jobs=-1).fit(jd_tfidf)

    # Find nearest neighbors to the skills vector (single query row -> index 0)
    distances, indices = getNearestN(tfidf, nbrs)

    output_columns = ['Job Title', 'Company Name', 'Location', 'Industry', 'Sector', 'Average Salary']
    matches = []
    for distance, position in zip(distances[0], indices[0]):
        job = filtered_jobs.iloc[position]  # positional lookup, done once per match
        match_info = {'Match confidence': round(distance, 2)}
        match_info.update({column: job[column] for column in output_columns})
        matches.append(match_info)

    matches_df = pd.DataFrame(matches)

    return matches_df.drop_duplicates().sort_values('Match confidence').head(10)