In [None]:
!pip install PyPDF2
!pip install transformers

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
from transformers import BertTokenizer, BertModel
import torch
import numpy as np


In [None]:
# Load the job listings table from a CSV file located in the working directory.
job_listings = pd.read_csv('job_details_df.csv')


In [None]:
# Preview the first row to check which columns were loaded.
job_listings.head(1)

Unnamed: 0.1,Unnamed: 0,Job Role,Company,Job_Description,id
0,0,ai ml engineer,accenture in india,project role ai ml engineer\n\nproject role ...,0


In [None]:
# Count the missing values in each column.
job_listings.isnull().sum()

Unnamed: 0         0
Job Role           0
Company            0
Job_Description    0
id                 0
dtype: int64

In [None]:
# Drop every row that contains a missing value, modifying job_listings in place.
# Index labels are not reset; later cells address rows by position (iloc).
job_listings.dropna(inplace=True)

In [None]:
# Show the DataFrame's column names.
job_listings.columns

Index(['Unnamed: 0', 'Job Role', 'Company', 'Job_Description', 'id'], dtype='object')

In [None]:
# Text normalisation shared by the job listings and the user's input
def preprocess_text(text):
    """Return ``text`` lower-cased with punctuation and symbols removed.

    Only alphanumeric and whitespace characters are kept. Removed characters
    are deleted, not replaced, so "scikit-learn" becomes "scikitlearn".

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    kept_chars = (ch for ch in lowered if ch.isspace() or ch.isalnum())
    return ''.join(kept_chars)

In [None]:
# Build one text field per listing: role, description and company joined by
# single spaces, then normalised with preprocess_text. This column is the
# input to both the TF-IDF vectorizer and the BERT embeddings below.
job_listings['combined_text'] = (job_listings['Job Role'] + ' ' +
                                 job_listings['Job_Description'] + ' ' +
                                 job_listings['Company']).apply(preprocess_text)


In [None]:
# Fit the TF-IDF vocabulary on the combined listing text and vectorise
# every listing in the same pass.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),     # unigrams and bigrams
    stop_words='english',   # drop common English stop words
    min_df=2,               # ignore terms that occur in fewer than 2 listings
    max_df=0.85,            # ignore terms that occur in more than 85% of listings
)
job_vectors = vectorizer.fit_transform(job_listings['combined_text'])

In [None]:
import joblib

In [None]:

# Persist the fitted vectorizer so it can be reloaded with joblib.load
# instead of being refitted.
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Persist the TF-IDF matrix of the job listings (sparse matrix); its rows are
# in the same order as the rows of job_listings.
joblib.dump(job_vectors, 'job_vectors.joblib')

['job_vectors.joblib']

In [None]:
# BERT model and tokenizer for semantic similarity.
# Both are notebook-level globals that get_bert_embedding reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract and normalise the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, joined with newlines and normalised with
        ``preprocess_text``. The result is empty when no page yields any
        text (for example a scanned, image-only document).
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe should extraction yield nothing for a page.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; preprocess_text keeps whitespace characters.
    return preprocess_text("\n".join(page_texts))

In [None]:
def get_bert_embedding(text):
    """Embed ``text`` as the mean of its BERT token vectors.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the notebook-level ``model``. Token vectors of the last hidden layer are
    averaged using the attention mask, so padding positions never contribute
    to the result.

    NOTE: embeddings produced by an earlier version of this function, which
    padded to 512 tokens and averaged over the padding as well, are not
    comparable with these; recompute any saved job embeddings.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the pooled embedding (1-D for a single string).
    """
    # padding=True pads only up to the longest sequence in the call, so a
    # single string is not padded at all and the model processes no filler.
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp avoids a division by zero should a sequence contain no real tokens.
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()


In [None]:
# Compute BERT embeddings for job listings once and store them.
# Each listing's combined_text is embedded individually; the resulting vectors
# are collected into one array whose rows follow the row order of job_listings.
job_embeddings = np.array([get_bert_embedding(text) for text in job_listings['combined_text']])


In [None]:
# Persist the embedding array to job_embeddings.npy so it can be reloaded
# with np.load instead of being recomputed.
np.save('job_embeddings.npy', job_embeddings)

In [None]:
# Function to get recommendations based on user input
def get_recommendations(user_input=None, pdf_path=None, top_n=5, tfidf_weight=0.5):
    """Rank the job listings against a resume PDF or a free-text query.

    Two similarity scores are computed per listing, cosine similarity of
    TF-IDF vectors and cosine similarity of BERT embeddings, and blended as
    ``tfidf_weight * tfidf + (1 - tfidf_weight) * bert``.

    Relies on the notebook-level ``vectorizer``, ``job_vectors``,
    ``job_embeddings`` and ``job_listings``; the rows of the two matrices are
    expected to be in the same positional order as ``job_listings``.

    Args:
        user_input: Free-text description (role, skills, experience). Used
            only when ``pdf_path`` is not given.
        pdf_path: Path to a resume PDF. Takes precedence over ``user_input``.
        top_n: Number of listings to return.
        tfidf_weight: Weight of the TF-IDF score in the blend, between 0 and 1.
            The default of 0.5 weighs both scores equally.

    Returns:
        A list of up to ``top_n`` rows of ``job_listings`` (pandas Series),
        ordered from most to least similar.

    Raises:
        ValueError: If neither input is provided, if ``tfidf_weight`` is
            outside [0, 1], or if the input contains no usable text after
            normalisation.
    """
    if not 0.0 <= tfidf_weight <= 1.0:
        raise ValueError("tfidf_weight must be between 0 and 1")

    if pdf_path:
        resume_text = extract_text_from_pdf(pdf_path)
    elif user_input:
        resume_text = preprocess_text(user_input)
    else:
        raise ValueError("Either user_input or pdf_path must be provided")

    # Without any text both similarities are meaningless and the ranking would
    # be arbitrary (e.g. an image-only PDF or punctuation-only input).
    if not resume_text.strip():
        raise ValueError("No usable text found in the provided input")

    # TF-IDF similarity
    user_vector = vectorizer.transform([resume_text])
    tfidf_similarity_scores = cosine_similarity(user_vector, job_vectors).flatten()

    # BERT similarity
    user_embedding = get_bert_embedding(resume_text)
    bert_similarity_scores = cosine_similarity([user_embedding], job_embeddings).flatten()

    # Combine similarity scores (weighted average)
    combined_scores = (tfidf_weight * tfidf_similarity_scores
                       + (1 - tfidf_weight) * bert_similarity_scores)

    # Row positions ordered by descending score; sorted() is stable, so tied
    # listings keep their original relative order.
    ranked_positions = sorted(range(len(combined_scores)),
                              key=lambda position: combined_scores[position],
                              reverse=True)
    return [job_listings.iloc[position] for position in ranked_positions[:top_n]]


In [None]:
# Example usage: print the top 5 recommendations for a free-text query.
# To rank against a resume instead, pass its location via the pdf_path
# argument of get_recommendations rather than user_input.
user_input = input('Enter job role, skills, experience: ')
recommended_jobs = get_recommendations(user_input=user_input, top_n=5)

# Each recommendation is one row of job_listings; print its key fields.
for job in recommended_jobs:
    print(f"\nCompany Name: {job['Company']}")
    print(f"Job Title: {job['Job Role']}")
    print(f"Job Description: {job['Job_Description']}")

Enter Job role , Skill , ExperianceMachine Learning 

Company Name: vitrana
Job Title: machine learning engineer
Job Description: responsibilities

study and transform data science prototypesresearch and implement appropriate ml algorithms and toolsrun machine learning tests and experimentsperform statistical analysis and finetuning using test resultsextend existing ml libraries and frameworksselect appropriate datasets and data representation methods  requirementsbsc in computer science mathematics or similar field masters degree is a plus24 yrs experience as a machine learning engineer or similar roledeep knowledge of math probability statistics and algorithmsfamiliarity with machine learning frameworks like keras or tensorflow and libraries like scikitlearnability to write robust code in python

Company Name: fusion plus solutions inc
Job Title: machine learning python
Job Description: no of years experience 58 years detailed job description  skill set  programmingscripting  python 