<a href="https://colab.research.google.com/github/AroopGit/Interactly-Task-Aroop-Rath/blob/main/Candidate_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymongo transformers datasets torch faiss-cpu scikit-learn

import pandas as pd
import numpy as np
from pymongo import MongoClient
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch



In [2]:
# Load the raw CSV into a DataFrame.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook;
# the candidates used below are read from candidates.db instead -- confirm it is still needed.
df=pd.read_csv('data.csv')

In [3]:
import sqlite3

In [4]:
# Module-level SQLite connection and cursor; fetch_all_candidates() in the next cell
# reads through this shared `cursor`.
# NOTE(review): no visible cell closes `conn`.
conn = sqlite3.connect('candidates.db')
cursor = conn.cursor()

In [5]:
def fetch_all_candidates():
    """Return every row of the `candidates` table as a list of column->value dicts.

    Runs the query on the module-level `cursor`; column names are taken from
    the cursor description of the executed statement.
    """
    cursor.execute("SELECT * FROM candidates")
    field_names = [meta[0] for meta in cursor.description]
    records = []
    for row in cursor.fetchall():
        records.append(dict(zip(field_names, row)))
    return records

In [6]:
# Pull the full candidate table into memory once; later cells build their
# text corpus and training examples from this list.
all_candidates = fetch_all_candidates()
print(f"Loaded {len(all_candidates)} candidates from the database.")

Loaded 120 candidates from the database.


In [7]:
def preprocess_candidate(candidate):
    """Flatten one candidate record into a single space-separated text string.

    The fields are emitted in a fixed order (Name, JobSkills, Experience,
    Projects, Comments). A missing key raises KeyError.
    """
    ordered_fields = ('Name', 'JobSkills', 'Experience', 'Projects', 'Comments')
    # format(value) with no spec is what an f-string placeholder applies.
    return " ".join(format(candidate[field]) for field in ordered_fields)

# One flattened text document per candidate, in the same order as all_candidates.
preprocessed_data = list(map(preprocess_candidate, all_candidates))

In [8]:

# Fit TF-IDF on the candidate documents and keep the sparse matrix.
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(preprocessed_data)

# The FAISS index built in the next cell is fed a dense float32 array.
dense_vectors = tfidf_vectors.astype('float32').toarray()


In [9]:

# Exact (brute-force) L2 index over the TF-IDF vectors; one row per candidate.
_, dimension = dense_vectors.shape
index = faiss.IndexFlatL2(dimension)
index.add(dense_vectors)



In [10]:
# Persist the index to disk.
# NOTE(review): no visible cell reads candidate_index.bin back.
faiss.write_index(index, "candidate_index.bin")

In [11]:

def prepare_data(candidate, label=1):
    """Turn one candidate record into a {"text", "label"} training example.

    Args:
        candidate: candidate record accepted by preprocess_candidate().
        label: class label to attach. Defaults to 1, the value this notebook
            has always used, so existing one-argument calls are unchanged.

    Returns:
        dict with the flattened candidate text and its label.

    NOTE(review): every example the notebook builds uses the default label 1,
    while the classifier is created with num_labels=2. Trained on one class
    only, it can minimise the loss by always predicting label 1. Pass
    label=0 for negative examples once such data exists.
    """
    return {
        "text": preprocess_candidate(candidate),
        "label": label,
    }

# One training example per candidate, in all_candidates order.
train_data = list(map(prepare_data, all_candidates))

In [14]:
# `datasets` is already installed by the pip line in the first cell, so the
# extra unpinned `!pip install datasets` that used to sit here was redundant.
from datasets import Dataset

# Column-oriented view of train_data: one "text" and one "label" column.
dataset = Dataset.from_dict({
    "text": [item["text"] for item in train_data],
    "label": [item["label"] for item in train_data],
})



In [15]:

# Base checkpoint to fine-tune; the classification head is created with two labels.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level `tokenizer`.

    Uses padding="max_length" with truncation enabled, so every example is
    presumably padded/truncated to the tokenizer's maximum length -- confirm
    against the tokenizer configuration.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply tokenize_function over the dataset in batches (batched=True hands it lists of texts).
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [19]:

# Fine-tuning configuration; Trainer output goes to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    # schedule and optimiser
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batch sizes (NOTE(review): no eval dataset is passed to the Trainer in
    # the visible cells, so the eval batch size looks unused -- confirm)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
)

In [20]:
# Wire the model, the training configuration and the tokenized examples together.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

In [21]:
# Run the fine-tuning loop. The recorded output below reports train_runtime of
# about 1392 seconds for this cell.
trainer.train()

Step,Training Loss


Step,Training Loss


TrainOutput(global_step=45, training_loss=0.13548045688205296, metrics={'train_runtime': 1392.395, 'train_samples_per_second': 0.259, 'train_steps_per_second': 0.032, 'total_flos': 47688263516160.0, 'train_loss': 0.13548045688205296, 'epoch': 3.0})

In [22]:
# Save the trained model to the directory that RAGFramework later loads from.
# NOTE(review): the next cell saves the model into the same directory again.
trainer.save_model("fine_tuned_candidate_model")

In [23]:

# Write the model and then the tokenizer into the same directory so both can
# be reloaded from one path.
model_save_path = "fine_tuned_candidate_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

Model and tokenizer saved to fine_tuned_candidate_model


In [24]:

class RAGFramework:
    """Two-stage candidate search: TF-IDF/FAISS retrieval, then classifier re-ranking.

    Stage 1 retrieves the nearest candidates to the job description in TF-IDF
    space using an exact L2 FAISS index. Stage 2 scores each retrieved
    candidate with the fine-tuned sequence classifier and keeps those whose
    probability for label 1 exceeds 0.5.

    NOTE(review): a later cell in this notebook defines another class with the
    same name, which replaces this one once that cell runs.
    """

    def __init__(self):
        # Load the fine-tuned classifier and its tokenizer (once).
        model_dir = "fine_tuned_candidate_model"
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)

        # Kept for backward compatibility with code that reads these
        # attributes; the class itself loads rows via fetch_all_candidates().
        self.conn = sqlite3.connect('candidates.db')
        self.cursor = self.conn.cursor()

        # Snapshot the candidates so that row i of the index always refers to
        # self.all_candidates[i], even if the database changes later.
        self.all_candidates = fetch_all_candidates()
        preprocessed_data = [
            self.preprocess_candidate(candidate) for candidate in self.all_candidates
        ]
        self.vectorizer = TfidfVectorizer()
        dense_vectors = (
            self.vectorizer.fit_transform(preprocessed_data).toarray().astype('float32')
        )

        # search_candidates() needs this index; it was previously never created.
        self.index = faiss.IndexFlatL2(dense_vectors.shape[1])
        self.index.add(dense_vectors)

    def preprocess_candidate(self, candidate):
        """Flatten a candidate record into one space-separated text string."""
        return f"{candidate['Name']} {candidate['JobSkills']} {candidate['Experience']} {candidate['Projects']} {candidate['Comments']}"

    def search_candidates(self, job_description, top_k=5):
        """Return up to `top_k` candidates relevant to `job_description`.

        Args:
            job_description: free-text description of the role.
            top_k: how many nearest neighbours to retrieve before re-ranking.

        Returns:
            List of (candidate, score) tuples sorted by descending score, as
            produced by refine_with_llm(). Empty when nothing can be retrieved.
        """
        # Never ask FAISS for more neighbours than there are indexed rows.
        neighbour_count = min(top_k, len(self.all_candidates))
        if neighbour_count <= 0:
            return []

        job_vector = self.vectorizer.transform([job_description]).toarray().astype('float32')
        _, candidate_indices = self.index.search(job_vector, neighbour_count)

        # FAISS marks missing neighbours with -1; as a list index that would
        # silently select the last candidate, so drop those slots.
        matching_candidates = [
            self.all_candidates[i] for i in candidate_indices[0] if i >= 0
        ]

        return self.refine_with_llm(job_description, matching_candidates)

    def refine_with_llm(self, job_description, candidates):
        """Score candidates with the classifier and keep those scoring above 0.5.

        Args:
            job_description: free-text description of the role.
            candidates: candidate records to score.

        Returns:
            List of (candidate, score) tuples, highest score first, where
            score is the softmax probability of label 1.
        """
        refined_candidates = []
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            for candidate in candidates:
                input_text = f"Job: {job_description}\nCandidate: {self.preprocess_candidate(candidate)}"
                inputs = self.tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
                outputs = self.model(**inputs)
                score = outputs.logits.softmax(dim=1)[0][1].item()
                if score > 0.5:
                    refined_candidates.append((candidate, score))

        return sorted(refined_candidates, key=lambda x: x[1], reverse=True)


In [26]:
import sqlite3
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def fetch_all_candidates():
    """Return every row of the `candidates` table in candidates.db as a list of dicts.

    Opens its own connection for the duration of the call.

    Returns:
        One dict per row, mapping column name to value.

    Raises:
        sqlite3.Error: propagated unchanged if the query fails (for example
            when the table does not exist).
    """
    conn = sqlite3.connect('candidates.db')
    try:
        cursor = conn.execute("SELECT * FROM candidates")
        columns = [description[0] for description in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        # Close even when the query raises, so the handle is never leaked.
        conn.close()

def preprocess_candidate(candidate):
    """Flatten a candidate record into one text string for vectorisation.

    Fields are joined with single spaces in the order Name, JobSkills,
    Experience, Projects, Comments, and the result is stripped. A field that
    is missing or None contributes an empty string, so a NULL database
    column no longer injects the literal token "None" into the text.

    Args:
        candidate: dict-like candidate record.

    Returns:
        The flattened text (may be empty).
    """
    def _field(key):
        value = candidate.get(key)
        return '' if value is None else value

    return f"{_field('Name')} {_field('JobSkills')} {_field('Experience')} {_field('Projects')} {_field('Comments')}".strip()

def preprocess_job_description(job_description):
    """Normalise a job description: non-word runs become single spaces, then lower-case and strip."""
    # Splitting on the separator pattern and re-joining with a space gives the
    # same text as substituting each separator run with one space.
    pieces = re.split(r'\W+', job_description)
    normalised = ' '.join(pieces).lower()
    return normalised.strip()

class RAGFramework:
    """Rank candidates against a job description with TF-IDF, FAISS and a skills filter.

    All candidates are embedded with TF-IDF (English stop words removed),
    L2-normalised and stored in an inner-product FAISS index, so the index
    ranks by cosine similarity. Searches then keep only candidates whose
    JobSkills text is sufficiently similar to the job description.
    """

    def __init__(self, all_candidates):
        """Fit the vectorizer on `all_candidates` and build the search index.

        Args:
            all_candidates: list of candidate dicts; position in this list is
                the row id used by the index.
        """
        self.all_candidates = all_candidates
        preprocessed_data = [preprocess_candidate(candidate) for candidate in all_candidates]
        self.vectorizer = TfidfVectorizer(stop_words='english')
        dense_vectors = (
            self.vectorizer.fit_transform(preprocessed_data).toarray().astype('float32')
        )

        # Inner product over unit-length vectors equals cosine similarity.
        faiss.normalize_L2(dense_vectors)
        self.index = faiss.IndexFlatIP(dense_vectors.shape[1])
        self.index.add(dense_vectors)

    def _skill_scores(self, candidates, preprocessed_description):
        """Cosine similarity between each candidate's JobSkills and the job text."""
        if not candidates:
            return []

        # The job vector is identical for every candidate: compute it once.
        job_tokens = " ".join(sorted(set(preprocessed_description.split())))
        job_skill_vector = self.vectorizer.transform([job_tokens])

        skill_documents = []
        for candidate in candidates:
            # NULL skills become an empty string instead of crashing on
            # .lower(). Commas are replaced by spaces so that "HTML,CSS"
            # yields two tokens rather than the glued "htmlcss".
            raw_skills = str(candidate.get('JobSkills') or '')
            tokens = set(raw_skills.lower().replace(',', ' ').split())
            skill_documents.append(" ".join(sorted(tokens)))

        candidate_skill_vectors = self.vectorizer.transform(skill_documents)
        return cosine_similarity(candidate_skill_vectors, job_skill_vector).ravel()

    def search_candidates(self, job_description, num_candidates=10, thresholds=(0.1, 0.05)):
        """Return up to `num_candidates` candidates matching `job_description`.

        Candidates are visited in FAISS ranking order and kept when their
        skills similarity exceeds a threshold. Thresholds are tried in order;
        the first one that yields `num_candidates` results wins, otherwise the
        result for the last threshold is returned.

        Args:
            job_description: free-text description of the role.
            num_candidates: maximum number of candidates to return.
            thresholds: similarity cut-offs to try, strictest first. The
                default reproduces the original 0.1 then 0.05 fallback.

        Returns:
            List of candidate dicts in ranking order (possibly empty).
        """
        if num_candidates <= 0:
            return []

        preprocessed_description = preprocess_job_description(job_description)
        job_vector = self.vectorizer.transform([preprocessed_description]).toarray().astype('float32')
        faiss.normalize_L2(job_vector)
        _, candidate_indices = self.index.search(job_vector, len(self.all_candidates))

        # FAISS uses -1 for slots it could not fill; never use it as a list index.
        matching_candidates = [self.all_candidates[i] for i in candidate_indices[0] if i >= 0]

        # Score everyone once, then apply each threshold to the same scores.
        similarity_scores = self._skill_scores(matching_candidates, preprocessed_description)

        filtered_candidates = []
        for threshold in thresholds:
            filtered_candidates = [
                candidate
                for candidate, score in zip(matching_candidates, similarity_scores)
                if score > threshold
            ][:num_candidates]
            if len(filtered_candidates) >= num_candidates:
                break

        return filtered_candidates

# Load every candidate once and build the TF-IDF/FAISS framework that the
# search_candidates() helper below queries.
all_candidates = fetch_all_candidates()
rag_framework = RAGFramework(all_candidates)

def search_candidates(job_description):
    """Query the module-level `rag_framework` and print the matching candidates.

    Prints a header with the query and the number of hits, then either a
    "no matches" line or a numbered profile per candidate. Returns None.
    """
    results = rag_framework.search_candidates(job_description)

    print(f"\nMatching Candidates for: {job_description}")
    print(f"Found {len(results)} matching candidates:")

    if not results:
        print("No matching candidates found.")
        return

    for rank, person in enumerate(results, start=1):
        print(f"\n{rank}. Name: {person.get('Name', 'N/A')}")
        print(f"   Contact Details: {person.get('ContactDetails', 'N/A')}")
        print(f"   Location: {person.get('Location', 'N/A')}")
        print(f"   Job Skills: {person.get('JobSkills', 'N/A')}")
        print(f"   Experience: {person.get('Experience', 'N/A')}")
        print(f"   Projects: {person.get('Projects', 'N/A')}")
        print(f"   Comments: {person.get('Comments', 'N/A')}")

# Prompt for a free-text job description and print the matching profiles.
# NOTE(review): input() waits for the user, so an unattended "Run All" stops at this cell.
job_description = input("Enter job description (e.g., 'React developer'): ")
search_candidates(job_description)


Enter job description (e.g., 'React developer'): Pick up the top 10 profiles for the following job description, We are looking for a skilled UI Developer to join our dynamic team. The ideal candidate will have a strong background in front-end development, with proficiency in HTML, CSS, JavaScript, and modern frameworks like React or Angular. Your primary responsibility will be to create visually appealing and user-friendly web interfaces that enhance user experience and align with our brand guidelines

Matching Candidates for: Pick up the top 10 profiles for the following job description, We are looking for a skilled UI Developer to join our dynamic team. The ideal candidate will have a strong background in front-end development, with proficiency in HTML, CSS, JavaScript, and modern frameworks like React or Angular. Your primary responsibility will be to create visually appealing and user-friendly web interfaces that enhance user experience and align with our brand guidelines
Found 10 