In [1]:
import sqlite3
import pandas as pd

# Loading CSV data
df = pd.read_csv(r'/content/candidates.csv')

# Renaming columns to match the SQLite schema
df.columns = ['Name', 'Contact_Details', 'Location', 'Job_Skills', 'Experience', 'Projects', 'Comments']

# Connecting to SQLite database
conn = sqlite3.connect('candidates.db')
c = conn.cursor()

# We drop the existing table if it exists`
c.execute('DROP TABLE IF EXISTS Candidates')

# Create a table with the updated schema
c.execute('''CREATE TABLE IF NOT EXISTS Candidates (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             Name TEXT,
             Contact_Details TEXT,
             Location TEXT,
             Job_Skills TEXT,
             Experience TEXT,
             Projects TEXT,
             Comments TEXT
             )''')
# Insert our data into the table
df.to_sql('Candidates', conn, if_exists='append', index=False)

conn.commit()
conn.close()

In [2]:
!pip install faiss-cpu
!pip install sentence_transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0
Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [3]:
import faiss
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Connect to SQLite database and fetch candidate data
conn = sqlite3.connect('candidates.db')
df = pd.read_sql_query("SELECT * FROM Candidates", conn)
conn.close()

# Combine all text fields to create a single text representation for each candidate
df['combined_text'] = df[['Name', 'Contact_Details', 'Location', 'Job_Skills', 'Experience', 'Projects', 'Comments']].astype(str).agg(' '.join, axis=1)

# Generate embeddings for each candidate
candidate_embeddings = model.encode(df['combined_text'].tolist())

# Create a FAISS index
dimension = candidate_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(candidate_embeddings)

# Save the FAISS index and candidate IDs
faiss.write_index(index, 'candidates.index')
df[['id']].to_csv('candidate_ids.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Load the pre-trained model and FAISS index
model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.read_index('candidates.index')
candidate_ids = pd.read_csv('candidate_ids.csv')

def search_candidates(job_description, top_k=5):
    # Generate embedding for the job description
    job_embedding = model.encode([job_description])

    # Search the FAISS index
    distances, indices = index.search(job_embedding, top_k)

    # Get the matching candidate IDs
    matching_ids = candidate_ids.iloc[indices[0]]['id'].values

    return matching_ids

# Example usage
job_description = "Looking for a Machine Learning expert who knows Java."
matching_ids = search_candidates(job_description)
print("Matching candidate IDs:", matching_ids)



Matching candidate IDs: [ 43  35 116  23 107]


In [6]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

In [7]:
import pandas as pd

# Load Alpha dataset
alpha_df = pd.read_csv(r'/content/Resume.csv')



# Display the first few rows of the dataset
alpha_df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [8]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load Alpha dataset
alpha_df = pd.read_csv(r'/content/Resume.csv')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [9]:
# Function to get embeddings
def get_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Get embeddings for all resumes
embeddings = []
for resume in alpha_df['Resume_str']:
    embeddings.append(get_embeddings(resume, tokenizer, model))

# Convert embeddings list to numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a file for later use
np.save('resume_embeddings.npy', embeddings)

print("Embeddings shape:", embeddings.shape)

Embeddings shape: (2484, 768)


In [10]:
import faiss

# Load the embeddings
embeddings = np.load('resume_embeddings.npy')

# Create FAISS index
d = embeddings.shape[1]  # dimension of embeddings
index = faiss.IndexFlatL2(d)

# Add embeddings to the index
index.add(embeddings)

# Save the index to disk
faiss.write_index(index, 'alpha_index.faiss')

print("Index created and saved successfully.")

Index created and saved successfully.


In [11]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load Alpha dataset
alpha_df = pd.read_csv(r'/content/Resume.csv')

# Prepare data for training
# For simplicity, let's assume we are doing binary classification on the 'Category' column
alpha_df = alpha_df[['Resume_str', 'Category']]
alpha_df['labels'] = alpha_df['Category'].factorize()[0]  # Convert categories to numerical labels

# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    alpha_df['Resume_str'].tolist(), alpha_df['labels'].tolist(), test_size=0.2
)

# Load pre-trained tokenizer and model
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(alpha_df['labels'].unique()))

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert to PyTorch dataset
class ResumeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine-tuned-distilbert')
tokenizer.save_pretrained('./fine-tuned-distilbert')

print("Fine-tuning completed and model saved.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,3.1755
20,3.1855
30,3.1633
40,3.1872
50,3.1785
60,3.171
70,3.1669
80,3.1492
90,3.1324
100,3.1182


Fine-tuning completed and model saved.


In [12]:
import faiss
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
model_path = './fine-tuned-distilbert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, output_hidden_states=True)

# Load the FAISS index and the associated embeddings
index = faiss.read_index("alpha_index.faiss")
embeddings = np.load("resume_embeddings.npy")
alpha_df = pd.read_csv(r'/content/Resume.csv')

# Check the dimensions of the index
assert embeddings.shape[1] == index.d, f"Embeddings dimension {embeddings.shape[1]} does not match index dimension {index.d}"

# Function to get embeddings for a query using AutoModelForSequenceClassification
def get_query_embedding(query, tokenizer, model):
    inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    hidden_states = outputs.hidden_states[-1]  # Get the hidden states from the last layer
    return hidden_states.mean(dim=1).detach().numpy()

# Function to search FAISS index
def search_faiss_index(query, k=5):
    query_embedding = get_query_embedding(query, tokenizer, model)
    assert query_embedding.shape[1] == index.d, f"Query embedding dimension {query_embedding.shape[1]} does not match index dimension {index.d}"
    distances, indices = index.search(query_embedding, k)
    results = alpha_df.iloc[indices[0]]
    return results

# Function to predict categories using the fine-tuned model
def predict_categories(resumes, tokenizer, model):
    categories = []
    for resume in resumes:
        inputs = tokenizer(resume, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model(**inputs)
        logits = outputs.logits.detach().numpy()
        predicted_label = np.argmax(logits, axis=1)[0]
        categories.append(predicted_label)
    return categories

# Example usage
query = "I am looking for a data scientist position in healthcare"
relevant_resumes = search_faiss_index(query)
predicted_categories = predict_categories(relevant_resumes['Resume_str'], tokenizer, model)
print(predicted_categories)

[6, 20, 22, 6, 11]


In [13]:
import faiss
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
model_path = './fine-tuned-distilbert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Load the FAISS index and the associated embeddings
index = faiss.read_index("alpha_index.faiss")
embeddings = np.load("resume_embeddings.npy")
alpha_df = pd.read_csv(r'/content/Resume.csv')

# Function to get embeddings for a query
def get_query_embedding(query, tokenizer, model):
    inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model.base_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Function to search FAISS index
def search_faiss_index(query, k=5):
    query_embedding = get_query_embedding(query, tokenizer, model)
    distances, indices = index.search(query_embedding, k)
    results = alpha_df.iloc[indices[0]]
    return results

# Command-line interface for querying
def command_line_interface():
    while True:
        query = input("Enter your query (or type 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        relevant_resumes = search_faiss_index(query)
        print("Relevant resumes:")
        for idx, row in relevant_resumes.iterrows():
            print(f"Resume ID: {row['ID']}, Category: {row['Category']}")
            print(f"Resume Text: {row['Resume_str'][:500]}...")  # Print the first 500 characters of the resume
            print("="*80)

# Run the command-line interface
command_line_interface()

Enter your query (or type 'exit' to quit): I am looking for a data scientist position in healthcare
Relevant resumes:
Resume ID: 23944036, Category: HEALTHCARE
Resume Text:          HEALTHCARE CLINICAL ANALYST           Professional Experience     November 2012   to   September 2015     Company Name    City  ,   State    Healthcare Clinical Analyst         I entered the insurance arena with no insurance experience and I now successfully serve as a leader in the appeals deparement. I  work all lines of business, including commercial, state group and Medicare appeals. In this highly collaborative area, I work  in conjunction with medical directors, medical policy, ph...
Resume ID: 13727873, Category: PUBLIC-RELATIONS
Resume Text:          MARKET ANALYST PROMOTED TO ASSISTANT DIRECTOR OF BRAND STRATEGY       Professional Summary    Dear Cristina and team,
The second I found out about this position, my I found myself extremely excited. I knew right away this is something I have to be a par