### Importing Modules

In [1]:
import json
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def load_json(i):
    """Read and decode one resume JSON file.

    Args:
        i: Stem of the file to read; the path opened is
            ``./str_data_resumes/{i}.json`` relative to the working directory.

    Returns:
        The decoded JSON content of that file.
    """
    resume_path = f"./str_data_resumes/{i}.json"
    with open(resume_path) as handle:
        return json.load(handle)


In [36]:
def extract_work_descriptions(data):
    """Collect the usable ``work_description`` strings from one resume.

    Looks under ``data["details"]["job_history"]`` and returns the
    descriptions in job order.

    Malformed input is tolerated instead of raising: a missing or null
    ``details`` / ``job_history``, or a job entry that is not a dict, simply
    contributes nothing. Descriptions that are not strings, or that contain
    only whitespace, are skipped because they cannot be embedded or tokenised
    by the later steps of this notebook.

    Args:
        data: Decoded resume JSON (normally a dict).

    Returns:
        A list of description strings, unmodified; empty when none are found.
    """
    if not isinstance(data, dict):
        return []

    details = data.get("details")
    if not isinstance(details, dict):
        return []

    job_history = details.get("job_history")
    if not isinstance(job_history, (list, tuple)):
        return []

    work_descriptions = []
    for job in job_history:
        if not isinstance(job, dict):
            continue
        description = job.get("work_description")
        # Keep only real text; None / "" / "   " would break or pollute the
        # embedding and BM25 steps downstream.
        if isinstance(description, str) and description.strip():
            work_descriptions.append(description)
    return work_descriptions

In [2]:
# Initialize sentence transformer model
# Instantiated once at module level; `get_embedding` below reads this global
# `model` every time it is called.
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

def get_embedding(text):
    """Encode ``text`` with the module-level sentence-transformer ``model``.

    Args:
        text: The input handed straight to ``model.encode``; the embedding
            loop in this notebook passes one work description at a time.

    Returns:
        Whatever ``model.encode(text)`` returns (the caller converts it with
        ``.tolist()`` before writing it out).
    """
    return model.encode(text)

# Store all embeddings in a dictionary
# NOTE(review): nothing in the visible notebook reads or writes this dict; the
# embedding loop persists records to a JSONL file instead. Confirm it is unused
# before removing it.
embeddings_dict = {}

### Creating Embeddings

In [None]:
# File path to store embeddings incrementally (one JSON record per line)
embeddings_file = "embeddings.jsonl"

# Resume support: collect the file indices that already have at least one
# record in the output file so they are not embedded a second time.
processed_files = set()
if os.path.exists(embeddings_file):
    with open(embeddings_file, "r") as infile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines in the existing file
            record = json.loads(line)
            processed_files.add(record['file_index'])

# Resume point. This is the COUNT of distinct indices already stored, not the
# last index: it can never be larger than (highest stored index + 1), so no
# unvisited file is skipped, and any already-stored index at or above it is
# skipped by the membership check inside the loop.
start_index = len(processed_files)

# Loop through the remaining files up to 999.json and show a progress bar
with open(embeddings_file, "a") as outfile:  # Open in append mode
    for i in tqdm(range(start_index, 1000), initial=start_index, total=1000, desc="Processing Resumes"):
        if i in processed_files:
            continue  # Skip already processed files

        try:
            # Load the resume data and pull out its work descriptions
            resume = load_json(i)
            work_descriptions_list = extract_work_descriptions(resume)

            # Build every record for this resume BEFORE writing anything.
            # Resume treats a file as done as soon as one record exists, so
            # writing description-by-description would let an error or an
            # interrupt mid-resume leave a partial file that is never completed.
            pending_lines = []
            for desc_index, desc in enumerate(work_descriptions_list):
                embedding = get_embedding(desc)

                record = {
                    "file_index": i,
                    "desc_index": desc_index,
                    "work_description": desc,
                    "embedding": embedding.tolist()  # Convert embedding to list for JSON
                }
                pending_lines.append(json.dumps(record) + "\n")

            if pending_lines:
                # Single write + flush so the whole resume reaches the file together
                outfile.write("".join(pending_lines))
                outfile.flush()

        except Exception as e:
            # Best effort: report the failing file and carry on with the next one
            print(f"Error processing file {i}.json: {e}")

print(f"Embeddings saved to {embeddings_file}.")

### Testing Semantic Search

In [40]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import heapq

# Initialize the model
# `query_system` below reads this module-level `model` to embed the user query.
# It uses the same model name as the cell that produced the stored embeddings.
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

def load_embeddings(file_path):
    """Read a JSONL embeddings file into a list of record dicts.

    Each line of the file is parsed as one JSON object.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one dict per line, holding ``file_index``, ``desc_index``,
        ``work_description`` and ``embedding`` (the stored list converted back
        to a NumPy array). Any other keys in the line are dropped.
    """
    with open(file_path, "r") as infile:
        return [
            {
                "file_index": row["file_index"],
                "desc_index": row["desc_index"],
                "work_description": row["work_description"],
                "embedding": np.array(row["embedding"]),  # list -> NumPy array
            }
            for row in map(json.loads, infile)
        ]

# Load all embeddings
# Every stored record is read into memory here; `query_system` iterates over
# this module-level `embeddings_data` list on each call.
embeddings_file = "embeddings.jsonl"
embeddings_data = load_embeddings(embeddings_file)

def query_system(user_query, top_n=100):
    """Rank resume files by how many of their descriptions match a query.

    The query is embedded with the module-level ``model`` and compared, by
    cosine similarity, against every record in the module-level
    ``embeddings_data``. The ``top_n`` most similar descriptions are kept and
    grouped by ``file_index``; a report of the groups is printed.

    Args:
        user_query: Free-text query.
        top_n: Number of best-matching descriptions to keep.

    Returns:
        A list of ``(file_index, count)`` pairs, most frequent first, as
        produced by ``Counter.most_common()``.
    """
    query_embedding = model.encode(user_query)

    # One (similarity, record) pair per stored description
    scored_entries = [
        (
            cosine_similarity(
                query_embedding.reshape(1, -1),
                candidate["embedding"].reshape(1, -1),
            )[0][0],
            candidate,
        )
        for candidate in embeddings_data
    ]

    top_results = heapq.nlargest(top_n, scored_entries, key=lambda pair: pair[0])

    # Group the hits per file in a single pass, preserving similarity order
    hits_by_file = {}
    for _, hit in top_results:
        hits_by_file.setdefault(hit["file_index"], []).append(hit)

    sorted_file_indices = Counter(hit["file_index"] for _, hit in top_results).most_common()

    print("\nTop matching file indices:")
    for file_index, count in sorted_file_indices:
        print(f"\nFile index {file_index} appeared {count} times.")
        for hit in hits_by_file[file_index]:
            print(f"- Description: {hit['work_description']}")

    return sorted_file_indices

In [None]:
# Example usage:
# `query_system` prints its report and returns (file_index, count) pairs.
user_query = "people having software development experience"
top_results = query_system(user_query)

# Bare expression so the notebook displays the returned pairs as the cell output.
top_results

### Hybrid Search

In [49]:
import json
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import heapq
import nltk

In [None]:
# `nltk.word_tokenize` (used by the BM25 helpers below) needs the Punkt
# tokenizer data. Older NLTK releases load it from the 'punkt' package, while
# newer releases look for 'punkt_tab' and raise LookupError without it, so
# request both. With its default arguments `nltk.download` reports a package it
# cannot fetch and returns False instead of raising, so the extra request is
# harmless on versions that do not know 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

In [51]:

def load_embeddings(file_path):
    """Load the stored embedding records from a JSONL file.

    Args:
        file_path: Path of the JSONL file; each line is parsed as one JSON
            object.

    Returns:
        A list of dicts, one per line, with the keys ``file_index``,
        ``desc_index``, ``work_description`` and ``embedding``. The embedding
        is converted from the stored list to a NumPy array; any other keys
        present in the line are not copied.
    """
    copied_keys = ("file_index", "desc_index", "work_description")
    records = []
    with open(file_path, "r") as infile:
        for raw_line in infile:
            payload = json.loads(raw_line)
            item = {key: payload[key] for key in copied_keys}
            item["embedding"] = np.array(payload["embedding"])  # list -> NumPy array
            records.append(item)
    return records

In [52]:

def load_bm25_corpus(embeddings_data):
    """Tokenise every stored work description for BM25.

    Args:
        embeddings_data: Records that each carry a ``work_description`` string.

    Returns:
        A list of token lists, one per record and in the same order, produced
        by lower-casing each description and passing it to
        ``nltk.word_tokenize``.
    """
    return [
        nltk.word_tokenize(record['work_description'].lower())
        for record in embeddings_data
    ]

In [53]:
def bm25_search(bm25, query, corpus, top_n=100):
    """Return the ``top_n`` best BM25 matches for ``query``.

    Args:
        bm25: Object exposing ``get_scores(tokens)``, which returns one score
            per document.
        query: Free-text query; it is lower-cased and tokenised with
            ``nltk.word_tokenize``.
        corpus: Tokenised documents, indexed the same way as the scores.
        top_n: Maximum number of matches to return.

    Returns:
        A list of ``(document_tokens, score, document_index)`` tuples, highest
        score first. Empty when ``top_n`` is zero or negative.
    """
    # Guard: a slice of [-0:] would return EVERY document, and a negative
    # top_n would slice from the wrong end, so handle both explicitly.
    if top_n <= 0:
        return []

    query_tokens = nltk.word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending: take the last top_n indices, then reverse them
    top_indices = np.argsort(scores)[-top_n:][::-1]
    return [(corpus[i], scores[i], i) for i in top_indices]


In [54]:
def cosine_similarity_search(user_query_embedding, embeddings_data, top_n=100):
    """Return the records most similar to a query embedding.

    Args:
        user_query_embedding: Query vector; reshaped to a single row before
            each comparison.
        embeddings_data: Records that each carry an ``embedding`` array.
        top_n: Number of best matches to keep.

    Returns:
        A list of ``(similarity, record)`` tuples, highest cosine similarity
        first, at most ``top_n`` long.
    """
    def _similarity_to_query(candidate):
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here
        pairwise = cosine_similarity(
            user_query_embedding.reshape(1, -1),
            candidate["embedding"].reshape(1, -1),
        )
        return pairwise[0][0]

    scored = [(_similarity_to_query(candidate), candidate) for candidate in embeddings_data]
    return heapq.nlargest(top_n, scored, key=lambda pair: pair[0])

In [55]:
def normalize(scores):
    """Min-max scale ``scores`` into the range [0, 1].

    Args:
        scores: Iterable of numeric scores.

    Returns:
        A list the same length as ``scores`` where the minimum maps to 0.0 and
        the maximum to 1.0. An empty input gives an empty list. When every
        score is identical there is no spread to scale by, so all values map
        to 0.0 rather than dividing by zero.
    """
    scores = list(scores)
    if not scores:
        return []

    min_val = min(scores)
    max_val = max(scores)
    span = max_val - min_val
    if span == 0:
        # Constant input (e.g. no BM25 term matched anywhere): avoid 0/0
        return [0.0] * len(scores)
    return [(score - min_val) / span for score in scores]

In [70]:
def hybrid_search(user_query, embeddings_data, corpus, bm25, top_n=100, bm25_weight=0.4, cosine_weight=0.6):
    """Rank stored descriptions by a weighted mix of BM25 and cosine scores.

    Both scores are computed for every document and combined per document:
    ``bm25_weight * normalized_bm25 + cosine_weight * normalized_cosine``.
    The ``top_n`` documents with the highest combined score are selected, then
    ordered by their own cosine similarity for display, and printed.

    ``embeddings_data``, ``corpus`` and ``bm25`` must be index-aligned:
    position ``k`` must refer to the same document in all three.

    Args:
        user_query: Free-text query; embedded with the module-level ``model``.
        embeddings_data: Records with ``file_index``, ``work_description`` and
            ``embedding``.
        corpus: Tokenised documents that ``bm25`` was built from.
        bm25: BM25 index passed through to ``bm25_search``.
        top_n: Maximum number of results to return.
        bm25_weight: Weight of the normalized BM25 score.
        cosine_weight: Weight of the normalized cosine similarity.

    Returns:
        A list of ``(combined_score, cosine_similarity, entry)`` tuples sorted
        by cosine similarity, descending. It holds at most ``top_n`` items and
        is empty when there are no documents or ``top_n`` is not positive.
    """
    def _scaled(raw_scores):
        # Min-max scaling has no spread to divide by when every score is
        # identical (e.g. no query term matched any document), so map such
        # lists to zeros instead of calling normalize().
        if max(raw_scores) == min(raw_scores):
            return [0.0] * len(raw_scores)
        return normalize(raw_scores)

    n_docs = len(embeddings_data)
    combined_results = []

    if n_docs > 0 and top_n > 0:
        # Generate embedding for user query
        query_embedding = model.encode(user_query)

        # Score EVERY document with both methods so the two scores can be
        # paired by document index rather than by position in two
        # independently ranked lists.
        bm25_by_doc = [0.0] * n_docs
        for _, score, doc_index in bm25_search(bm25, user_query, corpus, n_docs):
            bm25_by_doc[doc_index] = score

        # cosine_similarity_search hands back the record objects themselves,
        # so object identity maps each one to its position in embeddings_data.
        position_of = {id(entry): idx for idx, entry in enumerate(embeddings_data)}
        cosine_by_doc = [0.0] * n_docs
        for similarity, entry in cosine_similarity_search(query_embedding, embeddings_data, n_docs):
            cosine_by_doc[position_of[id(entry)]] = similarity

        normalized_bm25_scores = _scaled(bm25_by_doc)
        normalized_cosine_scores = _scaled(cosine_by_doc)

        # Combine BM25 and cosine similarity scores for the same document
        scored = [
            (
                bm25_weight * normalized_bm25_scores[idx] + cosine_weight * normalized_cosine_scores[idx],
                cosine_by_doc[idx],
                embeddings_data[idx],
            )
            for idx in range(n_docs)
        ]

        # Select by combined score; slicing copes with fewer than top_n documents
        scored.sort(key=lambda item: item[0], reverse=True)
        combined_results = scored[:top_n]

        # Present the selection by cosine similarity (second item in tuple), descending
        combined_results.sort(key=lambda x: x[1], reverse=True)

    # Display top results based on maximum cosine similarity score
    print("\nTop matching results based on Cosine Similarity:")
    for score, cosine_sim, entry in combined_results:
        print(f"\nCosine Similarity: {cosine_sim:.4f}, Combined Score: {score:.4f}")
        print(f"File Index: {entry['file_index']}")
        print(f"- Description: {entry['work_description']}")

    return combined_results  # Return combined results for further processing if needed

In [None]:
# Load embeddings data and prepare BM25 corpus
# `load_bm25_corpus` emits one token list per record, in order, so position k
# in `corpus` refers to the same description as position k in `embeddings_data`.
embeddings_file = "embeddings.jsonl"
embeddings_data = load_embeddings(embeddings_file)
corpus = load_bm25_corpus(embeddings_data)
    
# Initialize BM25 with the corpus
bm25 = BM25Okapi(corpus)
    
# User query
user_query = "Find me a professor"
    
# Perform hybrid search
# `hybrid_search` prints the matches and returns them as a list of tuples.
hybrid_results = hybrid_search(user_query, embeddings_data, corpus, bm25, top_n=100)