#### Load libraries

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from sentence_transformers import CrossEncoder
from pprint import pprint
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch



#### Load dataset

In [3]:
file_path = "/mnt/c/Users/USER/Documents/Potential_Talent/potential-talents - Aspiring human resources - seeking human resources.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


#### Drop fit column

In [4]:
df = df.drop(columns = ['fit'])
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


#### check for semantic duplicated rows

In [5]:
# Check for semantic duplicates based on 'title' and 'location'
duplicated_rows = df[df.duplicated(subset=['job_title', 'location', 'connection'], keep=False)]

if not duplicated_rows.empty:
    print(f"Number of duplicated rows: {len(duplicated_rows)}")
    print("Duplicated rows are:")
    print(duplicated_rows)
else:
    print("There is no duplicate rows.")

Number of duplicated rows: 65
Duplicated rows are:
    id                                          job_title  \
0    1  2019 C.T. Bauer College of Business Graduate (...   
1    2  Native English Teacher at EPIK (English Progra...   
2    3              Aspiring Human Resources Professional   
3    4             People Development Coordinator at Ryan   
4    5    Advisory Board Member at Celal Bayar University   
..  ..                                                ...   
60  61                               HR Senior Specialist   
61  62  Seeking Human Resources HRIS and Generalist Po...   
62  63                      Student at Chapman University   
63  64  SVP, CHRO, Marketing & Communications, CSR Off...   
64  65  Human Resources Coordinator at InterContinenta...   

                               location connection  
0                        Houston, Texas         85  
1                                Kanada      500+   
2   Raleigh-Durham, North Carolina Area         44  
3   

#### Drop duplicated rows

In [6]:
# Drop semantic duplicates
df = df.drop_duplicates(subset=['job_title', 'location', 'connection'], keep='first')

# Optional: reset index
#df.reset_index(drop=True, inplace=True)

# Check that duplicates are gone
duplicated_rows_after = df[df.duplicated(subset=['job_title', 'location', 'connection'], keep=False)]
if duplicated_rows_after.empty:
    print("All semantic duplicates have been removed.")
else:
    print(f"Duplicates still remaining: {len(duplicated_rows_after)}")

All semantic duplicates have been removed.


#### create text column

In [7]:
# Create a column for embedding text
df['text'] = df['job_title'] + ". Location: " + df['location'] + ". connection: " + df['connection']
df.head()

Unnamed: 0,id,job_title,location,connection,text
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,2019 C.T. Bauer College of Business Graduate (...
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,Native English Teacher at EPIK (English Progra...
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,Aspiring Human Resources Professional. Locatio...
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,People Development Coordinator at Ryan. Locati...
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,Advisory Board Member at Celal Bayar Universit...


#### clean the text column

In [8]:
df["text"] = (
    df["text"]
    .astype(str)
    .str.lower()
    .str.replace(r"[\r\n\t]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
df[["text"]].head()

Unnamed: 0,text
0,2019 c.t. bauer college of business graduate (...
1,native english teacher at epik (english progra...
2,aspiring human resources professional. locatio...
3,people development coordinator at ryan. locati...
4,advisory board member at celal bayar universit...


#### load sentence embedding model

In [9]:

# Load a small, fast embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("model loaded")

model loaded


#### Embed the text column

In [10]:
# Generate embeddings
import numpy as np
text_embeddings = embedder.encode(
    df['text'].tolist(),
    convert_to_numpy=True
)
# Ensure FAISS-compatible dtype
text_embeddings = text_embeddings.astype("float32")

print("text_embeddings generated")

text_embeddings generated


#### normalize the embeddings

In [11]:
# Normalize embeddings for cosine similarity
text_embeddings /= np.linalg.norm(text_embeddings, axis=1, keepdims=True)
print("text_embeddings normalized for cosine similarity")

text_embeddings normalized for cosine similarity


#### Build and store faiss index

In [16]:
# FAISS already requires float32, embeddings are normalized
text_dimension = text_embeddings.shape[1]

# Use IndexFlatIP for cosine similarity
index_dimension = faiss.IndexFlatIP(text_dimension)
index_dimension.add(text_embeddings)

print("Number of vectors in FAISS index:", index_dimension.ntotal)

Number of vectors in FAISS index: 53


#### Retrieval function (Retriever)

In [23]:
def retrieve_query(query, faiss_index, dataset, embedder, top_k=5, return_structured=False):
    """
    query: string, user query
    faiss_index: FAISS index of the dataset
    dataset: original dataframe with 'text' and 'id' columns
    embedder: SentenceTransformer embedding model
    top_k: number of top retrievals
    return_structured: if True, return list of dicts instead of formatted string
    """
    # Embed and normalize query
    query_emb = embedder.encode([query], convert_to_numpy=True).astype('float32')
    query_emb /= np.linalg.norm(query_emb, axis=1, keepdims=True)

    # Retrieve top-k entries from FAISS
    distances, indices = faiss_index.search(query_emb, top_k)

    # Get top results with id, text, and score
    top_results = [
        {
            "id": dataset.iloc[i]['id'],
            "text": dataset.iloc[i]['text'],
            "score": round(float(distances[0][j]), 4)
        }
        for j, i in enumerate(indices[0])
    ]

    if return_structured:
        return top_results

    # Build single response string
    response = "You are an assistant. Using the following candidate profiles:\n"
    for i, doc in enumerate(top_results, 1):
        response += f"{i}. {doc}\n"
    response += f"\nAnswer the query: \"{query}\""

    return response

#### call the function

In [26]:
query = "Which candidates are most suitable for Aspiring human resources” or “seeking human resources in new york city area?"
# Formatted string output
response = retrieve_query(query, index_dimension, df, embedder, top_k=5)
print("\nGenerated response:\n")
print(response)

# Structured output for reranking
from pprint import pprint
faiss_top_candidates = retrieve_query(query, index_dimension, df, embedder, top_k=5, return_structured=True)
print("\nStructured top candidates for reranking:\n")
pprint(faiss_top_candidates)


Generated response:

You are an assistant. Using the following candidate profiles:
1. {'id': 6, 'text': 'aspiring human resources specialist. location: greater new york city area. connection: 1', 'score': 0.7901}
2. {'id': 76, 'text': 'aspiring human resources professional | passionate about helping to create an inclusive and engaging work environment. location: new york, new york. connection: 212', 'score': 0.689}
3. {'id': 28, 'text': 'seeking human resources opportunities. location: chicago, illinois. connection: 390', 'score': 0.6639}
4. {'id': 10, 'text': 'seeking human resources hris and generalist positions. location: greater philadelphia area. connection: 500+', 'score': 0.6628}
5. {'id': 99, 'text': 'seeking human resources position. location: las vegas, nevada area. connection: 48', 'score': 0.6511}

Answer the query: "Which candidates are most suitable for Aspiring human resources” or “seeking human resources in new york city area?"

Structured top candidates for reranking:

#### Load cross encoder for re_ranking

In [21]:
# Linux path to the saved model
local_path = "/mnt/c/Users/USER/Documents/Potential_Talent/Rag_project/cross_encoder"

# Load the model from local directory
reranker = CrossEncoder(local_path)
print("mode loaded")

mode loaded


#### create profile/ query pairs

In [27]:
# Prepare query-candidate pairs
query = "Which candidates are most suitable for Aspiring human resources” or “seeking human resources in new york city area?"
rerank_pairs = [(query, candidate['text']) for candidate in faiss_top_candidates]
pprint(rerank_pairs)

[('Which candidates are most suitable for Aspiring human resources” or '
  '“seeking human resources in new york city area?',
  'aspiring human resources specialist. location: greater new york city area. '
  'connection: 1'),
 ('Which candidates are most suitable for Aspiring human resources” or '
  '“seeking human resources in new york city area?',
  'aspiring human resources professional | passionate about helping to create '
  'an inclusive and engaging work environment. location: new york, new york. '
  'connection: 212'),
 ('Which candidates are most suitable for Aspiring human resources” or '
  '“seeking human resources in new york city area?',
  'seeking human resources opportunities. location: chicago, illinois. '
  'connection: 390'),
 ('Which candidates are most suitable for Aspiring human resources” or '
  '“seeking human resources in new york city area?',
  'seeking human resources hris and generalist positions. location: greater '
  'philadelphia area. connection: 500+'),


####  Re-ranking Function

In [28]:

def rerank_candidates(query, faiss_top_candidates, reranker):
    """
    Rerank FAISS retrieved candidates using a CrossEncoder reranker.

    query: str, the user's query
    faiss_top_candidates: list of dicts, output from FAISS retrieval
                          Each dict must have at least 'id' and 'text'
    reranker: CrossEncoder model object

    Returns: list of dicts, candidates sorted by reranker score
    """
    # Prepare query-candidate pairs
    rerank_pairs = [(query, candidate['text']) for candidate in faiss_top_candidates]

    # Predict reranker scores
    rerank_scores = reranker.predict(rerank_pairs)

    # Attach reranker scores to candidate dicts
    for candidate, score in zip(faiss_top_candidates, rerank_scores):
        candidate['rerank_score'] = score

    # Sort candidates by reranker score (highest first)
    reranked_candidates = sorted(faiss_top_candidates, key=lambda x: x['rerank_score'], reverse=True)

    return reranked_candidates

#### call the function

In [29]:
query = "Which candidates are most suitable for Aspiring human resources” or “seeking human resources in new york city area?"
reranked = rerank_candidates(query, faiss_top_candidates, reranker)
print("\nReranked candidates:\n")
pprint(reranked)


Reranked candidates:

[{'id': 6,
  'rerank_score': 4.374959,
  'score': 0.7901,
  'text': 'aspiring human resources specialist. location: greater new york '
          'city area. connection: 1'},
 {'id': 76,
  'rerank_score': 2.8240962,
  'score': 0.689,
  'text': 'aspiring human resources professional | passionate about helping to '
          'create an inclusive and engaging work environment. location: new '
          'york, new york. connection: 212'},
 {'id': 10,
  'rerank_score': 0.4661007,
  'score': 0.6628,
  'text': 'seeking human resources hris and generalist positions. location: '
          'greater philadelphia area. connection: 500+'},
 {'id': 99,
  'rerank_score': -1.3391613,
  'score': 0.6511,
  'text': 'seeking human resources position. location: las vegas, nevada area. '
          'connection: 48'},
 {'id': 28,
  'rerank_score': -1.6673145,
  'score': 0.6639,
  'text': 'seeking human resources opportunities. location: chicago, illinois. '
          'connection: 390'}]


#### Load llama model and tokenizer

In [32]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,   # reduce memory
    device_map={"": "cpu"},      # FORCE CPU, no disk
    low_cpu_mem_usage=True,
    use_safetensors=True
)

model.eval()
print("LLaMA model loaded successfully")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LLaMA model loaded successfully


#### Answering & reasoning function (Generator)

In [43]:
def generate_answer_from_reranked(query, reranked_candidates, top_n=3, max_gen_tokens=200):

    # HARD FILTER (recommended)
    filtered = [
        c for c in reranked_candidates
        if "new york" in c["text"].lower()
    ]

    top_context = filtered[:top_n]

    candidate_text = ""
    for c in top_context:
        candidate_text += f"- ID {c['id']}: {c['text']}\n"

    prompt = f"""
You are an HR recruitment assistant.

Task:
From the candidate profiles below, identify the most suitable candidates for the query.

Rules:
- Location MUST match the query location.
- Focus on job intent and location relevance.
- Do not consider connection.
- Do NOT write code.
- Be concise.
- List candidate IDs and a short reason.

Candidate Profiles:
{candidate_text}

Query:
{query}

Answer:
"""

    inputs = tokenizer( prompt, return_tensors="pt", truncation=True).to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=max_gen_tokens, do_sample=False,temperature=0.0,  pad_token_id=tokenizer.eos_token_id)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

#### call the function

In [44]:
final_answer = generate_answer_from_reranked( query=query, reranked_candidates=reranked, top_n=5)
print("\nFinal Answer:\n")
print(final_answer)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Final Answer:


You are an HR recruitment assistant.

Task:
From the candidate profiles below, identify the most suitable candidates for the query.

Rules:
- Location MUST match the query location.
- Focus on job intent and location relevance.
- Do not consider connection.
- Do NOT write code.
- Be concise.
- List candidate IDs and a short reason.

Candidate Profiles:
- ID 6: aspiring human resources specialist. location: greater new york city area. connection: 1
- ID 76: aspiring human resources professional | passionate about helping to create an inclusive and engaging work environment. location: new york, new york. connection: 212


Query:
Which candidates are most suitable for Aspiring human resources” or “seeking human resources in new york city area?

Answer:
- ID 76: location matches, and job intent is clear.
- ID 6: location matches, and job intent is clear.

Both candidates are suitable for the query. 

However, since the query is asking for candidates in the New York City ar