#### Ranking with Qwen using prompt method

#### Load libraries

In [1]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

#### Load Dataset

In [3]:
file_path = r"C:\Users\USER\Documents\Potential_Talent\potential-talents - Aspiring human resources - seeking human resources.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


#### Drop fit column

In [5]:
df = df.drop(columns = ['fit'])
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


#### Create text column

In [7]:

df["text"] = (
    df["job_title"].astype(str).str.lower() + ", " 
    + df["location"].astype(str).str.lower() 
    + " (connections: " + df["connection"].astype(str) + ")"
)

df.head()

Unnamed: 0,id,job_title,location,connection,text
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,2019 c.t. bauer college of business graduate (...
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,native english teacher at epik (english progra...
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,"aspiring human resources professional, raleigh..."
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,"people development coordinator at ryan, denton..."
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,advisory board member at celal bayar universit...


#### Extract text column from df

In [9]:
text_series = df["text"]

for t in text_series.head():
    print(t)

2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)
native english teacher at epik (english program in korea), kanada (connections: 500+ )
aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)
people development coordinator at ryan, denton, texas (connections: 500+ )
advisory board member at celal bayar university, i̇zmir, türkiye (connections: 500+ )


#### Split text series into chunks such that each chunk has 8 rows

In [11]:
chunk_size = 8
chunks = [text_series[i:i + chunk_size] for i in range(0, len(text_series), chunk_size)]

print(f"Number of chunks: {len(chunks)}")

for idx, chunk in enumerate(chunks):
    print(f"\nChunk {idx+1} (size {len(chunk)}):")
    for item in chunk:
        print(item)

Number of chunks: 13

Chunk 1 (size 8):
2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)
native english teacher at epik (english program in korea), kanada (connections: 500+ )
aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)
people development coordinator at ryan, denton, texas (connections: 500+ )
advisory board member at celal bayar university, i̇zmir, türkiye (connections: 500+ )
aspiring human resources specialist, greater new york city area (connections: 1)
student at humber college and aspiring human resources generalist, kanada (connections: 61)
hr senior specialist, san francisco bay area (connections: 500+ )

Chunk 2 (size 8):
student at humber college and aspiring human resources generalist, kanada (connections: 61)
seeking human resources hris and generalist positions, greater philadelphia area (connections: 500+ )
student at chapman universit

#### Load pretrained model and tokenizer

In [13]:

save_directory = r"C:\Users\USER\Documents\Potential Talent\Qwen_2.5-3b"

tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForCausalLM.from_pretrained(
    save_directory,
    torch_dtype=torch.float32,
    device_map=None
).to("cpu")
print("model and tokenizer loaded")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

model and tokenizer loaded


#### Define keywords and state instructions

In [15]:

keywords = "aspiring human resources OR seeking human resources"

chunk_instructions = []

for chunk_idx, chunk in enumerate(chunks, start=1):

    instruction = f"""
Rank each candidate for relevance to: "{keywords}"

IMPORTANT:
- Output ONLY a JSON array.
- Each item must have "index" and "score".
- Score must be between 0 and 1.
- Include ALL candidates.
- Round scores to EXACTLY 2 decimals.
- Do NOT truncate output.
- Do NOT duplicate candidate

Example (do NOT copy scores):
[
  {{"index": 1, "score": <score>}},
  {{"index": 2, "score": <score>}}
]

Candidates:
"""

    for i, text in enumerate(chunk, start=1):
        instruction += f"{i}. {text}\n"

    chunk_instructions.append(instruction)

    print(f"\n--- Chunk {chunk_idx} Instruction ---")
    print(instruction)


--- Chunk 1 Instruction ---

Rank each candidate for relevance to: "aspiring human resources OR seeking human resources"

IMPORTANT:
- Output ONLY a JSON array.
- Each item must have "index" and "score".
- Score must be between 0 and 1.
- Include ALL candidates.
- Round scores to EXACTLY 2 decimals.
- Do NOT truncate output.
- Do NOT duplicate candidate

Example (do NOT copy scores):
[
  {"index": 1, "score": <score>},
  {"index": 2, "score": <score>}
]

Candidates:
1. 2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)
2. native english teacher at epik (english program in korea), kanada (connections: 500+ )
3. aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)
4. people development coordinator at ryan, denton, texas (connections: 500+ )
5. advisory board member at celal bayar university, i̇zmir, türkiye (connections: 500+ )
6. aspiring human resources speci

#### Apply the model and tokenizer to the chunks

In [17]:
# List to store decoded outputs for each chunk
chunk_outputs = []

# Loop over stored instructions
for chunk_idx, instruction in enumerate(chunk_instructions, start=1):
    # Build messages for the LLM
    messages = [
        {"role": "system", "content": "You are an expert HR ranking system. Output ONLY valid JSON."},
        {"role": "user", "content": instruction}
    ]

    # Apply chat template and tokenize
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # Generate output
    outputs = model.generate(
        **inputs,
        max_new_tokens=1500,
        temperature=0.0,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode output
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    chunk_outputs.append(decoded)

    # Preview output
    print(f"\n--- RAW OUTPUT for Chunk {chunk_idx} ---")
    print(decoded[:2000])  # show first 2000 chars

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- RAW OUTPUT for Chunk 1 ---
system
You are an expert HR ranking system. Output ONLY valid JSON.
user

Rank each candidate for relevance to: "aspiring human resources OR seeking human resources"

IMPORTANT:
- Output ONLY a JSON array.
- Each item must have "index" and "score".
- Score must be between 0 and 1.
- Include ALL candidates.
- Round scores to EXACTLY 2 decimals.
- Do NOT truncate output.
- Do NOT duplicate candidate

Example (do NOT copy scores):
[
  {"index": 1, "score": <score>},
  {"index": 2, "score": <score>}
]

Candidates:
1. 2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)
2. native english teacher at epik (english program in korea), kanada (connections: 500+ )
3. aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)
4. people development coordinator at ryan, denton, texas (connections: 500+ )
5. advisory board member at celal bayar univer

#### Extract jsons fron the model's response

In [19]:

parsed_chunks = []

for idx, decoded in enumerate(chunk_outputs, start=1):

    print("\n========================")
    print(f"DEBUG for Chunk {idx}")
    print("========================")

    # 1. Find where the assistant's answer begins
    assistant_split = decoded.split("assistant", 1)
    if len(assistant_split) < 2:
        print("ERROR: Could not find assistant response.")
        continue

    assistant_text = assistant_split[1]  # keep only assistant reply

    # 2. Search for JSON *only* inside the assistant section
    match = re.search(r"\[\s*\{.*?\}\s*\]", assistant_text, re.DOTALL)

    if match:
        json_str = match.group(0)
        print("Extracted JSON string:")
        print(json_str)

        try:
            parsed = json.loads(json_str)
            parsed_chunks.append(parsed)
            print("\nParsed JSON OK!")
        except json.JSONDecodeError as e:
            print("JSONDecodeError:", e)
    else:
        print("No JSON found inside assistant reply.")


DEBUG for Chunk 1
Extracted JSON string:
[
  {"index": 1, "score": 0.95},
  {"index": 2, "score": 0.90},
  {"index": 3, "score": 0.85},
  {"index": 4, "score": 0.80},
  {"index": 5, "score": 0.75},
  {"index": 6, "score": 0.70},
  {"index": 7, "score": 0.65},
  {"index": 8, "score": 0.60}
]

Parsed JSON OK!

DEBUG for Chunk 2
Extracted JSON string:
[
  {"index": 1, "score": 0.00},
  {"index": 2, "score": 0.00},
  {"index": 3, "score": 0.00},
  {"index": 4, "score": 0.00},
  {"index": 5, "score": 0.00},
  {"index": 6, "score": 0.00},
  {"index": 7, "score": 0.00},
  {"index": 8, "score": 0.00}
]

Parsed JSON OK!

DEBUG for Chunk 3
Extracted JSON string:
[
  {"index": 1, "score": 0.95},
  {"index": 2, "score": 0.95},
  {"index": 3, "score": 0.95},
  {"index": 4, "score": 0.95},
  {"index": 5, "score": 0.95},
  {"index": 6, "score": 0.95},
  {"index": 7, "score": 0.95},
  {"index": 8, "score": 0.95}
]

Parsed JSON OK!

DEBUG for Chunk 4
Extracted JSON string:
[
  {"index": 1, "score": 0.

##### Merge the extracted jsons back the original dataframe(df)

In [23]:
# Final list 
merged_results = []

# Flatten parsed chunks and attach to original df
global_index = 0

for chunk_idx, chunk_scores in enumerate(parsed_chunks, start=1):

    for item in chunk_scores:
        score = float(item["score"])         # ensure score is float
        df.loc[global_index, "qwen_score"] = score   # correct Pandas assignment
        merged_results.append({
            "id": df.loc[global_index, "id"],
            "text": df.loc[global_index, "text"],
            "qwen_score": score
        })
        global_index += 1

# Preview
#print("\n=== MERGED RESULTS (items) ===")
for r in merged_results[:104]:
    print(r["id"], r["text"], "→", r["qwen_score"])

1 2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85) → 0.95
2 native english teacher at epik (english program in korea), kanada (connections: 500+ ) → 0.9
3 aspiring human resources professional, raleigh-durham, north carolina area (connections: 44) → 0.85
4 people development coordinator at ryan, denton, texas (connections: 500+ ) → 0.8
5 advisory board member at celal bayar university, i̇zmir, türkiye (connections: 500+ ) → 0.75
6 aspiring human resources specialist, greater new york city area (connections: 1) → 0.7
7 student at humber college and aspiring human resources generalist, kanada (connections: 61) → 0.65
8 hr senior specialist, san francisco bay area (connections: 500+ ) → 0.6
9 student at humber college and aspiring human resources generalist, kanada (connections: 61) → 0.0
10 seeking human resources hris and generalist positions, greater philadelphia area (connections: 500+ ) → 0.0
11

#### sort the candidates in ascending order using qwen_score

In [25]:
# Sort by qwen score, highest first
df_sorted = df.sort_values(by="qwen_score", ascending=False).reset_index(drop=True)

# Preview top 20 candidates
df_sorted[["id", "text", "qwen_score"]].head(20)

Unnamed: 0,id,text,qwen_score
0,1,2019 c.t. bauer college of business graduate (...,0.95
1,18,"people development coordinator at ryan, denton...",0.95
2,19,2019 c.t. bauer college of business graduate (...,0.95
3,20,native english teacher at epik (english progra...,0.95
4,21,"aspiring human resources professional, raleigh...",0.95
5,22,"people development coordinator at ryan, denton...",0.95
6,23,advisory board member at celal bayar universit...,0.95
7,24,"aspiring human resources specialist, greater n...",0.95
8,33,"aspiring human resources professional, raleigh...",0.95
9,34,"people development coordinator at ryan, denton...",0.95


#### print full lenght of the text column for starring.

In [27]:
pd.set_option('display.max_colwidth', None)

df_sorted[["id", "text", "qwen_score"]].head(20)

Unnamed: 0,id,text,qwen_score
0,1,"2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)",0.95
1,18,"people development coordinator at ryan, denton, texas (connections: 500+ )",0.95
2,19,"2019 c.t. bauer college of business graduate (magna cum laude) and aspiring human resources professional, houston, texas (connections: 85)",0.95
3,20,"native english teacher at epik (english program in korea), kanada (connections: 500+ )",0.95
4,21,"aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)",0.95
5,22,"people development coordinator at ryan, denton, texas (connections: 500+ )",0.95
6,23,"advisory board member at celal bayar university, i̇zmir, türkiye (connections: 500+ )",0.95
7,24,"aspiring human resources specialist, greater new york city area (connections: 1)",0.95
8,33,"aspiring human resources professional, raleigh-durham, north carolina area (connections: 44)",0.95
9,34,"people development coordinator at ryan, denton, texas (connections: 500+ )",0.95


#### save it as csv file for reuse

In [29]:
df_sorted[["id", "text", "llm_score"]].head(20).to_csv(r"C:\Users\USER\Documents\Potential Talent\Qwen_2.5-3b\Qwen_top20_candidate.csv", index=False)
print("df_sorted saved")

df_sorted saved
