In [3]:
import pandas as pd
import aisuite as ai
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import display, Markdown
import os
import json

paths = {
    'root':Path.cwd().parent,
    'data':Path.cwd().parent / "data"
}

# Load the environment variable from a file config\credentials.json
ai.utils.load_dotenv()

with open('config/credentials.json') as f:
    credentials = json.load(f)

if "HUGGINGFACE_TOKEN" in os.environ or "HUGGINGFACE_TOKEN" in credentials:
    print("Environment variable HUGGINGFACE_TOKEN set.")

model_name = "microsoft/Phi-3-mini-128k-instruct"

if not "model" in locals():
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


Environment variable HUGGINGFACE_TOKEN set.


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.96s/it]


In [4]:
# Load the candidate data from the parquet file

try:
	data = pd.read_parquet(paths['data'] / "interim/encoded.parquet")
except Exception as e:
	print(f"Failed to load parquet file: {e}. Loading CSV instead.")
	data = pd.read_csv(paths['data'] / "interim/encoded.csv")
print("Loaded candidate data.")
data.head()

Loaded candidate data.


Unnamed: 0,job_title,location,connection
0,ct bauer college of business graduate magna c...,"houston, texas",85
1,native english teacher at tech english program...,kanada,501
2,aspiring human resources professional,"raleigh-durham, north carolina",44
3,people development coordinator at not tech,"denton, texas",501
4,advisory board member at celal bayar university,"i̇zmir, türkiye",501


In [None]:
search_phrase = 'aspiring human resources'
location = "New york"

# Define the task for the model
instructions = "Rank the candidates based on their job_title against our search term using cosine similarity. The higher the score, the better the match. Include the cosine similarity scores. Return the top 5 candidates in markdown format. Do not show intermediary responses, nor the reasoning, only show the final table result."

# Format the data for the model
data_sample = data['job_title'].sample(15, random_state=42).to_list()

# 
messages = f"Instructions: {instructions}\n\nsearch term:{search_phrase}\n\nCandidates: {data_sample}"

inputs = tokenizer(messages, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs, max_length=800+1)
display(Markdown(tokenizer.decode(outputs[0], skip_special_tokens=True)))

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
You are not running the flash-attention implementation, expect numerical differences.


Instructions: Rank the candidates based on their job_title against our search term using cosine similarity. The higher the score, the better the match. Include the cosine similarity scores. Return the top 5 candidates in markdown format. Do not show intermediary responses, nor the reasoning, only show the final table result.

search term:aspiring human resources

Candidates: ['human resources generalist at scottmadden inc', 'admissions representative at community medical center long beach', 'seeking human resources position', 'aspiring human resources management student seeking an internship', 'student at westfield state university', 'aspiring human resources specialist', 'director of human resources north america groupe not tech', 'business intelligence and analytics at not techs', 'people development coordinator at not tech', 'human resources professional for the world leader in techs software', 'seeking human resources opportunities', 'seeking human resources human resourcesis and generalist positions', 'human resources generalist at schwans', 'student at humber college and aspiring human resources generalist', 'information systems specialist and programmer with a love for data and organization'] 

### Answer:

| Rank | Candidate                                                                                   | Cosine Similarity Score |
|------|--------------------------------------------------------------------------------------------|-------------------------|
| 1    | human resources generalist at scottmadden inc                                             | 0.92                    |
| 2    | aspiring human resources management student seeking an internship                        | 0.89                    |
| 3    | human resources generalist at schwans                                                     | 0.87                    |
| 4    | seeking human resources opportunities                                                     | 0.85                    |
| 5    | human resources generalist at not tech                                                    | 0.83                    | 
| 6    | human resources professional for the world leader in techs software                       | 0.81                    |
| 7    | human resources generalist at not tech                                                    | 0.79                    |
| 8    | human resources generalist at not tech                                                    | 0.78                    |
| 9    | human resources generalist at not tech                                                    | 0.77                    |
| 10   | human resources generalist at not tech                                                    | 0.76                    |
| 11   | human resources generalist at not tech                                                    | 0.75                    |
| 12   | human resources generalist at not tech                                                    | 0.74                    |
| 13   | human resources generalist at not tech                                                    | 0.73                    |
| 14   | human resources generalist at not tech                                                    | 0.72                    |
| 15   | human resources generalist at not tech                                                    | 0.71                    |
| 16   | human resources generalist at not tech                                                    | 0.70                    |
| 17   | human resources generalist at not tech                                                    | 0.69                    |
| 18   | human resources generalist at not tech                                                    | 0.68                    |
| 19   | human resources generalist at not tech                                                    | 0.67                    |
| 20   | human resources generalist at not tech                                                    | 0.66                    |
| 21   | human resources generalist at not tech                                                    | 0.65                    |
| 22   | human resources generalist at not tech                                                    | 0.64                    |
| 23   | human resources generalist at not tech                                                    | 0.63                    |
| 24   | human resources generalist at not tech                                                    | 0.62                    |
| 25   | human resources generalist at not tech                                                    | 0.61                    |
| 26   | human resources generalist at

### WIP

In [7]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession
import numpy as np

# Custom path to the ONNX model directory
model_dir = "C:/Users/Guill/.cache/huggingface/hub/models--microsoft--Phi-3-small-128k-instruct-onnx-cuda/snapshots/17f60c4b6f95cc18b4b1e0667e38490deb7d899c/cuda-fp16/"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
onnx_model_path = model_dir + "phi3-small-128k-instruct-cuda-fp16.onnx"
session = InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])

# Inspect model inputs
for input_meta in session.get_inputs():
    print(f"Name: {input_meta.name}, Shape: {input_meta.shape}, Type: {input_meta.type}")

# Prepare the input text
input_text = "Rank the candidates based on their qualifications."
inputs = tokenizer(input_text, return_tensors="np")

# Correct dimensions for past_key_values
num_layers = 32  # Number of layers in the transformer
num_heads = 8    # Number of attention heads
seq_len = 1      # Sequence length for past key values
hidden_size_per_head = 128  # Hidden size per attention head

past_key_values = {
    f"past_key_values.{i}.key": np.zeros((1, num_heads, seq_len, hidden_size_per_head), dtype=np.float16)
    for i in range(num_layers)
}
past_key_values.update({
    f"past_key_values.{i}.value": np.zeros((1, num_heads, seq_len, hidden_size_per_head), dtype=np.float16)
    for i in range(num_layers)
})

# Combine input IDs, attention mask, and past_key_values
onnx_inputs = {
    "input_ids": inputs["input_ids"].astype(np.int64),  # Ensure input_ids is int64
    "attention_mask": inputs["attention_mask"].astype(np.int64),  # Ensure attention_mask is int64
}
onnx_inputs.update(past_key_values)

# Perform inference
outputs = session.run(None, onnx_inputs)
print(outputs)




Name: input_ids, Shape: ['batch_size', 'sequence_length'], Type: tensor(int64)
Name: attention_mask, Shape: ['batch_size', 'total_sequence_length'], Type: tensor(int64)
Name: past_key_values.0.key, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.0.value, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.1.key, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.1.value, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.2.key, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.2.value, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.3.key, Shape: ['batch_size', 8, 'past_sequence_length', 128], Type: tensor(float16)
Name: past_key_values.3.value, Shape: ['batch_size', 8, 'past_sequence_length', 128

RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Cast node. Name:'InsertedPrecisionFreeCast_model.layers.10.mlp.down_proj.MatMul.weight' Status Message: bad allocation

In [9]:
import pandas as pd
import requests
import json

# Load the candidate data from the parquet file
data_path = "../data/interim/encoded.parquet"
data = pd.read_parquet(data_path)
print("Loaded candidate data.")
print(data.head())

# Define the task for the model
instructions = "Rank the candidates based on their job_title, location, and connection fields. Assign a fit score between 1 (low fit) and 10 (high fit) based on relevance to the desired profile."

# Format the data for the model
data_sample = data[['job_title', 'location', 'connection']].head(5).to_dict(orient='records')

messages = [
    {"role": "system", "content": "You are a ranking assistant for hiring. Evaluate candidates based on the following instructions."},
    {"role": "user", "content": f"Instructions: {instructions}\n\nCandidates:\n{data_sample}"},
]

# Send a request to the locally running vLLM server
url = "http://localhost:8000/v1/chat/completions"
payload = {
    "model": "microsoft/Phi-3-small-128k-instruct-onnx-cuda",
    "messages": messages
}
headers = {"Content-Type": "application/json"}

response = requests.post(url, headers=headers, json=payload)
if response.status_code == 200:
    print("Model response:")
    response_text = response.json()["choices"][0]["message"]["content"]
    print(response_text)
else:
    print(f"Failed to get response: {response.status_code}, {response.text}")

# Process the response (if needed) and integrate the scores back into the dataset
# This assumes the model returns a list of scores corresponding to the candidates
# (Implement response parsing logic here, if needed)

Loaded candidate data.
                                           job_title  \
0   ct bauer college of business graduate magna c...   
1  native english teacher at tech english program...   
2              aspiring human resources professional   
3         people development coordinator at not tech   
4    advisory board member at celal bayar university   

                          location  connection  
0                   houston, texas          85  
1                           kanada         501  
2  raleigh-durham, north carolina           44  
3                    denton, texas         501  
4                  i̇zmir, türkiye         501  


ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000017794D41210>: Failed to establish a new connection: [WinError 10061] No se puede establecer una conexión ya que el equipo de destino denegó expresamente dicha conexión'))

In [None]:
# Example: Add a 'fit_score' column based on dummy logic for demonstration
data['fit_score'] = [8, 7, 5, 6, 9]  # Replace with actual parsed scores

print(data.head())

# Save the ranked data for further analysis
output_path = "data/interim/ranked_candidates.parquet"
data.to_parquet(output_path, index=False)
print(f"Ranked candidates saved to {output_path}.")