In [None]:
import pandas as pd
from pathlib import Path
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import os
import json

# Project directories, all resolved from the parent of the working directory
paths = {'root': Path.cwd().parent}
paths['data'] = paths['root'] / "data"
paths["config"] = paths['root'] / "config"

# Look for the Hugging Face token in the environment first, then in the
# optional credentials file stored in the config directory.
credentials_path = paths["config"] / "credentials.json"
try:
    with open(credentials_path) as f:
        credentials = json.load(f)
except FileNotFoundError:
    # A missing file is acceptable: the token may come from the environment.
    credentials = {}

# Report where the token was actually found, and say so when it is absent.
if "HUGGINGFACE_TOKEN" in os.environ:
    print("Environment variable HUGGINGFACE_TOKEN set.")
elif "HUGGINGFACE_TOKEN" in credentials:
    print(f"HUGGINGFACE_TOKEN found in '{credentials_path}'.")
else:
    print(
        "Warning: HUGGINGFACE_TOKEN not found in the environment or in "
        f"'{credentials_path}'."
    )


# Location of the prompt instructions inside the config directory.
file_path = paths["config"] / "instructions.txt"

# Default to None so that later cells can test for missing instructions
# instead of hitting a NameError when the read below fails.
instructions = None
try:
    instructions = file_path.read_text()
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except (OSError, UnicodeDecodeError) as e:
    # Any other read or decode problem is reported but does not stop the cell.
    print(f"An error occurred: {e}")
else:
    print("Instructions successfully read!")


# Shortlisted candidates: try the parquet export first, then fall back to CSV
try:
    data = pd.read_parquet(paths['data'].joinpath("interim/encoded.parquet"))
except Exception as load_error:
    print(f"Failed to load parquet file: {load_error}. Loading CSV instead.")
    data = pd.read_csv(paths['data'].joinpath("interim/encoded.csv"))


In [None]:
# Query parameters for this run
search_phrase, location = 'aspiring human resources', "New York"

# Re-read the instructions so this cell fails loudly when the file is missing.
instructions_path = "../config/instructions.txt"
try:
    with open(instructions_path, "r") as file:
        instructions = file.read()
except FileNotFoundError as exc:
    # Raise with a descriptive message rather than storing the error text in
    # `instructions`, where it could end up inside the prompt.
    raise FileNotFoundError(
        f"Error: The file '{instructions_path}' was not found."
    ) from exc

# Draw a reproducible sample of 15 job titles to present as candidates
data_sample = data['job_title'].sample(n=15, random_state=42).tolist()

# Assemble the prompt: instructions, search term, then the sampled candidates
messages = (
    f"# Instructions: {instructions}\n\n"
    f"# Search term:\n{search_phrase}\n\n"
    f" # Candidates: {data_sample}"
)


# Generate a response.
# NOTE: `model` and `tokenizer` are not created in this cell; they must already
# be loaded in the kernel before it runs.

# Tokenize the prompt and move the tensors to the device the model lives on,
# so that `generate` receives the keyword inputs it unpacks below.
inputs = tokenizer(messages, return_tensors="pt").to(model.device)

# max_length bounds prompt and generated tokens together.
outputs = model.generate(**inputs, max_length=800 + 1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [7]:
# %%
import pandas as pd
from pathlib import Path
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
import os
import json

# Project directories: the root is the parent of the working directory and
# the named sub-directories hang directly off it
paths = {'root': Path.cwd().parent}
paths.update({name: paths['root'] / name for name in ("data", "config")})

# Load Hugging Face credentials: the token may come from the environment or
# from the optional credentials file in the config directory.
credentials_path = paths["config"] / 'credentials.json'
try:
    with open(credentials_path) as f:
        credentials = json.load(f)
except FileNotFoundError:
    # A missing file is acceptable: the token may come from the environment.
    credentials = {}

# Report where the token was actually found, and say so when it is absent.
if "HUGGINGFACE_TOKEN" in os.environ:
    print("Environment variable HUGGINGFACE_TOKEN set.")
elif "HUGGINGFACE_TOKEN" in credentials:
    print(f"HUGGINGFACE_TOKEN found in '{credentials_path}'.")
else:
    print(
        "Warning: HUGGINGFACE_TOKEN not found in the environment or in "
        f"'{credentials_path}'."
    )

# Read the prompt instructions; a missing file is reported and leaves None
file_path = paths["config"] / "instructions.txt"
try:
    instructions = file_path.read_text()
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    instructions = None
else:
    print("Instructions successfully read!")

# Shortlisted candidates: parquet is tried first, CSV is the fallback format
try:
    data = pd.read_parquet(Path(paths['data'], "interim/encoded.parquet"))
except Exception as parquet_error:
    print(f"Failed to load parquet file: {parquet_error}. Loading CSV instead.")
    data = pd.read_csv(Path(paths['data'], "interim/encoded.csv"))

# Checkpoint shared by the tokenizer and the model
model_name = "microsoft/Phi-3-mini-128k-instruct"

# Tokenizer for the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Causal LM weights; device_map="auto" lets the loader place them
# (GPU if available) and torch_dtype="auto" defers the dtype choice to the loader
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

# Wrap the loaded model and tokenizer in a text-generation pipeline
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Search parameters interpolated into the prompt
location = "New York"
search_phrase = 'aspiring human resources'

# Reproducible sample of five job titles to present as candidates
data_sample = data['job_title'].sample(n=5, random_state=42).tolist()

# Chat-style prompt: a system turn followed by the user's search request
messages = [
    dict(role="system", content="You are a helpful AI assistant."),
    dict(
        role="user",
        content=(
            f"Search for top candidates in or close to {location} with interest in {search_phrase}."
            f"\n\nCandidates:\n{data_sample}. Respond in Markdown format."
        ),
    ),
]

# Generation settings: at most 100 new tokens, sampling disabled, and only the
# newly generated text returned (the prompt is excluded).
generation_args = {
    "max_new_tokens": 100,
    "return_full_text": False,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
generated_text = output[0]['generated_text']

# Imported next to its only use so this step does not rely on another cell.
from IPython.display import Markdown, display

# Markdown() renders its first argument and treats a second positional
# argument as a URL, so the label and the text are combined into one string.
display(Markdown(f"Generated Output:\n\n{generated_text}"))

Environment variable HUGGINGFACE_TOKEN set.
Instructions successfully read!


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
Some parameters are on the meta device because they were offloaded to the cpu and disk.
Device set to use cpu


KeyboardInterrupt: 