In [None]:
import pandas as pd
import aisuite as ai
from pathlib import Path
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import os
import json

# Paths: both locations are derived from the parent of the current working directory
project_root = Path.cwd().parent
paths = {
    'root': project_root,
    'data': project_root / "data"
}

# Parse the JSON credentials file stored under <root>/config
credentials = json.loads((paths["root"] / 'config/credentials.json').read_text())

# Report when the token key is present in either the process environment
# or the loaded credentials (this only checks for presence; nothing is exported)
token_sources = (os.environ, credentials)
if any("HUGGINGFACE_TOKEN" in source for source in token_sources):
    print("Environment variable HUGGINGFACE_TOKEN set.")

  from .autonotebook import tqdm as notebook_tqdm


Environment variable HUGGINGFACE_TOKEN set.


: 

In [None]:
# Model initialization
model_name = "microsoft/Phi-3-mini-128k-instruct"
# Tokenizer and model are loaded from the same checkpoint with identical options
pretrained_options = {"trust_remote_code": True}
tokenizer = AutoTokenizer.from_pretrained(model_name, **pretrained_options)
model = AutoModelForCausalLM.from_pretrained(model_name, **pretrained_options)

# Load data: the parquet copy is tried first; on any failure the reason is
# printed and the CSV copy in the same folder is read instead
parquet_name, csv_name = "interim/encoded.parquet", "interim/encoded.csv"
try:
    data = pd.read_parquet(paths['data'] / parquet_name)
except Exception as load_error:
    print(f"Failed to load parquet file: {load_error}. Loading CSV instead.")
    data = pd.read_csv(paths['data'] / csv_name)


# Tokenize the dataset
def tokenize_function(batch):
    """Encode the ``job_title`` entry of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"job_title"`` key; its value is handed to the
            tokenizer unchanged.

    Returns:
        The tokenizer's output for that value, padded to ``max_length`` and
        truncated, with ``max_length`` fixed at 256.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(batch["job_title"], **encoding_options)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


In [None]:
# Tokenize the data["job_title"] column one title at a time, filling every
# derived collection in a single pass over the column
tokenized_data = []      # tokenizer output for each title
input_ids = []           # "input_ids" entry of each tokenizer output
attention_masks = []     # "attention_mask" entry of each tokenizer output
tokenized_dataset = []   # one {"input_ids", "attention_mask"} dict per title
for job in data["job_title"]:
    encoded = tokenize_function({"job_title": job})
    ids, mask = encoded["input_ids"], encoded["attention_mask"]
    tokenized_data.append(encoded)
    input_ids.append(ids)
    attention_masks.append(mask)
    # Training record for this title, built from the same two fields
    tokenized_dataset.append({"input_ids": ids, "attention_mask": mask})

# Define training arguments
model_dir = "./models/Phi3/"
training_options = {
    "output_dir": model_dir,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "save_steps": 5000,
    "save_total_limit": 2,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# Define data collator (mlm=False: no masked-language-model masking is applied)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wrap the tokenized examples in a map-style PyTorch dataset.
# Fixes relative to the previous version of this cell:
#   * `torch` was never bound as a name, so `torch.utils.data...` raised NameError;
#   * `torch.utils.data.Dataset` has no `from_list` constructor;
#   * `Trainer` expects a dataset as `train_dataset` (it builds its own loader),
#     not an already-built DataLoader.
from torch.utils.data import DataLoader, Dataset


class TokenizedExamplesDataset(Dataset):
    """Map-style dataset serving pre-tokenized examples by position.

    Args:
        examples: Iterable of per-example dicts (here: ``input_ids`` and
            ``attention_mask``); it is copied into a list on construction.
    """

    def __init__(self, examples):
        self.examples = list(examples)

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, index):
        """Return the example dict stored at ``index``."""
        return self.examples[index]


train_dataset = TokenizedExamplesDataset(tokenized_dataset)

# Not consumed by the Trainer below; kept so batches can be inspected manually,
# e.g. next(iter(train_loader)).
train_loader = DataLoader(train_dataset, batch_size=8, collate_fn=data_collator)

# Trainer initialization: batching is driven by training_args and data_collator
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run training with the configured trainer
trainer.train()

# Write the model first, then the tokenizer, into the same model_dir
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(model_dir)

# Generate results
search_phrase = 'aspiring human resources'
location = "New york"  # defined here but not included in the prompt below

# Define the task for the model
instructions = "Rank the candidates based on their job_title against our search term using cosine similarity. The higher the score, the better the match. Include the cosine similarity scores. Return the top 5 candidates in markdown format. Do not show intermediary responses, nor the reasoning, only show the final table result."

# Format inputs: a reproducible sample of 15 job titles is embedded in the prompt
data_sample = data['job_title'].sample(15, random_state=42).to_list()
messages = f"Instructions: {instructions}\n\nsearch term:{search_phrase}\n\nCandidates: {data_sample}"
# The tokenizer returns CPU tensors, while training may have left the model on
# an accelerator; move the prompt to the model's device to avoid a device
# mismatch inside generate().
inputs = tokenizer(messages, return_tensors="pt").to(model.device)

# Generate a response
# Switch to inference mode first so training-only layers (e.g. dropout) are
# disabled after the preceding training run.
model.eval()
# max_length bounds prompt tokens plus generated tokens together.
outputs = model.generate(**inputs, max_length=800+1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
