In [8]:
import pandas as pd
from pathlib import Path
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import os
import json

# Project directories, all derived from the parent of the current working directory
_project_root = Path.cwd().parent
paths = {
    'root': _project_root,
    'data': _project_root / "data",
    "config": _project_root / "config",
}

# Load the project credentials from the config directory. The path is built from
# paths["config"] so the config location is defined in exactly one place.
with open(paths["config"] / "credentials.json", encoding="utf-8") as f:
    credentials = json.load(f)

# Make the confirmation message below truthful: when the token is only present
# in the credentials file, export it into this process's environment. A value
# that is already exported is never overwritten.
credentials_token = (
    credentials.get("HUGGINGFACE_TOKEN") if isinstance(credentials, dict) else None
)
if "HUGGINGFACE_TOKEN" not in os.environ and credentials_token:
    os.environ["HUGGINGFACE_TOKEN"] = str(credentials_token)

if "HUGGINGFACE_TOKEN" in os.environ:
    print("Environment variable HUGGINGFACE_TOKEN set.")


# Location of the prompt instructions inside the config directory
file_path = paths["config"] / "instructions.txt"

# Bind the name up front so a failed read leaves an explicit None instead of an
# unbound (or stale, from an earlier run) `instructions` variable.
instructions = None

try:
    # Explicit encoding keeps the read independent of the platform default
    with open(file_path, 'r', encoding='utf-8') as file:
        instructions = file.read()
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    # Deliberately best-effort: report the problem and keep the notebook running
    print(f"An error occurred: {e}")
else:
    # Only reached when the file was read without raising
    print("Instructions successfully read!")


Environment variable HUGGINGFACE_TOKEN set.
Instructions successfully read!


In [1]:
# Checkpoint used for both the tokenizer and the causal language model
model_name = "microsoft/Phi-3-mini-128k-instruct"

# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally — confirm this is still required for this checkpoint.
hub_options = {"trust_remote_code": True}

# The tokenizer is loaded first, then the model, both with the same options
tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)
model = AutoModelForCausalLM.from_pretrained(model_name, **hub_options)

# Load the encoded dataset: Parquet is tried first, the CSV export is the fallback
try:
    parquet_path = paths['data'] / "interim/encoded.parquet"
    data = pd.read_parquet(parquet_path)
except Exception as parquet_error:
    # Any failure while reading the Parquet file is reported, then the CSV is read
    print(f"Failed to load parquet file: {parquet_error}. Loading CSV instead.")
    csv_path = paths['data'] / "interim/encoded.csv"
    data = pd.read_csv(csv_path)


# Tokenize the dataset
def tokenize_function(batch):
    """Tokenize the ``"job_title"`` entry of ``batch`` with the module-level tokenizer.

    Args:
        batch: Mapping that provides the text(s) to encode under ``"job_title"``.

    Returns:
        Whatever the tokenizer returns when called with padding to ``max_length``,
        truncation enabled and a maximum length of 256.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    job_titles = batch["job_title"]
    return tokenizer(job_titles, **encode_options)

NameError: name 'AutoTokenizer' is not defined

In [1]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# Checkpoint to fine-tune; the model is loaded first, then its tokenizer
model_name = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding requires a pad token: when none is configured, reuse the
# end-of-sequence token for that role
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

# Encode every entry of data["job_title"] on its own.
# Assumes `data` is a pandas DataFrame with a "job_title" column.
encode_options = {
    "max_length": 128,        # adjust based on expected input length
    "truncation": True,
    "padding": "max_length",  # pad every sequence to the same length
    "return_tensors": "pt",
}
tokenized_data = [tokenizer(title, **encode_options) for title in data["job_title"]]

# squeeze(0) removes the leading size-1 dimension of each per-title tensor
input_ids = [encoded["input_ids"].squeeze(0) for encoded in tokenized_data]
attention_masks = [encoded["attention_mask"].squeeze(0) for encoded in tokenized_data]

# One dictionary per example, pairing the ids with their attention mask
tokenized_dataset = [
    {"input_ids": example_ids, "attention_mask": example_mask}
    for example_ids, example_mask in zip(input_ids, attention_masks)
]

# Define a custom dataset class
class TokenizedDataset(Dataset):
    """Thin ``Dataset`` wrapper that serves pre-tokenized examples by index."""

    def __init__(self, tokenized_data):
        """Keep a reference to ``tokenized_data``, an indexable, sized collection of examples."""
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Return how many examples the wrapped collection holds."""
        example_count = len(self.tokenized_data)
        return example_count

    def __getitem__(self, idx):
        """Return the example stored at position ``idx`` of the wrapped collection."""
        example = self.tokenized_data[idx]
        return example

# Wrap the tokenized examples so they can be handed to the Trainer
train_dataset = TokenizedDataset(tokenized_dataset)

# Output directory and training settings, collected in one place
model_dir = "./models/Phi3-mini/"
training_options = {
    "output_dir": model_dir,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "save_steps": 5000,
    "save_total_limit": 2,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# Collator for language modeling, built with mlm=False
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Assemble the Trainer from the model, arguments, collator and dataset defined above
trainer_options = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": train_dataset,
}
trainer = Trainer(**trainer_options)

# Run the training loop
trainer.train()

# Write the model first, then the tokenizer, into the same output directory
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(model_dir)

# Generate results
search_phrase = 'aspiring human resources'
location = "New York"  # NOTE(review): not referenced anywhere in the visible code

# Define the task for the model.
# NOTE(review): this rebinds `instructions`, replacing any value assigned to that
# name earlier in the session.
instructions = "Rank the candidates based on their job_title against our search term using cosine similarity. The higher the score, the better the match. Include the cosine similarity scores. Return the top 5 candidates in markdown format. Do not show intermediary responses, nor the reasoning, only show the final table result."

# Format inputs. The sample size is capped at the number of rows so that a frame
# with fewer than 15 rows does not make `.sample` raise.
sample_size = min(15, len(data))
data_sample = data['job_title'].sample(sample_size, random_state=42).to_list()
messages = f"Instructions: {instructions}\n\nsearch term:{search_phrase}\n\nCandidates: {data_sample}"
inputs = tokenizer(messages, return_tensors="pt", padding=True, truncation=True)

# The tokenizer returns CPU tensors, while the Trainer may have placed the model
# on an accelerator; move the inputs to the model's device to avoid a device
# mismatch inside `generate`.
inputs = inputs.to(model.device)

# Training leaves the model in train mode; switch to eval mode for inference
model.eval()

# Generate a response. `max_length` bounds prompt tokens plus generated tokens.
outputs = model.generate(**inputs, max_length=800+1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards:   0%|          | 0/2 [00:08<?, ?it/s]


OSError: [Errno 28] No space left on device