## SET UP

### DL LIBRARIES

In [None]:
!python -m pip install --upgrade pip
!pip install . -e

!pip install guidance==0.1.10
# !pip install datasets
!pip install -U scipy
!pip install git+https://github.com/huggingface/transformers.git -U #Necessary for merging LoRA adapters onto quantized models.
# !pip install -q -U transformers # if you are facing issues with the dev branch above
!pip install -U bitsandbytes
# !pip install -U peft
!pip install -U accelerate
# !pip install torch==2.1.2
!pip install flash-attn -U
!pip install sqlalchemy pandas psycopg2-binary huggingface_hub -U
!pip install huggingface-cli

### LOAD LIBRARIES

In [1]:
import pandas as pd
import torch
import time
import json

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import JSONB, TIMESTAMP
from sqlalchemy.types import Integer, Text

from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from tqdm import tqdm

from huggingface_hub import snapshot_download, notebook_login
from guidance import models, gen, select

from jobsearch.params import *
from jobsearch.database import *
from jobsearch.utils import read_file, create_tagged_prompts

In [2]:
# Log in to the Hugging Face Hub with the token referenced by HF_TOKEN.
# NOTE(review): HF_TOKEN is presumably provided by `jobsearch.params` or the shell environment — confirm.
!huggingface-cli login --token $HF_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/axelus/.cache/huggingface/token
Login successful


### SET UP VARS

In [3]:
# Hub repository used for inference; uncomment exactly one REPO_ID line.
# REPO_ID = "axel-rda/Mistral-7B-Instruct-v0.1-qlora-sft"
# REPO_ID = "axel-rda/ARIA-70B-V3-bnb-4bit-nf4-bfloat16-qlora-sft" ## fine-tuning V1
# REPO_ID = "axel-rda/ARIA-70B-V3-bnb-4bit-nf4-bfloat16-qlora-sft-qlora-sft-ft_num-2" ## fine-tuning V2
# REPO_ID = "axel-rda/ARIA-70B-V3-4.0bpw-exl2"
REPO_ID = "openai-community/gpt2"
# REPO_ID = "openai-community/gpt2-medium"

# "<user>/<model>": the first part is the Hub account, the second names the local folder.
_repo_id_parts = REPO_ID.split('/')
REPO_USER_NAME = _repo_id_parts[0]
LOCAL_SAVE_DIR = "../../data/models/" + _repo_id_parts[1]

### DL DATA

In [4]:
# Fetch the data via the project helper; import_from_cloud=False is passed here
# (the captured output reports the LOCAL postgres database being used).
data=fetch_data_from_postgresql(import_from_cloud=False)

Using LOCAL postgres database ...


In [5]:
# Work on a copy so the fetched `data` object itself is left unmodified by later cells.
df = data.copy()

In [7]:
# Read the salary-extraction instruction prompt from disk with the project helper `read_file`.
# NOTE(review): presumably returns the file content as a single string — confirm in jobsearch.utils.
file_path = "../../data/prompts/salary_extraction_instructions.txt"
instructions = read_file(file_path)

### DL MODEL AND TOKENIZER

In [8]:
# Download the repository snapshot into LOCAL_SAVE_DIR, skipping every file that matches
# one of the ignore patterns. The call is the cell's last expression, so its return value is displayed.
snapshot_download(
    repo_id=REPO_ID,
    local_dir=LOCAL_SAVE_DIR,
    cache_dir=LOCAL_SAVE_DIR,
    resume_download=False,
    max_workers=13,
    ignore_patterns=[
        "onnx/*",
        "*.bin",
        "*.tflite",
        "*.h5",
        "*.ot",
        "*.onnx",
        "*.msgpack",
    ],
)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

onnx/tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

onnx/generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

onnx/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

onnx/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

'/home/axelus/data_science_projects/gg_job_search/data/models/gpt2'

## CLASSIC INFERENCE  

### LOAD MODEL AND TOKENIZER WITH HF AUTOCLASS

In [23]:
# Load the causal LM and its tokenizer from the local snapshot directory.
model = AutoModelForCausalLM.from_pretrained(
    LOCAL_SAVE_DIR,
    device_map='auto',
    cache_dir=LOCAL_SAVE_DIR,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(LOCAL_SAVE_DIR, use_fast=True)

print(model.generation_config)
print(model.config)

# The context size is a property of the config, not of the model object, and the former
# `model.n_ctx == 2048` was a comparison whose result was never used. Display the real limit instead.
model.config.max_position_embeddings

GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

GPT2Config {
  "_name_or_path": "../../data/models/gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 

In [None]:
# Wrap the local checkpoint in a guidance model.
# NOTE(review): this rebinds `model`; the classic-inference cell below calls `model.generate(...)`
# and presumably needs the Hugging Face model loaded above — confirm before running both cells in order.
model = models.Transformers(
    LOCAL_SAVE_DIR,
    device_map='auto',
    do_sample=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    temperature=0.01,
)
# tokenizer = AutoTokenizer.from_pretrained(LOCAL_SAVE_DIR, use_fast=True)

### SET UP DATASET / BATCH PARAMETERS

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of texts.
        tokenizer: Callable returning a mapping with "input_ids" and
            "attention_mask" tensors that carry a leading batch dimension.
        max_length: Truncation limit forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return 1-D ``input_ids`` / ``attention_mask`` tensors."""
        text = self.texts[idx]
        # A single text has nothing to be padded against; padding across items is done
        # at batch level by the collate function.
        inputs = self.tokenizer(text, max_length=self.max_length, truncation=True, padding=False, return_tensors="pt")
        return {
            # squeeze(0) removes only the batch dimension. A bare squeeze() would also collapse
            # a one-token sequence into a 0-d tensor, which cannot be padded afterwards.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }
        
def pad_collate(batch, pad_token_id=None):
    """Collate variable-length items into right-padded batch tensors.

    Args:
        batch: Sequence of dicts holding 1-D "input_ids" and "attention_mask" tensors.
        pad_token_id: Value used to pad "input_ids". When omitted, the pad id of the
            notebook-level ``tokenizer`` is used, which keeps ``collate_fn=pad_collate`` working.

    Returns:
        Dict with "input_ids" and "attention_mask" tensors of shape (batch, longest_sequence).

    Raises:
        ValueError: If no pad token id is available.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        raise ValueError("No pad token id available: set tokenizer.pad_token or pass pad_token_id.")

    input_ids = pad_sequence([item["input_ids"] for item in batch], batch_first=True, padding_value=pad_token_id)
    # Padded positions get mask value 0.
    attention_mask = pad_sequence([item["attention_mask"] for item in batch], batch_first=True, padding_value=0)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }


In [None]:
# Upper bound for tokenized prompts: at most 4096 tokens, but never more than the tokenizer
# reports the model can handle (the config printed above shows n_positions = 1024 for the loaded model).
MAX_LENGTH = min(4096, tokenizer.model_max_length)
BATCH_SIZE = 4
# Pad with the tokenizer's own unknown token (falling back to EOS) instead of the literal '<unk>',
# which is not guaranteed to be part of every vocabulary.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

# dataset = TextDataset(prompts, tokenizer, MAX_LENGTH)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)

### INFERENCE

In [None]:
# generated_sequences = []
# with torch.no_grad():
#     for batch in tqdm(dataloader, desc="Generating text"):
#         input_ids = batch["input_ids"].to("cuda")
#         # attention_mask = batch["attention_mask"].to("cuda")

#         # Generate sequences for the current batch
#         generated = model.generate(input_ids=input_ids, max_new_tokens=250, temperature=0.01)

#         # Process each item in the batch
#         for input_id, gen in zip(input_ids, generated):
#             # Find the length of the input (excluding padding)
#             input_len = torch.sum(input_id != tokenizer.pad_token_id).item()  # Count non-pad tokens

#             # Decode only the generated part, not the original prompt
#             generated_part = gen[input_len:]  # Skip over input tokens
#             generated_text = tokenizer.decode(generated_part, skip_special_tokens=True)
#             generated_sequences.append(generated_text)


MAX_NEW_TOKENS = 100

inputs = []
outputs = []

streamer = TextStreamer(tokenizer, skip_prompt=False)

descriptions = df.description.sample(4).to_list()

# NOTE(review): create_tagged_prompts is assumed to return one prompt string per description — confirm.
prompts = create_tagged_prompts(instructions, descriptions)

# Keep room for the continuation: prompt tokens plus new tokens must fit in the context window.
max_prompt_length = max(1, min(MAX_LENGTH, tokenizer.model_max_length) - MAX_NEW_TOKENS)

for prompt in prompts:
    # Tokenize one prompt at a time, so no padding tokens end up between the prompt
    # and the generated continuation.
    encoded = tokenizer(
        prompt,
        max_length=max_prompt_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    ).to(model.device)  # follow the device chosen by device_map instead of hard-coding 'cuda'

    # Greedy decoding: a temperature only has an effect when sampling is enabled.
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generated_tokens = output[0][encoded["input_ids"].shape[1]:]

    # Store the prompt itself (the former `input` referred to the Python builtin).
    inputs.append(prompt)
    outputs.append(output)

print('End of text generation')

## INFERENCE WITH GUIDANCE

In [25]:
# Rebind `model` to a guidance model built from the local checkpoint directory;
# the guidance cells below pass this object to salary_json_extractor.
model = models.Transformers(LOCAL_SAVE_DIR)

In [18]:
def salary_json_extractor(model, instructions, input):
  """Append a JSON salary template to the prompt and let the model fill in its fields.

  Args:
    model: guidance model object; text and guidance functions are appended to it with ``+``.
    instructions: Instruction text placed at the start of the prompt.
    input: Text appended after the instructions, separated by a single space.

  Returns:
    The guidance model state obtained after appending the prompt and the template.
  """
  lm = model + f"{instructions} {input}"

  frequence_versement = ["jour", "semaine", "mois", "an"]
  devise = ["€", "$", "£"]

  # Raw string, so the regex escapes are not interpreted as (invalid) string escapes.
  only_float_pattern = r"\d+\.\d{1,2}|null"
  # The first positional argument of gen() is the capture name, so the pattern has to be
  # passed as `regex=` to actually constrain the generated amounts.
  lm += f"""
  {{
    "salaire": [
      {{
        "montant_min": "{gen(name='montant_min', regex=only_float_pattern, temperature=0.01)}",
        "montant_max": "{gen(name='montant_max', regex=only_float_pattern, temperature=0.01)}",
        "devise": "{select(devise, name='devise')}",
        "frequence_versement" : "{select(frequence_versement, name='frequence_versement')}"
      }}
    ]
  }}
  """
  return lm

In [19]:
outputs = []
# Loop variable is named `description` so the builtin input() is not shadowed at notebook level.
for description in df.sample(5)['description'].to_list():
    output = salary_json_extractor(model, instructions, description)
    outputs.append(output)

2024-04-22 17:43:24.028841: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 17:43:24.028980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 17:43:24.079842: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-22 17:43:24.183685: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Token indices sequence length is longer than the spec

AssertionError: 

In [None]:
# Last sentence of the instructions: everything after it is the job offer followed by the JSON template.
prompt_end_marker = "Commence immédiatement ta réponse en JSON valide après la prochaine offre d'emploi :"
# Start of the JSON template appended by salary_json_extractor. Single-quoted so the inner
# double quotes do not terminate the literal (the previous form was a syntax error).
json_start_marker = '{\n    "salaire"'

# Show only the job offer text of the third result.
str(outputs[2]).split(prompt_end_marker)[1].split(json_start_marker)[0]