In [1]:
import os
import glob
import re
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import time

In [2]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    # device = torch.device("cpu")
    print("MPS device is available")
else:
    print("MPS device is not available")
    device = torch.device("cpu")

MPS device is available


In [3]:
md_paths = '../../../../converted_nuno'
json_paths = '../../../../extracted'

In [4]:
# Use glob to list all .md files in the folder
markdown_files = glob.glob(os.path.join(md_paths, "*.md"))
np.random.shuffle(markdown_files)
os.makedirs(json_paths, exist_ok=True)

In [5]:
markdown_files

['../../../../converted_nuno/821473012_arso_2020.md',
 '../../../../converted_nuno/814190692_arso_2020.md',
 '../../../../converted_nuno/892378452_arso_2020.md',
 '../../../../converted_nuno/911668807_arso_2020.md',
 '../../../../converted_nuno/893519432_arso_2020.md',
 '../../../../converted_nuno/814343502_arso_2020.md',
 '../../../../converted_nuno/839263422_arso_2020.md',
 '../../../../converted_nuno/818659652_arso_2020.md',
 '../../../../converted_nuno/866288232_arso_2020.md',
 '../../../../converted_nuno/888892702_arso_2020.md',
 '../../../../converted_nuno/854052322_arso_2020.md',
 '../../../../converted_nuno/866767882_arso_2020.md',
 '../../../../converted_nuno/891864302_arso_2020.md',
 '../../../../converted_nuno/823758502_arso_2020.md',
 '../../../../converted_nuno/888965432_arso_2020.md',
 '../../../../converted_nuno/893271902_arso_2020.md',
 '../../../../converted_nuno/825072292_arso_2020.md',
 '../../../../converted_nuno/819349282_arso_2020.md',
 '../../../../converted_nuno

# model setup

In [6]:
# Specify the path to your LLaMA 3 model directory
model_path = "/Users/nunocalaim/.llama/llama-hf"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ensure the pad token is set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model = AutoModelForCausalLM.from_pretrained(model_path)
model.resize_token_embeddings(len(tokenizer))

# Set the pad_token_id and eos_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Print token IDs to confirm they are integers
print("pad_token_id:", model.config.pad_token_id)
print("eos_token_id:", model.config.eos_token_id)

# Make sure eos_token_id and pad_token_id are valid integers
assert isinstance(model.config.pad_token_id, int), "pad_token_id should be an integer."
assert isinstance(model.config.eos_token_id, int), "eos_token_id should be an integer."

# Move model to GPU
model.to(device)


pad_token_id: 128009
eos_token_id: 128009


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [7]:
def generate_response(prompt, json_path):
    # Tokenize the input text and create an attention mask
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    # Generate a response using the model
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=300,
        num_return_sequences=1,
        pad_token_id=model.config.pad_token_id  # Use pad_token_id from the model's config
    )

    # Decode and print the output text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Remove the input prompt from the generated response
    generated_text = generated_text[len(prompt)-100:].strip()

    # print("Response:", generated_text)

    # Use regex to extract the JSON block from the response
    json_str_match = re.search(r'{.*}', generated_text, re.DOTALL)

    if json_str_match:
        json_str = json_str_match.group(0)
        # Parse the JSON string
        try:
            json_data = json.loads(json_str)

            # Save the JSON data to a file
            with open(json_path, 'w') as json_file:
                json.dump(json_data, json_file, indent=4)  # Save the JSON with indentation for readability

            print(f"JSON data saved to '{json_path}'")
            print("Extracted JSON:", json_data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print(f"generated text is {generated_text}")
    else:
        print("No JSON found in the response")
        print(f"generated text is {generated_text}")

In [8]:
def run_model(content, json_path):
    prompt = f"""
    <|system|> You are a helpful assistant. You process tax records of Norwegian companies and extract specific information from them and insert it in a table with a JSON format.
    This is the tax record you will base your answers on: {content}
    You provide your response only in a predefined JSON format and nothing else. Do not include any explanation, comments, or extra information—only the JSON.
    The JSON structure should strictly follow this format:
    "json"
    {{
        "company_name": "string",
        "company_id": "string",
        "company_address": "string",
        "leadership": {{
            "CEO": "string or null",
            "board_members": [
                "string or null"
            ],
            "chairman_of_the_board": "string or null"
        }},
        "subsidiaries": [
            "string or null"
        ],
        "parent_company": "string or null",
        "property_ownership": {{
        "owns": ["property1", "property2", ...],
        "rents": ["property1", "property2", ...]   }}
        "number_of_employees": "integer or null" ,
        "profit_status": "boolean or null",
    }}
    <|user|> I want to extract the following information from the tax record:
        - "company_name": The name of the company.
        - "company_id": The company tax number.
        - "company_address": The company's address.
        - "leadership": A list of shareholder names. Including a nested dictionary with the CEO, Board members and chairman of the board as follows:
            -"CEO": Name of the CEO.
            -"board_members": A list of the board members.
            -"chairman_of_the_board": Name of the chairman of the board of the company.
        - "subsidiaries": A list of the subsidiaries of the company.
        - "parent_company": Name of the parent company.
        - "property_ownership": A dictionary with two keys, "owns" and "rents", where each contains a list of properties the company owns or rents, respectively. If there are no properties, return an empty list for that category.
        - "number_of_employees": The total number of employees.
        - "profit_status": A bolean indicating whether the company made profits.
    Remember, only return the JSON. No explanations.
    <|assistant|>
    """

    generate_response(prompt, json_path)

In [9]:
def process_md(path):
    match = re.search(r"/(\d+)_arso_2020", path)
    print(match)
    if match:
        id = match.group(1)  # group(1) will contain everything before "_arso_2020"
    else:
        return None
    json_path = f"{json_paths}/{id}.json"
    if not os.path.exists(json_path):
        print(f"processing {path}...")
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        run_model(content, json_path)

In [10]:
# for path in markdown_files:
#     start_time = time.time()  # Record the start time
#     print(path)
#     process_md(path)
#     time.sleep(4)

In [11]:

for path in markdown_files:
    start_time = time.time()  # Record the start time
    print(path)
    process_md(path)  # Your function to process the markdown file
    end_time = time.time()  # Record the end time
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    print(f"Time taken for {path}: {elapsed_time:.2f} seconds")
    time.sleep(4)

../../../../converted_nuno/821473012_arso_2020.md
<re.Match object; span=(26, 46), match='/821473012_arso_2020'>
processing ../../../../converted_nuno/821473012_arso_2020.md...


RuntimeError: MPS backend out of memory (MPS allocated: 13.83 GB, other allocations: 2.73 MB, max allowed: 18.13 GB). Tried to allocate 7.83 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
markdown_files