# **Part 1:  Document Conversion, OCR, and Preprocessing**

In [None]:
!pip install pytesseract pdf2image pillow

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.10


In [None]:
# System packages needed by the OCR cells below.
!apt-get update
# Tesseract OCR engine: the binary that pytesseract is pointed at later on.
!apt-get install -y tesseract-ocr
# German language data, needed because pdf_to_text defaults to language='deu'.
!apt-get install -y tesseract-ocr-deu
# Poppler command-line tools; pdf2image is expected to need them to render PDF pages — TODO confirm.
!apt-get install -y poppler-utils

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ubuntu.com (91.189.91.80% [Connecting to archive.ubuntu.com (91.189.91.83)] [Waiting for headers] [Connecting to ppa.launch                                                                                                    Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcontent.net (185.125.190.0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcontent.net (185.125.190.                                                                                                    Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [3 InRelease 6,932 B/129 kB 5%] [Connecting to ppa.launchpadcontent.net (18  

In [None]:
# Sanity check: print the path of the installed tesseract binary.
!which tesseract

/usr/bin/tesseract


Document conversion from PDF to plain text using OCR

In [None]:
import pytesseract
from pdf2image import convert_from_path
import os

In [None]:
# Tell pytesseract explicitly which tesseract binary to run
# (same path that `which tesseract` reported).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [None]:
def pdf_to_text(pdf_path, output_txt_path, language='deu'):
    """OCR every page of a PDF and write the recognised text to a file.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the UTF-8 text file to create or overwrite.
        language: Tesseract language code passed as ``lang`` (default
            ``'deu'``).

    Returns:
        The concatenated OCR text of all pages, i.e. the same string that
        is written to ``output_txt_path``.
    """
    # Render the PDF into a sequence of page images.
    pages = convert_from_path(pdf_path)

    # OCR page by page and build the document with a single join instead of
    # repeated string concatenation.
    full_text = "".join(
        pytesseract.image_to_string(page, lang=language) for page in pages
    )

    with open(output_txt_path, 'w', encoding='utf-8') as f:
        f.write(full_text)

    return full_text

In [None]:
from google.colab import drive

# Mount Google Drive so that files under /content/drive/MyDrive
# (sample PDFs, OCR outputs) are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# OCR sample1.pdf .. sample4.pdf from the mounted Drive and write the
# recognised text to output1.txt .. output4.txt in the same folder.
for i in (1, 2, 3, 4):
    pdf_path, output_txt_path = (
        f"/content/drive/MyDrive/sample{i}.pdf",
        f"/content/drive/MyDrive/output{i}.txt",
    )
    pdf_to_text(pdf_path, output_txt_path)

Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Load NLTK resources
nltk.download('punkt')

def preprocess_text(file_path, language='german'):
    """Read an OCR text file and return it as tokenized sentences.

    Args:
        file_path: Path of a UTF-8 text file (e.g. one written by the OCR step).
        language: Name of the NLTK Punkt model used for both sentence
            splitting and word tokenization. Defaults to ``'german'``
            because the OCR step defaults to German (``'deu'``); the
            English model splits sentences after German abbreviations such
            as "Nr." or "USt.". Pass ``'english'`` for English documents.

    Returns:
        A list of sentences, each sentence being a list of token strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Collapse every run of whitespace (newlines included) into one space and
    # trim the ends; a separate newline replacement would be redundant.
    text = re.sub(r'\s+', ' ', text).strip()

    # Sentence segmentation, then word tokenization, with the same language model.
    sentences = sent_tokenize(text, language=language)
    return [word_tokenize(sentence, language=language) for sentence in sentences]

# Tokenize the four OCR output files (output1.txt .. output4.txt); the result
# holds one list of tokenized sentences per document.
preprocessed_text = [
    preprocess_text(f'/content/drive/MyDrive/output{i}.txt')
    for i in range(1, 5)
]

# Print with a plain loop: a list comprehension around print() would leave a
# meaningless [None, None, ...] as the cell's output.
for document in preprocessed_text:
    print(document)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['[', 'Ihr', 'Logo', ']', 'Musterfirma', '—-', 'Musterstraße', '23', '—', '12345', ',', 'Musterhausen', 'Gutschriftsempfänger', 'Straße', ',', 'Hausnummer', 'PLZ', ',', 'Ort', 'Musterfirma', 'AG', 'Musterstraße', ',', '23', '12345', ',', 'Musterhausen', 'Datum', ':', '01.03.2019', 'Gutschrift', 'Nr', '.'], [':', '2019-1004', 'Ihre', 'UmSt', '.'], ['ID', ':', 'DE123456789', 'Gutschriftssdatum', 'entspricht', 'Liefer-/Leistungsdatum', 'Gutschrift', 'Sehr', 'geehrter', 'Herr', 'Schmidt', ',', 'gemäß', 'unserer', 'Vereinbarung', 'schreiben', 'wir', 'Ihnen', 'folgende', 'Leistungen', 'gut', ':', 'Position', 'Anzahl', 'Einheit', 'Bezeichnung', 'Einzelpreis', 'Gesamtpreis', '1', '5', 'Stück', 'Musterleistung', '3,00', '€', '15,00', '€', '2', '3', 'Stück', 'Musterleistung', '5,00', '€', '15,00', '€', 'Nettopreis', '30,00', '€', 'Zzgl', '.'], ['19', '%', 'USt', '.'], ['5,70€', 'Gutschriftbetrag', '35,70', '€', 'Wir', 'überweisen', 'Ihnen', 'den', 'Gutschriftbetrag', 'in', 'den', 'nächsten', 'T

[None, None, None, None]

# **Part 2: LLM-Powered Understanding and Actions**

In [None]:
# Pinned to 0.28: the cells below call openai.Completion and openai.ChatCompletion,
# which presumably need the pre-1.0 client — keep the pin unless those calls are migrated.
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [None]:
import getpass
import os

import openai

# Never hard-code the key in the notebook: read it from the OPENAI_API_KEY
# environment variable, and fall back to an interactive (non-echoing) prompt.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Alias used by the extraction cells: one tokenized document per entry.
text = preprocessed_text

In [None]:
def extract_relationships(text, engine="gpt-3.5-turbo-instruct", max_tokens=500):
    """Ask an OpenAI completion model for key/value relationships in ``text``.

    Args:
        text: The document to analyse. It is interpolated into the prompt with
            an f-string, so non-string values (e.g. nested token lists) are
            sent as their ``str()`` representation.
        engine: Completion engine name passed to ``openai.Completion.create``.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    response = openai.Completion.create(
        engine=engine,
        prompt=f"Extract relationships between entities from the following text make key value pair:\n\n{text}",
        max_tokens=max_tokens,
    )
    return response.choices[0].text.strip()

# Run the extraction for the first three documents (one API call each).
relationships = [extract_relationships(text[doc_index]) for doc_index in range(3)]

In [None]:
# Extract the fourth document separately and append it.
# NOTE: this mutates `relationships` in place, so re-running the cell appends another copy.
relationships.append(extract_relationships(text[3]))

In [None]:
# Print the first four extractions, each followed by a blank line. A plain
# loop avoids the [None, None, ...] output a print-comprehension leaves behind.
for extracted in relationships[:4]:
    print(extracted + '\n')

- Company name: Musterfirma
- Address: Musterstraße 23, 12345 Musterhausen
- Invoice number: 2019-1004
- VAT number: DE123456789
- Invoice date: 01.03.2019
- Customer: Herr Schmidt
- Payment method: bank transfer
- Account information: Konto
- Amount: 35,70 €
- Delivery date: 01.03.2019
- Item 1: Musterleistung
- Quantity: 5
- Unit price: 3,00 €
- Total price: 15,00 €
- Item 2: Musterleistung
- Quantity: 3
- Unit price: 5,00 €
- Total price: 15,00 €
- Net price: 30,00 €
- VAT rate: 19%
- VAT amount: 5,70 €
- Bank: Commerzbank
- VAT ID: DE24324567
- IBAN: DE3423 4562 3435 6765
- Register court: Amtsgericht Charlottenburg
- Company email: info@muster.de
- Website: www.firma.de
- Owner/Managing Director: Max Mustermann
- Phone number: +40 (0) 30 12345678
- Title: Inhaber/Geschäftsführer
- Contact email: info@muster.de
- Contact website: www.firma.de

"Billing Information": {'Date': '06/10/2021', 'Invoice Number': 'LS353348', 'Customer Name': 'Donec odio', 'Address': '84746 Buhler St, 3207

[None, None, None, None]

In [None]:
# Show the first three tokenized documents. A plain loop avoids the
# [None, None, None] output a print-comprehension leaves behind.
for document in preprocessed_text[:3]:
    print(document)

[['[', 'Ihr', 'Logo', ']', 'Musterfirma', '—-', 'Musterstraße', '23', '—', '12345', ',', 'Musterhausen', 'Gutschriftsempfänger', 'Straße', ',', 'Hausnummer', 'PLZ', ',', 'Ort', 'Musterfirma', 'AG', 'Musterstraße', ',', '23', '12345', ',', 'Musterhausen', 'Datum', ':', '01.03.2019', 'Gutschrift', 'Nr', '.'], [':', '2019-1004', 'Ihre', 'UmSt', '.'], ['ID', ':', 'DE123456789', 'Gutschriftssdatum', 'entspricht', 'Liefer-/Leistungsdatum', 'Gutschrift', 'Sehr', 'geehrter', 'Herr', 'Schmidt', ',', 'gemäß', 'unserer', 'Vereinbarung', 'schreiben', 'wir', 'Ihnen', 'folgende', 'Leistungen', 'gut', ':', 'Position', 'Anzahl', 'Einheit', 'Bezeichnung', 'Einzelpreis', 'Gesamtpreis', '1', '5', 'Stück', 'Musterleistung', '3,00', '€', '15,00', '€', '2', '3', 'Stück', 'Musterleistung', '5,00', '€', '15,00', '€', 'Nettopreis', '30,00', '€', 'Zzgl', '.'], ['19', '%', 'USt', '.'], ['5,70€', 'Gutschriftbetrag', '35,70', '€', 'Wir', 'überweisen', 'Ihnen', 'den', 'Gutschriftbetrag', 'in', 'den', 'nächsten', 'T

[None, None, None]

In [None]:
# Pair each tokenized document with its extracted relationships in a
# "prompt / response" text layout delimited by dashed lines.
prev_examples = [
    'prompt\n-----------\n'
    + str(preprocessed_text[example_index])
    + '\n-----------\n\n'
    + 'response\n-----------\n'
    + relationships[example_index]
    + '\n-----------'
    for example_index in range(4)
]

In [None]:
# Inspect the assembled examples (prints the whole list, so the output is long).
print(prev_examples)

["prompt\n-----------\n[['[', 'Ihr', 'Logo', ']', 'Musterfirma', '—-', 'Musterstraße', '23', '—', '12345', ',', 'Musterhausen', 'Gutschriftsempfänger', 'Straße', ',', 'Hausnummer', 'PLZ', ',', 'Ort', 'Musterfirma', 'AG', 'Musterstraße', ',', '23', '12345', ',', 'Musterhausen', 'Datum', ':', '01.03.2019', 'Gutschrift', 'Nr', '.'], [':', '2019-1004', 'Ihre', 'UmSt', '.'], ['ID', ':', 'DE123456789', 'Gutschriftssdatum', 'entspricht', 'Liefer-/Leistungsdatum', 'Gutschrift', 'Sehr', 'geehrter', 'Herr', 'Schmidt', ',', 'gemäß', 'unserer', 'Vereinbarung', 'schreiben', 'wir', 'Ihnen', 'folgende', 'Leistungen', 'gut', ':', 'Position', 'Anzahl', 'Einheit', 'Bezeichnung', 'Einzelpreis', 'Gesamtpreis', '1', '5', 'Stück', 'Musterleistung', '3,00', '€', '15,00', '€', '2', '3', 'Stück', 'Musterleistung', '5,00', '€', '15,00', '€', 'Nettopreis', '30,00', '€', 'Zzgl', '.'], ['19', '%', 'USt', '.'], ['5,70€', 'Gutschriftbetrag', '35,70', '€', 'Wir', 'überweisen', 'Ihnen', 'den', 'Gutschriftbetrag', 'in'

In [None]:
# High-level description of the target model; generate_system_message sends it
# (stripped) as the user message. NOTE: the name `prompt` is reassigned by later cells.
prompt = "A model that takes in text details of document in german(this can be recipt), and responds with a english classification of document(example recipet or certificate) it also translates it and explains in english the response should be in english."
# Sampling temperature; generate_system_message reads it as a global.
temperature = .4

In [None]:
def generate_system_message(prompt):
    """Turn a high-level model description into a concise system prompt.

    Args:
        prompt: Description of the model; surrounding whitespace is stripped
            before it is sent as the user message.

    Returns:
        The ``content`` of the first choice's message in the chat response.

    Note:
        The sampling temperature is taken from the module-level
        ``temperature`` variable rather than from an argument.
    """
    meta_instructions = "You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given $INPUT_DATA, you will $WHAT_THE_MODEL_SHOULD_DO.`.\n\nMake it as concise as possible. Include nothing but the system prompt in your response.\n\nFor example, never write: `\"$SYSTEM_PROMPT_HERE\"`.\n\nIt should be like: `$SYSTEM_PROMPT_HERE`."

    conversation = [
        {"role": "system", "content": meta_instructions},
        {"role": "user", "content": prompt.strip()},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=conversation,
        temperature=temperature,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message['content']

# Generate the inference-time system message from the high-level description
# (one ChatCompletion call; the result can differ between runs).
system_message = generate_system_message(prompt)

print(f'The system message is: `{system_message}`. Feel free to re-run this cell if you want a better result.')

The system message is: `Given a text document in German, you will classify the document (e.g., receipt or certificate) in English, translate it, and provide an English explanation.`. Feel free to re-run this cell if you want a better result.


In [None]:
import pandas as pd

# Initialize lists to store prompts and responses
prompts = []
responses = []
skipped_examples = 0

# Parse out prompts and responses from examples.
# Expected layout after splitting on the dashed delimiter:
#   [header, <prompt>, separator, <response>, trailer]
for example in prev_examples:
    parts = example.split('-----------') if isinstance(example, str) else []
    if len(parts) < 4:
        # Malformed example: skip it explicitly instead of swallowing every
        # exception. Checking before appending keeps the two lists aligned
        # (appending the prompt first could leave it without a response).
        skipped_examples += 1
        continue
    prompts.append(parts[1].strip())
    responses.append(parts[3].strip())

if skipped_examples:
    print('Skipped ' + str(skipped_examples) + ' malformed example(s).')

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

df.head()

There are 4 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,"[['[', 'Ihr', 'Logo', ']', 'Musterfirma', '—-'...",- Company name: Musterfirma\n- Address: Muster...
1,"[['06/10/2021', '.'], ['LS353348', ')', 'kmey'...","""Billing Information"": {'Date': '06/10/2021', ..."
2,"[['Firmenlogo', 'Max', 'Mustermann', '-', 'Mus...","{\n 'Name': 'Max Mustermann',\n 'Address..."
3,"[['Kraxi', 'GmbH', 'Flugzeugallee', '17', '123...",1. Entity 1: Kraxi GmbH\n Entity 2: Flugzeug...


In [None]:
# Split the data into train and test sets, with 90% in the train set
# Split the data into train and test sets, with 90% in the train set.
# NOTE: sample() rounds frac * len(df), so with very few rows (e.g. 4)
# every row lands in train_df and test_df ends up empty.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

# Save the dataframes to .jsonl files on the mounted Drive. The training
# configuration reads "/content/drive/MyDrive/train.jsonl"; writing to the
# working directory instead would leave that path missing or stale.
train_df.to_json('/content/drive/MyDrive/train.jsonl', orient='records', lines=True)
test_df.to_json('/content/drive/MyDrive/test.jsonl', orient='records', lines=True)

In [None]:
# Overwrite the generated system message with a fixed string (identical to the
# generated output captured earlier) — presumably so training does not depend on a fresh API call.
system_message = "Given a text document in German, you will classify the document (e.g., receipt or certificate) in English, translate it, and provide an English explanation."

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# --- Model and data locations ---
model_name = "NousResearch/llama-2-7b-chat-hf"  # or "meta-llama/Llama-2-7b-chat-hf" with Hugging Face key
dataset_name = "/content/drive/MyDrive/train.jsonl"
new_model = "llama-2-7b-custom"
# --- LoRA adapter settings (passed to LoraConfig) ---
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
# --- 4-bit quantization settings (passed to BitsAndBytesConfig) ---
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
# --- Training settings (passed to TrainingArguments unless noted) ---
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
# NOTE(review): defined here but not passed to TrainingArguments in the training cell.
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
# NOTE(review): warmup may have no effect with the "constant" scheduler — TODO confirm.
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
# --- SFTTrainer settings ---
max_seq_length = None
packing = False
# Device placement passed to from_pretrained when loading the quantized model.
device_map = {"":0}

In [None]:
def _format_llama2_chat(examples):
    """Build the chat-formatted training text for a batch of rows.

    Args:
        examples: Batched mapping with parallel 'prompt' and 'response' lists.

    Returns:
        A dict with a 'text' list holding one
        ``[INST] <<SYS>> ... <</SYS>> prompt [/INST] response`` string per row.
        The system block uses the module-level ``system_message``.
    """
    header = f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n'
    return {
        'text': [
            header + prompt + ' [/INST] ' + response
            for prompt, response in zip(examples['prompt'], examples['response'])
        ]
    }


train_dataset = load_dataset('json', data_files={'train': dataset_name})
# NOTE(review): the validation split is loaded from the same file as the
# training split, so the reported validation loss is measured on training data.
valid_dataset = load_dataset('json', data_files={'validation': dataset_name})

# Preprocess datasets: both splits share one formatter so they cannot drift apart.
train_dataset_mapped = train_dataset.map(_format_llama2_chat, batched=True)
valid_dataset_mapped = valid_dataset.map(_format_llama2_chat, batched=True)

# Resolve the dtype name from the config ("float16") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# Quantization settings built from the use_4bit / bnb_* config values.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the base model with the quantization config and the configured device map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# NOTE(review): presumably disabled because the cache is only useful for generation — TODO confirm.
model.config.use_cache = False
# NOTE(review): meaning of pretraining_tp = 1 is not evident from this notebook — TODO confirm.
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA adapter configuration built from the lora_* config values.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters (values come from the configuration cell; the
# reporting and evaluation settings at the end are fixed here).
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5  # Evaluate every 5 steps
)

# Set supervised fine-tuning parameters: the trainer reads the formatted
# "text" column of the mapped train/validation splits and applies the LoRA config.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped['train'],
    eval_dataset=valid_dataset_mapped['validation'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train the model
trainer.train()
# Save the trained model under the `new_model` directory; the merge cell later
# loads it from there with PeftModel.from_pretrained.
trainer.model.save_pretrained(new_model)

# Test the model
# Silence everything below CRITICAL from the transformers logger.
logging.set_verbosity(logging.CRITICAL)
# NOTE: this reassigns the global `prompt` (previously the high-level model description).
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n ich bin harsh . [/INST]"
# Quick smoke test with a short German sentence; max_length=200 caps the generation length.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




[INST] <<SYS>>
Given a text document in German, you will classify the document (e.g., receipt or certificate) in English, translate it, and provide an English explanation.
<</SYS>>

 ich bin harsh . [/INST]  Based on the text you provided, I can classify the document as a receipt. Here is the English translation:

"I am harsh"

Explanation: The text "ich bin harsh" is a German sentence that translates to "I am harsh" in English. It is a simple statement expressing a personal quality or trait.


In [None]:
from transformers import pipeline

prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n[['[', 'Ihr', 'Logo', ']', 'Musterfirma', '—-', 'Musterstraße', '23', '—', '12345', ',', 'Musterhausen', 'Gutschriftsempfänger', 'Straße', ',', 'Hausnummer', 'PLZ', ',', 'Ort', 'Musterfirma', 'AG', 'Musterstraße', ',', '23', '12345', ',', 'Musterhausen', 'Datum', ':', '01.03.2019', 'Gutschrift', 'Nr', '.'], [':', '2019-1004', 'Ihre', 'UmSt', '.'], ['ID', ':', 'DE123456789', 'Gutschriftssdatum', 'entspricht', 'Liefer-/Leistungsdatum', 'Gutschrift', 'Sehr', 'geehrter', 'Herr', 'Schmidt', ',', 'gemäß', 'unserer', 'Vereinbarung', 'schreiben', 'wir', 'Ihnen', 'folgende', 'Leistungen', 'gut', ':', 'Position', 'Anzahl', 'Einheit', 'Bezeichnung', 'Einzelpreis', 'Gesamtpreis', '1', '5', 'Stück', 'Musterleistung', '3,00', '€', '15,00', '€', '2', '3', 'Stück', 'Musterleistung', '5,00', '€', '15,00', '€', 'Nettopreis', '30,00', '€', 'Zzgl', '.'], ['19', '%', 'USt', '.'], ['5,70€', 'Gutschriftbetrag', '35,70', '€', 'Wir', 'überweisen', 'Ihnen', 'den', 'Gutschriftbetrag', 'in', 'den', 'nächsten', 'Tagen', 'auf', 'Ihr', 'Konto', '.'], ['Mit', 'freundlichen', 'Grüßen', 'Max', 'Mustermann', 'Musterfirma', 'GmbH', 'Kreditinstitut', ':', 'Commerzbank', 'USt-ID', ':', 'DE24324567', 'Musterstraße', ',', '23', 'IBAN', ':', 'DE3423', '4562', '3435', '6765', 'HRB', ':', '1234567B', '12345', ',', 'Musterhausen', 'BIC', ':', 'COBADEFFXXX', 'Amtsgericht', ':', 'Charlottenburg', 'Tel', ':', '+40', '(', '0', ')', '30', '12345678', 'Kto', '.'], ['Inh', '.'], [':', 'Max', 'Mustermann', 'Geschäftsführer', ':', 'Max', 'Mustermann', 'E-Mail', ':', 'info', '@', 'muster.de', 'Webseite', ':', 'www.firma.de']]. [/INST]" # replace the command here with something relevant to your task
num_new_tokens = 1000  # change to the number of new tokens you want to generate

# Limit the generation by the number of *new* tokens directly instead of
# tokenizing the prompt and computing a total max_length by hand, and let the
# pipeline return only the completion rather than stripping the prompt with
# str.replace (which silently does nothing if the decoded text does not
# reproduce the prompt character for character).
gen = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=num_new_tokens,
    return_full_text=False,
)
result = gen(prompt)
print(result[0]['generated_text'])

  Based on the provided text document in German, I can classify it as a receipt or certificate. The document appears to be a Gutschrift, which is a type of invoice or receipt in Germany.

The document includes various details such as the recipient's name and address, the date, and the amount of the Gutschrift. It also includes information about the goods or services provided, the prices, and the total amount due.

Here is a rough translation of the document into English:

"Dear Sir/Madam,

We are pleased to inform you that we have issued a Gutschrift for the following goods/services:

* Position: 3 Stück
* Anzahl: 5 Stück
* Einheit: Musterleistung
* Bezeichnung: Musterleistung
* Einzelpreis: 3,00 €
* Gesamtpreis: 15,00 €
* Nettopreis: 30,00 €

The Gutschrift amount is 5,70 €. We will transfer the Gutschrift amount to your account in the next few days.

Please find attached the Gutschrift for your reference.

Sincerely,
Max Mustermann
Musterfirma GmbH & Co. KG
Commerzbank USt-ID: DE2432

In [None]:
# Function to free GPU memory
def free_memory():
    """Run a garbage-collection pass, then empty PyTorch's CUDA cache.

    Returns None. Objects that are still referenced (for example by notebook
    globals) are not collected by this call.
    """
    from gc import collect as run_garbage_collector

    run_garbage_collector()
    torch.cuda.empty_cache()


free_memory()

In [None]:
# Merge and save the fine-tuned model

# Directory that receives the merged model and the tokenizer.
# NOTE(review): if the working directory is /content, this is the same directory
# as `new_model` ("llama-2-7b-custom"), where the trained adapter was saved — TODO confirm this overlap is intended.
model_path = "/content/llama-2-7b-custom"  # change to your preferred path

# Reload model in FP16 and merge it with LoRA weights
# (loaded on the CPU, without the 4-bit quantization config used for training).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
)
# Attach the adapter saved under `new_model`, then merge it into the base model.
# NOTE: this rebinds the global `model`.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model
model.save_pretrained(model_path)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('/content/llama-2-7b-custom/tokenizer_config.json',
 '/content/llama-2-7b-custom/special_tokens_map.json',
 '/content/llama-2-7b-custom/tokenizer.model',
 '/content/llama-2-7b-custom/added_tokens.json',
 '/content/llama-2-7b-custom/tokenizer.json')