## Importing Libraries

In [2]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time
import torch
import transformers
import pandas as pd
from datetime import date, datetime

from transformers import AutoTokenizer
from transformers import TFAutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import TFBertLMHeadModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

In [83]:
# test whether torch is working
device = torch.device("cuda")
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070 SUPER'


#### If you want to download a model or a tokenizer from Huggingface Hub:

In [None]:
# Logging into HuggingFace account
from huggingface_hub import login
login()
# my_token:  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

## Model and Tokenizer declaration

In [68]:
# model_name = "bert-base-uncased"
model_name = "EleutherAI/pythia-70m"

In [69]:
# model = TFBertLMHeadModel.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [70]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [71]:
tokenizer.pad_token = tokenizer.eos_token    # for Eleuther model use this
# tokenizer.pad_token = tokenizer.cls_token    # for Bert model use this

## Preparing the dataset for Fine-tuning

Provide a Json Lines (Jsonl) file containing question and answer pairs; like this:

{"question": "what could be a use case of LLM for a hospital", "answer": "LLMs can have several applications in a hospital such as Assisstant for nurses"}

{"question": "How AI is being used in medical industries?", "answer":" AI is being used in medical industries for various purposes such as disease diagnose"}

In [72]:
dataset_path = "ds.jsonl"

In [73]:
# dataset_df = pd.read_json("ds.jsonl", lines=True)
# dict_dataset = df.to_dict()
# print("dataset contains " + str(len(dict_dataset['question'])) + " Q & A")
# dict_dataset

In [74]:
# padding and truncating mey be used if model input exceeds max_length
max_length = 2048

In [75]:
def tokenize_function(dict_dataset):
    if "question" in dict_dataset and "answer" in dict_dataset:
      text = dict_dataset["question"][0] + dict_dataset["answer"][0]
    elif "input" in dict_dataset and "output" in dict_dataset:
      text = dict_dataset["input"][0] + dict_dataset["output"][0]
    else:
      text = dict_dataset["text"][0]

    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [76]:
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [77]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [78]:
tokenized_dataset[0]

{'question': "What are the different types of documents available in the repository (e.g., installation guide, API documentation, developer's guide)?",
 'answer': 'Lamini has documentation on Getting Started, Authentication, Question Answer Model, Python Library, Batching, Error Handling, Advanced topics, and class documentation on LLM Engine available at https://lamini-ai.github.io/.',
 'input_ids': [1276,
  403,
  253,
  1027,
  3510,
  273,
  7177,
  2130,
  275,
  253,
  18491,
  313,
  70,
  15,
  72,
  904,
  12692,
  7102,
  13,
  8990,
  10097,
  13,
  13722,
  434,
  7102,
  6177,
  45,
  4988,
  74,
  556,
  10097,
  327,
  27669,
  11075,
  264,
  13,
  5271,
  23058,
  13,
  19782,
  37741,
  10031,
  13,
  13814,
  11397,
  13,
  378,
  16464,
  13,
  11759,
  10535,
  1981,
  13,
  21798,
  12989,
  13,
  285,
  966,
  10097,
  327,
  21708,
  46,
  10797,
  2130,
  387,
  5987,
  1358,
  77,
  4988,
  74,
  14,
  2284,
  15,
  7280,
  15,
  900,
  14206],
 'attention_mas

In [79]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [80]:
train_ds = split_dataset["train"]
test_ds = split_dataset["test"]
print("Training Dataset:\n", train_ds)
print("Testing Dataset:\n", test_ds)

Training Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})
Testing Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


### PyTorch Model:

In [109]:
# Trainer class to include logging and history
class Trainer(transformers.Trainer):
    def __init__(
        self,
        model,
        model_flops,
        total_steps,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
    ):
        super(Trainer, self).__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            compute_metrics,
            callbacks,
            optimizers,
        )

        self.total_steps = total_steps
        self.model_flops = model_flops
        self.start_step = 0

    def training_step(self, model, inputs):
        if inputs["input_ids"].numel() == 0:

          print("Inputs: ", inputs)
          print("Inputs - input_ids", inputs["input_ids"])
          print("numel", inputs["input_ids"].numel())

          return torch.tensor(0)
        else:
          model.train()
          inputs = self._prepare_inputs(inputs)

          with self.compute_loss_context_manager():
              loss = self.compute_loss(model, inputs)

          if self.args.n_gpu > 1:
              loss = loss.mean()  # mean() to average on multi-gpu parallel training

          if self.do_grad_scaling:
              self.scaler.scale(loss).backward()
          else:
              self.accelerator.backward(loss)

          return loss.detach() / self.args.gradient_accumulation_steps

    def log(self, logs):
        """
        Log `logs` on the various objects watching training.
        Subclass and override this method to inject custom behavior.
        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)

        self.update_log_timing(logs)

        output = {**logs, **{"step": self.state.global_step}}
        self.update_history(output)

        logger.debug("Step (" + str(self.state.global_step) + ") Logs: " + str(logs))
        self.control = self.callback_handler.on_log(
            self.args, self.state, self.control, logs
        )

    def update_log_timing(self, logs):
        if len(self.state.log_history) == 0:
            self.start_time = time.time()
            logs["iter_time"] = 0.0
            logs["flops"] = 0.0
            logs["remaining_time"] = 0.0
            self.start_step = self.state.global_step
        elif self.state.global_step > self.start_step:
            logs["iter_time"] = (time.time() - self.start_time) / (
                self.state.global_step - self.start_step
            )
            logs["flops"] = self.model_flops / logs["iter_time"]
            logs["remaining_time"] = (self.total_steps - self.state.global_step) * logs[
                "iter_time"
            ]

    def update_history(self, output):
        if "eval_loss" in output:
            return
        if len(self.state.log_history) > 0:
            smoothing_window = 100
            p = 1.0 / smoothing_window
            if "loss" in output:
                output["loss"] = output["loss"] * p + self.state.log_history[-1][
                    "loss"
                ] * (1.0 - p)
        self.state.log_history.append(output)

In [110]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [111]:
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [112]:
test_text = test_ds[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_ds[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Correct answer from Lamini docs: Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
Model's answer: 


I have a question about the following:

How do I get the correct documentation to work?

A:

I think you need to use the following code:

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following


#### Setup training

In [113]:
max_steps = 1000

In [114]:
trained_model_name = f"{model_name}_{max_steps}_steps_{date.today()}--{datetime.now().strftime('%H;%M')}"
output_dir = trained_model_name

In [115]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=20,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [116]:
model_flops = (
  model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, max_length)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [119]:
trainer = Trainer(
    model=model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
logger = logging.getLogger(__name__)

In [120]:
training_output = trainer.train()

Step,Training Loss,Validation Loss,Time,Unnamed: 4
120,2.785011,2.336461,142.115101,13595934958324.592
240,2.432526,2.198434,125.020408,13347481141061.871
360,2.164942,2.149117,106.818674,13155259694826.115
480,1.983753,2.108783,87.745057,13012097808168.86
600,1.899057,2.08544,68.563866,12809474871045.729
720,1.77509,2.083733,47.537109,12932780348357.844
840,1.708021,2.063934,27.176818,12926710406766.826
960,1.633429,2.059559,6.868385,12787098297236.975


Inputs:  {'input_ids': tensor([], device='cuda:0', size=(1, 0)), 'attention_mask': tensor([], device='cuda:0', size=(1, 0)), 'labels': tensor([], device='cuda:0', size=(1, 0))}
Inputs - input_ids tensor([], device='cuda:0', size=(1, 0))
numel 0
Inputs:  {'input_ids': tensor([], device='cuda:0', size=(1, 0)), 'attention_mask': tensor([], device='cuda:0', size=(1, 0)), 'labels': tensor([], device='cuda:0', size=(1, 0))}
Inputs - input_ids tensor([], device='cuda:0', size=(1, 0))
numel 0
Inputs:  {'input_ids': tensor([], device='cuda:0', size=(1, 0)), 'attention_mask': tensor([], device='cuda:0', size=(1, 0)), 'labels': tensor([], device='cuda:0', size=(1, 0))}
Inputs - input_ids tensor([], device='cuda:0', size=(1, 0))
numel 0


In [121]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: EleutherAI/pythia-70m_1000_steps_2023-08-30--08;16/final


In [123]:
finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_model.to(device) 

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [125]:
test_question = test_ds[0]['question']
print("Question input (test):", test_question)

print("Finetuned model's answer: ")
print(inference(test_question, finetuned_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Finetuned model's answer: 
Yes, Lamini can generate technical documentation or user manuals for software projects. Lamini can generate technical documentation or user manuals for software projects. It can also generate documentation or user-provided documentation for software projects. It can also generate documentation for software projects. It can also generate documentation for software projects. It can also generate documentation for software projects. It can also generate documentation for software projects. It can also


### Tensorflow Model:

In [134]:
# It may need case-wise considerations:
if model_name == "bert-base-uncased":
    model = TFBertLMHeadModel.from_pretrained("bert-base-uncased")
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_name)

If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`
All PyTorch model weights were used when initializing TFBertLMHeadModel.

All the weights of TFBertLMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


In [135]:
model.compile(optimizer=Adam(3e-5))

In [None]:
model.fit(tokenized_data, labels)

In [None]:
model.predict()