## Importing Libraries

In [152]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time
import torch
import transformers
import pandas as pd
from datetime import date, datetime

from transformers import AutoTokenizer
from transformers import TFAutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import TFBertLMHeadModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

In [3]:
# Sanity check: report which CUDA device PyTorch can see.
# Guarded with `is_available()` so the cell shows a message instead of raising
# on a CPU-only machine (which would break "Restart Kernel -> Run All").
cuda_device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "CUDA is not available"
)
cuda_device_name

'NVIDIA GeForce RTX 2070 SUPER'

In [132]:
# Hugging Face checkpoint id: passed to `AutoTokenizer.from_pretrained` and used
# to select/load the model further down.
# NOTE(review): the tokenized sample output stored in this notebook may have been
# produced with a different checkpoint — re-run all cells after changing this value.
model_name = "bert-base-uncased"

## Preparing the dataset for Fine-tuning

Provide a JSON Lines (`.jsonl`) file containing question–answer pairs, one JSON object per line, like this:

{"question": "What could be a use case of an LLM for a hospital?", "answer": "LLMs can have several applications in a hospital, such as an assistant for nurses."}

{"question": "How is AI being used in the medical industry?", "answer": "AI is being used in the medical industry for various purposes, such as disease diagnosis."}

In [113]:
# Path of the JSON Lines file holding the fine-tuning examples; it is handed to
# `datasets.load_dataset("json", ...)` below. Relative to the notebook's working directory.
dataset_path = "ds.jsonl"

In [114]:
# dataset_df = pd.read_json("ds.jsonl", lines=True)
# dict_dataset = dataset_df.to_dict()
# print("dataset contains " + str(len(dict_dataset['question'])) + " Q & A")
# dict_dataset

In [115]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the EOS token for padding, but only when the tokenizer actually defines
# an EOS token. Assigning `None` would erase an existing pad token and make any
# later `padding=True` call fail.
if tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Upper bound (in tokens) for a tokenized example.
# NOTE(review): 2048 may exceed the position limit of the selected checkpoint — confirm.
max_length = 2048
max_length

2048

In [116]:
def tokenize_function(dict_dataset, max_length=2048):
    """Tokenize the first example of a batch coming from `Dataset.map`.

    The text is built from the first element of the ``question``/``answer``
    columns, else of the ``input``/``output`` columns, else of the ``text``
    column. Only element ``[0]`` of each column is read, so this function is
    meant to be mapped with ``batched=True, batch_size=1``.

    Args:
        dict_dataset: Mapping from column name to a list of values (one batch).
        max_length: Maximum number of tokens to keep. Longer texts are
            truncated from the left, i.e. the end of the text is preserved.
            NOTE(review): the default of 2048 may exceed the position limit of
            the selected checkpoint — confirm.

    Returns:
        The tokenizer output (NumPy tensors) for the concatenated text.

    Raises:
        KeyError: If none of the supported column layouts is present.

    Side effects:
        Mutates the notebook-level ``tokenizer``: sets ``truncation_side`` to
        ``"left"`` and, when an EOS token exists, uses it as the pad token.
    """
    if "question" in dict_dataset and "answer" in dict_dataset:
        text = dict_dataset["question"][0] + dict_dataset["answer"][0]
    elif "input" in dict_dataset and "output" in dict_dataset:
        text = dict_dataset["input"][0] + dict_dataset["output"][0]
    else:
        text = dict_dataset["text"][0]

    # Never overwrite the pad token with None: tokenizers without an EOS token
    # would otherwise lose their own pad token.
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    # A single truncating call is enough: for one text, capping at
    # min(actual_length, max_length) is the same as capping at max_length,
    # so the extra "measure the length first" tokenization pass is not needed.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

    return tokenized_inputs

In [117]:
# Read the JSON Lines file as a single `train` split.
finetuning_dataset_loaded = datasets.load_dataset(
    "json",
    data_files=dataset_path,
    split="train",
)

# Map one example per call: `tokenize_function` only reads element [0] of each batch.
tokenized_dataset = finetuning_dataset_loaded.map(tokenize_function, batched=True, batch_size=1, drop_last_batch=True)

print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [118]:
# The labels are a copy of the input ids.
# Guarded so that re-running this cell does not fail because the column already exists.
if "labels" not in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [119]:
# Display the first row to eyeball the original text fields next to their token ids.
tokenized_dataset[0]

{'question': "What are the different types of documents available in the repository (e.g., installation guide, API documentation, developer's guide)?",
 'answer': 'Lamini has documentation on Getting Started, Authentication, Question Answer Model, Python Library, Batching, Error Handling, Advanced topics, and class documentation on LLM Engine available at https://lamini-ai.github.io/.',
 'input_ids': [1276,
  403,
  253,
  1027,
  3510,
  273,
  7177,
  2130,
  275,
  253,
  18491,
  313,
  70,
  15,
  72,
  904,
  12692,
  7102,
  13,
  8990,
  10097,
  13,
  13722,
  434,
  7102,
  6177,
  45,
  4988,
  74,
  556,
  10097,
  327,
  27669,
  11075,
  264,
  13,
  5271,
  23058,
  13,
  19782,
  37741,
  10031,
  13,
  13814,
  11397,
  13,
  378,
  16464,
  13,
  11759,
  10535,
  1981,
  13,
  21798,
  12989,
  13,
  285,
  966,
  10097,
  327,
  21708,
  46,
  10797,
  2130,
  387,
  5987,
  1358,
  77,
  4988,
  74,
  14,
  2284,
  15,
  7280,
  15,
  900,
  14206],
 'attention_mas

In [121]:
# Hold out 10% of the rows as a test set; shuffling uses a fixed seed so the
# split is the same on every run.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=123,
)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [127]:
# Give each split its own name, then show the schema and row count of both.
train_ds, test_ds = split_dataset["train"], split_dataset["test"]

print("Training Dataset:\n", train_ds)
print("Testing Dataset:\n", test_ds)

Training Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})
Testing Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})



## Set up the model, training config, and tokenizer

In [137]:
# Logging into HuggingFace account
# SECURITY: never paste an access token into the notebook (not even in a comment).
# A token that was previously committed here must be treated as leaked and
# revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when it is not set,
# `login` receives None and falls back to its interactive prompt.
from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN"))

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### PyTorch Model:

In [136]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  """Generate a continuation of `text` and return only the newly generated part.

  Args:
    text: Prompt string.
    model: Model exposing `.device` and `.generate(...)`; assumed to return the
      prompt tokens followed by the new tokens (decoder-only behaviour).
    tokenizer: Tokenizer exposing `.encode(...)` and `.batch_decode(...)`.
    max_input_tokens: The prompt is truncated to at most this many tokens.
    max_output_tokens: Maximum number of tokens to generate after the prompt.

  Returns:
    The decoded text of the generated tokens, without the prompt.
  """
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  input_ids = input_ids.to(device)
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids,
    # Single, unpadded sequence: every position is a real token.
    attention_mask=torch.ones_like(input_ids),
    # `max_new_tokens` budgets only the continuation. `max_length` would also
    # count the prompt, so a long prompt could leave no room for an answer.
    max_new_tokens=max_output_tokens
  )

  # Strip the prompt at token level: the decoded prompt is not guaranteed to be
  # character-identical to `text` (normalisation, truncation), so slicing the
  # decoded string by len(text) can cut in the wrong place.
  prompt_length = input_ids.shape[1]
  generated_answer_tokens = generated_tokens_with_prompt[:, prompt_length:]

  # Decode
  generated_text_answer = tokenizer.batch_decode(generated_answer_tokens, skip_special_tokens=True)[0]

  return generated_text_answer

#### Setup training

In [138]:
# Step count that is embedded in the run/output-directory name below.
# NOTE(review): presumably also the training step budget — confirm where it is consumed.
max_steps = 10000

In [139]:
# Build a run name from the checkpoint id, the step count and a timestamp.
# One `datetime.now()` call provides both date and time, so they cannot disagree
# around midnight. The time uses "-" instead of ":" because a colon is not a
# valid character in Windows paths and is awkward in directory names generally.
# NOTE(review): a `model_name` containing "/" would create nested directories — confirm intent.
trained_model_name = f"{model_name}_{max_steps}_steps_{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
output_dir = trained_model_name

### Tensorflow Model:

In [134]:
# It may need case-wise considerations:
if model_name == "bert-base-uncased":
    model = TFBertLMHeadModel.from_pretrained("bert-base-uncased")
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_name)

If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`
All PyTorch model weights were used when initializing TFBertLMHeadModel.

All the weights of TFBertLMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


In [135]:
# Compile with Adam at a learning rate of 3e-5.
# NOTE(review): no `loss` is passed — presumably this relies on the model's
# built-in loss computed from the `labels` column; confirm.
model.compile(optimizer=Adam(3e-5))

In [None]:
# NOTE(review): `tokenized_data` and `labels` are not defined anywhere in the
# visible notebook (the prepared splits are `train_ds` / `test_ds`), so this cell
# fails on a fresh kernel unless they are created elsewhere — TODO wire in the
# prepared training split.
model.fit(tokenized_data, labels)

In [None]:
# NOTE(review): called without any inputs — looks like a placeholder; pass the
# tokenized evaluation inputs before running this cell.
model.predict()