## Importing Libraries

In [392]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time
import torch
import transformers
import pandas as pd
from datetime import date, datetime

from transformers import AutoTokenizer
from transformers import TFAutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import TFBertLMHeadModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

In [393]:
# test whether torch is working
device = torch.device("cuda")
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070 SUPER'


#### If you want to download a model or a tokenizer from Huggingface Hub:

In [225]:
# Logging into HuggingFace account
from huggingface_hub import login
login()
# my_token:  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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Model and Tokenizer declaration

In [363]:
# model_name = "bert-base-uncased"
model_name = "EleutherAI/pythia-70m"

In [364]:
# model = TFBertLMHeadModel.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [365]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [366]:
tokenizer.pad_token = tokenizer.eos_token    # for Eleuther model use this
# tokenizer.pad_token = tokenizer.cls_token    # for Bert model use this

## Preparing the dataset for Fine-tuning

Provide a Json Lines (Jsonl) file containing question and answer pairs; like this:

{"question": "what could be a use case of LLM for a hospital", "answer": "LLMs can have several applications in a hospital such as Assisstant for nurses"}

{"question": "How AI is being used in medical industries?", "answer":" AI is being used in medical industries for various purposes such as disease diagnose"}

In [367]:
# dataset_path = "ds.jsonl"
dataset_path = "ds-sat.jsonl"

In [368]:
# dataset_df = pd.read_json("ds.jsonl", lines=True)
# dict_dataset = df.to_dict()
# print("dataset contains " + str(len(dict_dataset['question'])) + " Q & A")
# dict_dataset

In [369]:
# padding and truncating mey be used if model input exceeds max_length
max_length = 2048

In [370]:
def tokenize_function(dict_dataset):
    if "question" in dict_dataset and "answer" in dict_dataset:
      text = dict_dataset["question"][0] + dict_dataset["answer"][0]
    elif "input" in dict_dataset and "output" in dict_dataset:
      text = dict_dataset["input"][0] + dict_dataset["output"][0]
    else:
      text = dict_dataset["text"][0]

    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [371]:
finetuning_dataset_loaded["question"][0]

'Why is satellite imagery a valuable source of data for assessing maritime traffic??'

In [372]:
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 45
})


In [373]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [374]:
tokenized_dataset[0]

{'question': 'Why is satellite imagery a valuable source of data for assessing maritime traffic??',
 'answer': 'Satellite imagery is a valuable source of data for assessing maritime traffic because it allows access to information that is difficult to obtain by other means. Satellites provide a global geographic coverage and collect information periodically with high temporal resolution. This means that satellite images can provide a comprehensive view of maritime traffic in a given region over time.',
 'input_ids': [4967,
  310,
  15109,
  27471,
  247,
  9865,
  2603,
  273,
  941,
  323,
  18005,
  37223,
  7137,
  8220,
  20794,
  29873,
  27471,
  310,
  247,
  9865,
  2603,
  273,
  941,
  323,
  18005,
  37223,
  7137,
  984,
  352,
  4483,
  2289,
  281,
  1491,
  326,
  310,
  2834,
  281,
  4044,
  407,
  643,
  2097,
  15,
  11191,
  437,
  3254,
  2085,
  247,
  4156,
  23365,
  7031,
  285,
  4822,
  1491,
  28557,
  342,
  1029,
  11935,
  6064,
  15,
  831,
  2097,
  326,

In [375]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=14)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})


In [376]:
train_ds = split_dataset["train"]
test_ds = split_dataset["test"]
print("Training Dataset:\n", train_ds)
print("Testing Dataset:\n", test_ds)

Training Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 40
})
Testing Dataset:
 Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})


### PyTorch Model:

In [377]:
# Trainer class to include logging and history
class Trainer(transformers.Trainer):
    def __init__(
        self,
        model,
        model_flops,
        total_steps,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
    ):
        super(Trainer, self).__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            compute_metrics,
            callbacks,
            optimizers,
        )

        self.total_steps = total_steps
        self.model_flops = model_flops
        self.start_step = 0

    def training_step(self, model, inputs):
        if inputs["input_ids"].numel() == 0:

          print("Inputs: ", inputs)
          print("Inputs - input_ids", inputs["input_ids"])
          print("numel", inputs["input_ids"].numel())

          return torch.tensor(0)
        else:
          model.train()
          inputs = self._prepare_inputs(inputs)

          with self.compute_loss_context_manager():
              loss = self.compute_loss(model, inputs)

          if self.args.n_gpu > 1:
              loss = loss.mean()  # mean() to average on multi-gpu parallel training

          if self.do_grad_scaling:
              self.scaler.scale(loss).backward()
          else:
              self.accelerator.backward(loss)

          return loss.detach() / self.args.gradient_accumulation_steps

    def log(self, logs):
        """
        Log `logs` on the various objects watching training.
        Subclass and override this method to inject custom behavior.
        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)

        self.update_log_timing(logs)

        output = {**logs, **{"step": self.state.global_step}}
        self.update_history(output)

        logger.debug("Step (" + str(self.state.global_step) + ") Logs: " + str(logs))
        self.control = self.callback_handler.on_log(
            self.args, self.state, self.control, logs
        )

    def update_log_timing(self, logs):
        if len(self.state.log_history) == 0:
            self.start_time = time.time()
            logs["iter_time"] = 0.0
            logs["flops"] = 0.0
            logs["remaining_time"] = 0.0
            self.start_step = self.state.global_step
        elif self.state.global_step > self.start_step:
            logs["iter_time"] = (time.time() - self.start_time) / (
                self.state.global_step - self.start_step
            )
            logs["flops"] = self.model_flops / logs["iter_time"]
            logs["remaining_time"] = (self.total_steps - self.state.global_step) * logs[
                "iter_time"
            ]

    def update_history(self, output):
        if "eval_loss" in output:
            return
        if len(self.state.log_history) > 0:
            smoothing_window = 100
            p = 1.0 / smoothing_window
            if "loss" in output:
                output["loss"] = output["loss"] * p + self.state.log_history[-1][
                    "loss"
                ] * (1.0 - p)
        self.state.log_history.append(output)

In [378]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [379]:
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [380]:
test_text = test_ds[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_ds[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What is the significance of ship detection with satellite imagery?
Correct answer from Lamini docs: Ship detection with satellite imagery has various applications, such as maritime surveillance, vessel traffic management, illegal fishing detection, and monitoring trade activities.
Model's answer: 


The answer is that the satellite imagery is not a satellite imagery, but a satellite imagery. The satellite imagery is a satellite imagery of the moon, the moon, the moon, and the moon. The satellite imagery is a satellite imagery of the moon, the moon, and the moon. The satellite imagery is a satellite imagery of the moon, the moon, and the moon. The satellite imagery is a satellite imagery of the moon, the


#### Setup training

In [386]:
max_steps = 1000

In [387]:
# trained_model_name = f"{model_name}_{max_steps}_steps_{date.today()}--{datetime.now().strftime('%H;%M')}"
trained_model_name = f"{model_name}_{max_steps}_steps_Satellite-{date.today()}--{datetime.now().strftime('%H;%M')}"
output_dir = trained_model_name

In [388]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=2.0e-5,

  # Number of training epochs
  num_train_epochs=30,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [389]:
model_flops = (
  model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, max_length)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [394]:
trainer = Trainer(
    model=model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
logger = logging.getLogger(__name__)

In [395]:
training_output = trainer.train()

Step,Training Loss,Validation Loss,Time,Unnamed: 4
120,1.013942,4.997031,352.875124,5475556488160.919
240,0.381349,5.724802,300.313096,5556559329888.557
360,0.174673,6.009246,252.835046,5557882182907.407
480,0.105261,6.208739,204.001213,5596767033568.611
600,0.081003,6.297903,155.939001,5632119732029.252
720,0.071567,6.391783,108.683712,5656661681397.458
840,0.066342,6.495336,61.972798,5668726659740.709
960,0.062209,6.540971,15.458036,5681621531441.122


In [403]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: EleutherAI/pythia-70m_1000_steps_Satellite-2023-08-30--15;27/final


In [404]:
finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_model.to(device) 

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [407]:
test_instance = 3
test_question = test_ds[test_instance]['question']
test_answer = test_ds[test_instance]['answer']
print("Question input (test):", test_question)
print("\nAnswer from dataset (test):\n", test_answer)
print("\nBase model's answer:\n",inference(test_question, base_model, tokenizer))
print("\nFinetuned model's answer: ")
print(inference(test_question, finetuned_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What are some algorithms commonly used for sentiment segmentation of images?

Answer from dataset (test):
 Some common algorithms used for sentiment segmentation of images include Mask R-CNN, DeepLab, and U-Net. These algorithms leverage deep learning and convolutional neural network techniques to analyze and classify different regions of an image based on their sentiment or emotional content.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



Base model's answer:
 

A:

I think you should use a simple algorithm to get the best results.
The algorithm is called the "best" algorithm.
The algorithm is called the "best" algorithm.
The algorithm is called the "best" algorithm.
The algorithm is called the "best" algorithm.
The algorithm is called the "best" algorithm.
The algorithm is called the "best" algorithm.
The

Finetuned model's answer: 
Various algorithms can be used, including convolutional neural networks (CNNs), support vector machines (SVMs), random forests, or ensemble models, depending on the specific requirements and characteristics of the applied task. These algorithms can be used in various contexts, including sentiment segmentation, sentiment detection, and classification tasks, depending on the specific requirements and characteristics of the task. Additionally, some algorithms can perform better in some contexts, such as object detection


In [406]:
query = """How are machine learning models trained for ship detection?"""
print(inference(train_ds[5]['question'], model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Ship detection using satellite imagery is a process of automatically identifying and locating ships in satellite images using machine learning models and AI algorithms. It involves using machine learning models and AI algorithms to identify and locate ships in satellite images. This process can be challenging for ship detection because of the complexity of ship detection using satellite imagery. Additionally, ship detection using satellite imagery is not feasible for ship detection because of the low performance of SAR detection models. Additionally, ship detection using satellite


### Tensorflow Model:

In [None]:
# It may need case-wise considerations:
if model_name == "bert-base-uncased":
    model = TFBertLMHeadModel.from_pretrained("bert-base-uncased")
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_name)

In [None]:
model.compile(optimizer=Adam(3e-5))

In [None]:
model.fit(tokenized_data, labels)

In [None]:
model.predict()