In [52]:
import os
import pickle
from typing import List
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import pipeline
from tqdm.autonotebook import tqdm
from sklearn import metrics
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [53]:
config  = {
    "model":{
        "max_seq_len":2048,
        "dtype":None,
        "load_in_4bit":True,
        },
    "model_save":"saved_model",
    "seed":49,
}

In [54]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config["model_save"], # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = config["model"]["max_seq_len"],
    dtype = config["model"]["dtype"],
    load_in_4bit = config["model"]["load_in_4bit"],
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

In [55]:
dataset = load_dataset(
    "json",
    data_files={
        "valid":"final_data/valid.json",
        "test":"final_data/test.json",
    }
)

In [56]:
model.config

LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

In [57]:
dataset["test"][0]["text"], dataset["test"][4]["outputs"]

('### Instructions: Answer the following question about miRNA protein interactions. \n### Context: MicroRNAs (miRNAs) are, small non-coding RNAs with 18–25 nucleotides, which are central regulators at the \npost-transcriptional level in both animals and plants. Perfect or near-perfect complementary binding of miRNAs and their \ntarget mRNA negatively regulates gene expression by accelerating mRNA degradation or suppressing mRNA translation.\n### Question: Given the miRNA mature sequence and target amino acid sequence, predict whether \n(A) the miRNA and target do not interact (B) the miRNA and target interact\nmiRNA sequence: GGAUGGAGGAGGGGUCU\nTarget amino acid sequence: MGCRCCKIIQSYLFDPVQVPSPGYVNEVNSCKLDEDDTDKLKGKWSSEVLVQKNDPQRQGSKKTESSSRTADPWEPCWPHQGPLPQGDAGGEHHACGVNGIGPAATPQPTGNSSPTQDDRGSWASTANTVPPTQPFLEGGGTRKQDCVLLASEGTQVMRNGDSRAPSEAESFALEVQDHVFQIPAPDYLQHWGPAGDNVDHNEKDCVFKNHTEDESLEGIQPPVGEHGLNTPFSVRRSWDSLNEDVETEVLSICFNEKGPVHAMPVVDSGNRQEDTHGSDGDGDGEIVDEDAAVAEALAALEAATAGEDLDETD\n###

In [58]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer([
    "Please tell me the first 10 finbanoci numbers",
    ], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens =10, use_cache = True)
o = tokenizer.batch_decode(outputs)

In [59]:
def manual_model_response(datapoint, model, tokenizer):
    inputs = tokenizer([
    datapoint.strip(),
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
    return tokenizer.batch_decode(outputs)


def unnest_dictionary(d:dict, level:str=None, unnested_dict:dict = {}) -> dict:
    for k, v in d.items():
        if level:
            next_level=f"{level}_{k}"
        else:
            next_level = k
            
        if not isinstance(v,dict):
            #print(level,k)
            unnested_dict[next_level] = v
        else:
            #print(d,level,k,v,2)
            unnest_dictionary(v, next_level, unnested_dict)
            
    return unnested_dict

def evalution_categorical(y_pred:np.array, y:np.array) -> dict:
    """
    Contains the relevant metrics for multilabel Binary Classification

    Args:
        y_pred (np.array):model output of logit layer 
        y (np.array): onehot encoded target 
    """
    
    #macro_avg
    
    y = y.flatten()
    
    met = metrics.classification_report(
        y_pred=y_pred,
        y_true=y,
        output_dict=True
    )
    
    print(metrics.classification_report(
        y_pred=y_pred,
        y_true=y,
    ))
    
    met = unnest_dictionary(met,unnested_dict={})
    return met

def generate_categorical_labels(data, pipe, metadata) -> List[np.array]:
    def map_cls(x):
        try:
            return metadata["cls_map"][x]
        except KeyError:
            return metadata["num_cls"]
    
    labels = []
    for i,text in tqdm(enumerate(data["text"]), total=len(data["text"])):
        result = pipe(text)#(text, model,[tokenizer)
        labels.append(
            result[0]["generated_text"][-3:]
        )
    
    print(np.unique(labels, return_counts=True))
    labels = list(map(map_cls,labels))
    labels = np.array(labels)
    return labels, np.array(data["outputs"])

def select_datapoints(data, ds_name):
    return data

def dataset_evaluations(data, model, tokenizer, label_map):
    pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=512, 
                        temperature=0.01)
    
    for ds_name, ds_metadata in label_map.items():
        print(f"_"*30)
        print(f"### {ds_name}")
        _data = select_datapoints(data,ds_name)
        y_pred, y_true = generate_categorical_labels(_data, pipe, ds_metadata)
        
        _ = evalution_categorical(y_pred, y_true)
        print(f"_"*30, end="\n\n")

dataset_response_map = {
    'MTI_miRTarBase':{
        "cls_map" : {
            "(A)":0,
            "(B)":1,
        },
        "num_cls":2,
    }
}

"""

def generate_categorical_labels(data, model, tokenizer) -> np.array:
    labels = []
    text = data["text"]
    for i,d in enumerate(tqdm(pipe(text))):
        print(d)
        labels.append(
            d[0]["generated_text"][-1:]
        ) 
        
    #labels = np.array(labels).astype(int)
    return labels


"""

'\n\ndef generate_categorical_labels(data, model, tokenizer) -> np.array:\n    labels = []\n    text = data["text"]\n    for i,d in enumerate(tqdm(pipe(text))):\n        print(d)\n        labels.append(\n            d[0]["generated_text"][-1:]\n        ) \n        \n    #labels = np.array(labels).astype(int)\n    return labels\n\n\n'

In [60]:
np.unique(dataset["test"]["ds_ident"])

array(['MTI_miRTarBase'], dtype='<U14')

In [51]:
dataset_evaluations(dataset["test"], model, tokenizer, dataset_response_map)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM'

______________________________
### MTI_miRTarBase


  0%|          | 0/13820 [00:00<?, ?it/s]

In [12]:
labels = generate_categorical_labels(dataset["test"], model, tokenizer)

  0%|          | 0/1000 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'shape'

In [31]:
np.unique(labels, return_counts=True)

(array(['/ \n', '/ (', '>#(', '>A>', 'bsp', 'cho', 'rug', 'ude', 'www'],
       dtype='<U3'),
 array([  1, 227,   2,   1,   9,  22,  66,   1,  77]))

In [12]:
np.unique(labels, return_counts=True)

(array(['\n', '#'], dtype='<U1'), array([4785,  215]))