In [2]:
import torch
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os

class LLMInterface:
    def __init__(
            self,
            model_name: str,
            device: str = "cuda:0" if torch.cuda.is_available() else "cpu"):
        
        self.device = device
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_auth_token = os.getenv('HF_ACCESS_TOKEN'))

    def preprocesser_inference(
            self,
            input: list[str],
            truncate: bool,
            max_length: int,
            return_tensor: str = "pt"
    ):
        tokenized_text = []
        for text in input:
            tk_txt = self.tokenizer(text, max_length=max_length,padding=pad, truncation=truncate, return_tensors=return_tensor).to(self.device)
            tokenized_text.append(tk_txt)
        return tokenized_text
    
class classifierInterface(LLMInterface):
    def __init__(self, model_name: str, id2label: dict, label2id: dict, num_labels=2, device = "cuda:0" if torch.cuda.is_available() else "cpu"):
        super().__init__(model_name, device)
        load_dotenv()
        from transformers import AutoModelForSequenceClassification
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if self.device=="cuda:0" else torch.float32,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            token = os.getenv('HF_ACCESS_TOKEN')
        ).to(self.device)

    def inference_classify(
            self,
            inputs: list[str],
            pad: bool,
            truncate: bool,
            max_length: int,
            return_tensor: str = "pt"
    ):
        classes = []
        inputs = self.preprocesser_inference(inputs, pad, truncate, max_length, return_tensor)
        for input in inputs:
            with torch.no_grad():
                input = {key: value.to(self.device) for key, value in input.items()}
                logits = self.model(**input).logits
            predicted_class = logits.argmax().item()
            classes.append(self.model.config.id2label[predicted_class])
        return classes
    
class extractorInterface(LLMInterface):
    def __init__(self, model_name: str, device: str = 'cuda:0' if torch.cuda.is_available() else "cpu"):
        super().__init__(model_name, device)
        from transformers import AutoModelForTokenClassification
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if self.device=="cuda" else torch.float32,
            token = os.getenv('HF_ACCESS_TOKEN')
        ).to(self.device)
    
    def inference_extract(
            self,
            input: str,
            pad: bool,
            truncate: bool,
            max_length: bool,
            return_tensor: str = "pt"
    ):
        input = self.preprocesser_inference(input, pad, truncate, max_length, return_tensor)
        with torch.no_grad():
            logits = self.model(**input).logits
        
        # TODO: RETURN THE APPROPRIATE VALUE, NOT LOGITS, FROM HERE

class EncoderDecoder_Interface(LLMInterface):
    def __init__(self, model_name: str, device: str = 'cuda:0' if torch.cuda.is_available() else "cpu"):
        super().__init__(model_name, device)
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM   
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device=="cuda" else torch.float32,
            token = os.getenv('HF_ACCESS_TOKEN')
        ).to(self.device)
    
    def preprocesser_inference(self, input, pad, truncate, max_length, return_tensor = "pt"):
        return self.tokenizer(input, max_length=max_length,padding=pad, truncation=truncate, return_tensors=return_tensor).to(self.device)
    
    def inference_seq(
            self,
            input: str,
            pad: bool,
            truncate: bool,
            max_length: int,
            return_tensor: str = "pt"
    ):
        input = self.preprocesser_inference(input, pad, truncate, max_length, return_tensor)
        # input = {key: value.to(self.device) for key, value in input.items()}

        with torch.no_grad():
            output_ids = self.model.generate(
                **input,
                max_length=50,
                num_beams=4,
                temperature=0.7,
                top_p=0.9
            )
        output_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

        return output_text

class EncoderOnly_EDInterface(LLMInterface):
    def __init__(self, model_name: str, device: str = 'cuda:0' if torch.cuda.is_available() else "cpu"):
        super().__init__(model_name, device)
        from transformers import T5EncoderModel, T5Tokenizer
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5EncoderModel.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device=="cuda" else torch.float32,
            token = os.getenv('HF_ACCESS_TOKEN')
        ).to(self.device)
    

    def inference_classify_enc(
            self,
            inputs: list[str],
            pad: bool,
            truncate: bool,
            max_length: int,
            return_tensor: str = "pt"
    ):
        classes = []
        inputs = self.preprocesser_inference(inputs, pad, truncate, max_length, return_tensor)
        for input in inputs:
            with torch.no_grad():
                input = {key: value.to(self.device) for key, value in input.items()}
                logits = self.model(**input)
            predicted_class = logits[0]
            classes.append(self.model.config.id2label[predicted_class])
        return classes
        

## Experimenting with Encoder based models

In [2]:
import json
from datasets import Dataset

with open('/home/tadesa1/ADBMO-UNLV/SentrySys_Experiments/classification_results_openai.json', 'r') as f:
    classified_results = json.load(f)

del classified_results['---']

def data_generator():
    for obj in classified_results:
        try:
            yield {"pmid": obj, "true_class": classified_results[obj]['class'], "title":classified_results[obj]['title'], "abstract": classified_results[obj]['summary']['Abstract'], "method": classified_results[obj]['summary']['Method'], "predicted_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1}
        except KeyError:
            yield {"pmid": obj, "true_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1, "title":classified_results[obj]['title'], "abstract": classified_results[obj]['summary']['Abstract'], "method": classified_results[obj]['summary']['Method'], "predicted_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1}

data = Dataset.from_generator(data_generator)
false_class_data = data.filter(lambda e: e['true_class'] == 0)
true_class_data = data.filter(lambda e: e['true_class'] == 1)

test_data = []
for paper in data:
    test_data.append([paper['title']+paper['abstract']+paper['method']])

if __name__ == "__main__":
    
    model_names = ["bert-base-uncased", "distilbert-base-uncased-finetuned-sst-2-english","nlptown/bert-base-multilingual-uncased-sentiment"]
    
    id2label = {
        0: 'IRRELEVANT',
        1: 'RELEVANT'        
    }

    label2id = {
        'IRRELEVANT': 0,
        'RELEVANT': 1        
    }
    classifier = classifierInterface(model_name=model_names[0], id2label=id2label, label2id=label2id)

    result = []
    for paper in data:
        test_data = [paper['title']+paper['abstract']+paper['method']]
        result.append(classifier.inference_classify(inputs=test_data, pad = True, truncate = True, max_length=512))
        # print(f"{result} : {paper['predicted_class']}")

2025-04-01 02:46:35.471942: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-01 02:46:35.485102: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743475595.499378 1155897 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743475595.504024 1155897 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-01 02:46:35.520433: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

TypeError: preprocesser_inference() takes from 4 to 5 positional arguments but 6 were given

### Evaluation for encoder based models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

predicted_labels = []
true_labels = []

true_labels = [example['true_class'] for example in data]
# predicted_labels = [print(example[0]) for example in result]
predicted_labels = [1 if res[0] == 'RELEVANT' else 0 for res in result]


print(type(true_labels))
print(type(predicted_labels))
cm = confusion_matrix(true_labels, predicted_labels)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(true_labels), yticklabels=np.unique(true_labels))
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
 
# Print classification report
print(classification_report(true_labels, predicted_labels))


## Experimenting with Encoder-Decoder based models

In [27]:
models = ["google/flan-t5-large", "google-t5/t5-base"]
ed_model = EncoderDecoder_Interface(models[0])
# ed_model = EncoderDecoder_Interface(models[1])


input_txt = "Extract the disease names mentioned in the following texts: Amyloid build up is usually associated with Alzheimer's. It can also be associated with other things like Brain Cancer"
# input_txt = "What is not the protein in the following sentence but is still a marker: Amyloid build up is usually associated with Alzheimer's. It is accompanied with an increase in blood pressure"
# input_txt = "Answer the following mathematical question. Give the rational before answering: What is the cube of 2?"
# input_txt = "mnli premise: Donald Trump has a hair color that resemble the color generated by a hot object. hypothesis: Donald Trump has fire like hair"
output_txt = ed_model.inference_seq(input = input_txt, pad=False, truncate=False, max_length=512)
print(output_txt)



Alzheimer's, Brain Cancer


## Experimenting with Encoder only ED models

In [23]:
import json 
from datasets import Dataset

with open('/home/tadesa1/ADBMO-UNLV/SentrySys_Experiments/classification_results_openai.json', 'r') as f:
    classified_results = json.load(f)

del classified_results['---']

def data_generator():
    for obj in classified_results:
        try:
            yield {"pmid": obj, "true_class": classified_results[obj]['class'], "title":classified_results[obj]['title'], "abstract": classified_results[obj]['summary']['Abstract'], "method": classified_results[obj]['summary']['Method'], "predicted_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1}
        except KeyError:
            yield {"pmid": obj, "true_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1, "title":classified_results[obj]['title'], "abstract": classified_results[obj]['summary']['Abstract'], "method": classified_results[obj]['summary']['Method'], "predicted_class": 0 if classified_results[obj]['openai_response'] == 'No' else 1}

data = Dataset.from_generator(data_generator)
false_class_data = data.filter(lambda e: e['true_class'] == 0)
true_class_data = data.filter(lambda e: e['true_class'] == 1)

test_data = []
for paper in data:
    test_data.append([paper['title']+paper['abstract']+paper['method']])

id2label = {
    0: 'IRRELEVANT',
    1: 'RELEVANT'        
}

label2id = {
    'IRRELEVANT': 0,
    'RELEVANT': 1        
}

models = ["google/flan-t5-large"]
encoder_ed_model = EncoderOnly_EDInterface(models[0], id2label=id2label, label2id=label2id)

result = []
for paper in data:
    test_data = [paper['title']+paper['abstract']+paper['method']]
    result.append(encoder_ed_model.inference_classify_enc(inputs=test_data, pad = True, truncate = True, max_length=512))

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

predicted_labels = []
true_labels = []

true_labels = [example['true_class'] for example in data]
# predicted_labels = [print(example[0]) for example in result]
predicted_labels = [1 if res[0] == 'RELEVANT' else 0 for res in result]


print(type(true_labels))
print(type(predicted_labels))
cm = confusion_matrix(true_labels, predicted_labels)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(true_labels), yticklabels=np.unique(true_labels))
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
 
# Print classification report
print(classification_report(true_labels, predicted_labels))




TypeError: __init__() got an unexpected keyword argument 'id2label'

## Dataset design

#### How will your datasets be setup?

* For enc-dec models you'd benefit from instruction fine tuning.


# Question

* Will performance improve in all aspects if we perform unsupervised fine tuning on the models?

The goal is to have a general purpose ADBMO model that we can recycle for different purposes. A chatbot with reasoning capabilities, text extraction and classification, and even document searching abilities.
- Let's start off by training adapters that we can plugin to existing models. Let's measure the performance of that for our downstream tasks like text extraction and classification and then see if further fine-tuning is necessary.