# Package

# Install packages 

In [None]:
!pip install transformers trl accelerate torch bitsandbytes peft langchain datasets -qU

# Initialization

In [None]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from sklearn.model_selection import train_test_split
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import copy
import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from trl import SFTTrainer

# Load the train, test, and validation sets from Excel

In [None]:

# Train set
PATH = "/kaggle/input/465analysisis/Our_trainset.xlsx"

try:
    trainset = pd.read_excel(PATH)
except Exception as e:
    # Report the problem, then re-raise: swallowing it would leave `trainset`
    # undefined and surface later as an unrelated NameError in the chunking cell.
    print(f"Error loading dataset: {e}")
    raise

print(trainset.head())


In [None]:

# Test Set
PATH = "/kaggle/input/465analysisis/Our_testset.xlsx"

try:
    testset = pd.read_excel(PATH)
except Exception as e:
    # Report the problem, then re-raise: swallowing it would leave `testset`
    # undefined and surface later as an unrelated NameError in the chunking cell.
    print(f"Error loading dataset: {e}")
    raise

print(testset.head())


In [None]:

# Validation Set
PATH = "/kaggle/input/465analysisis/Our_validationset.xlsx"

try:
    validationset = pd.read_excel(PATH)
except Exception as e:
    # Report the problem, then re-raise: swallowing it would leave `validationset`
    # undefined and surface later as an unrelated NameError in the chunking cell.
    print(f"Error loading dataset: {e}")
    raise

print(validationset.head())


# Chunk-based Pre-Processing 

In [None]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Shared splitter used by `split_text_into_chunks` for the train, validation and test text.
# Configured with chunk_size=2000 and chunk_overlap=200; because `length_function=len`,
# those sizes are measured in characters of the input string, not in model tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    # A custom separator list (including the "MP:" marker) is left disabled here,
    # so the splitter falls back to its own default separators.
    #separators=['\n\n','MP:', '\n', ' ', '']
)

def split_text_into_chunks(text):
    """Split ``text`` with the module-level ``text_splitter`` and return its chunks."""
    chunks = text_splitter.split_text(text)
    return chunks

# Build the chunked train, validation, and test DataFrames

In [None]:
# Train Data
# "Paper" holds the target abstract (`Main_Paper`); "Meta_Paper" holds the source text (`SP`).
train_df = pd.DataFrame({"Paper": trainset["Main_Paper"], "Meta_Paper": trainset["SP"]})

chunked_train_data = {"Paper": [], "Meta_Paper": []}

# Every chunk of a source text becomes its own row, paired with the full,
# unchunked target abstract of the row it came from.
for label, context in zip(train_df["Paper"], train_df["Meta_Paper"]):
    pieces = split_text_into_chunks(context)
    chunked_train_data["Paper"].extend([label] * len(pieces))
    chunked_train_data["Meta_Paper"].extend(pieces)

# One row per chunk.
chunked_train_df = pd.DataFrame(chunked_train_data)



In [None]:
chunked_train_df

Unnamed: 0,Paper,Meta_Paper
0,Background The available evidence about the p...,MP: ObjectiveSeveral randomized controlled tri...
1,Background The available evidence about the p...,"hormonal and inflammatory parameters, or can i..."
2,Background The available evidence about the p...,and synbiotics supplementation in lipid profil...
3,Background The available evidence about the p...,and meta-analysis. Mean Difference (MD) was po...
4,Background The available evidence about the p...,on homeostatic model assessment-insulin resist...
...,...,...
3835,Available on the association between the Medi...,"incidents, histologically confirmed gastric ca..."
3836,The effective management of the 33 million chi...,MP: Despite the decreasing rate of under nutri...
3837,The effective management of the 33 million chi...,No adverse reactions were observed. There were...
3838,The effective management of the 33 million chi...,Three-factor analysis of covariance of the eff...


In [None]:
# Validation Data
# "Paper" holds the target abstract (`Main_Paper`); "Meta_Paper" holds the source text (`SP`).
val_df = pd.DataFrame({"Paper": validationset["Main_Paper"], "Meta_Paper": validationset["SP"]})

chunked_val_data = {"Paper": [], "Meta_Paper": []}

# Every chunk of a source text becomes its own row, paired with the full,
# unchunked target abstract of the row it came from.
for label, context in zip(val_df["Paper"], val_df["Meta_Paper"]):
    pieces = split_text_into_chunks(context)
    chunked_val_data["Paper"].extend([label] * len(pieces))
    chunked_val_data["Meta_Paper"].extend(pieces)

# One row per chunk.
chunked_val_df = pd.DataFrame(chunked_val_data)

In [None]:
# Test Data
# "Paper" holds the target abstract (`Main_Paper`); "Meta_Paper" holds the source text (`SP`).
test_df = pd.DataFrame({"Paper": testset["Main_Paper"], "Meta_Paper": testset["SP"]})

chunked_test_data = {"Paper": [], "Meta_Paper": []}

# Every chunk of a source text becomes its own row, paired with the full,
# unchunked target abstract of the row it came from.
for label, context in zip(test_df["Paper"], test_df["Meta_Paper"]):
    pieces = split_text_into_chunks(context)
    chunked_test_data["Paper"].extend([label] * len(pieces))
    chunked_test_data["Meta_Paper"].extend(pieces)

# One row per chunk.
chunked_test_df = pd.DataFrame(chunked_test_data)

In [None]:
# Convert the chunked splits to Hugging Face datasets.
# `Dataset.from_dict` expects a mapping of column name -> list of values, so it is fed the
# dicts of lists built by the chunking cells rather than the DataFrames derived from them.
# NOTE: `trainset` and `testset` are rebound here from the raw pandas frames to `Dataset`
# objects, so the chunking cells above cannot be re-run after this cell without reloading.
trainset = datasets.Dataset.from_dict(chunked_train_data)
valset = datasets.Dataset.from_dict(chunked_val_data)
testset = datasets.Dataset.from_dict(chunked_test_data)

In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

# Load Model with QLoRA

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# One checkpoint id shared by the model and its tokenizer so the two cannot drift apart.
# (The tokenizer used to be loaded from "mistralai/Mistral-7B-v0.1" while the weights came
# from the Instruct checkpoint.)
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    quantization_config=nf4_config,
    use_cache=False
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Set up instruction-based fine-tuning

In [None]:
# Instruction text placed at the start of every prompt, both when formatting training
# examples and when building the generation prompts. Leading/trailing whitespace from the
# triple-quoted literal is removed by `.strip()`.
DEFAULT_SYSTEM_PROMPT = """
Given a collection of abstracts from papers used in various medical fields, generate a meta-analysis abstract summarizing the key findings of those abstracts and provide numerical values or statistical information for specific observations that are commonly reported in the provided abstracts. Some provided abstracts may have chunks, so maintain information similarities.
""".strip()

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

def formatting_prompts_func(example):
    """Build one training prompt per example in a batch.

    Args:
        example: Batch mapping with parallel sequences under ``'Meta_Paper'``
            (source abstracts) and ``'Paper'`` (target meta-analysis abstract).

    Returns:
        list: One formatted prompt string per entry of ``example['Meta_Paper']``.
    """
    sources = example['Meta_Paper']
    return [
        f"### {DEFAULT_SYSTEM_PROMPT},### Abstracts: {source}  \n ### Meta-Analysis Abstract: {example['Paper'][idx]}"
        for idx, source in enumerate(sources)
    ]



In [None]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for causal-LM fine-tuning: rank 64, alpha 16, dropout 0.1,
# no bias terms trained. No target modules are listed here.
# NOTE(review): the model printout later in this notebook shows rank-4 adapters, so that
# run presumably used a different config — verify which one is intended.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Prepare the quantised model for k-bit training, then attach the LoRA adapters.
# NOTE: this rebinds `model` from itself, so running the cell twice wraps the model twice.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def softmax_selection(predictions, temperature=1.0, dim=-1):
    """
    Apply a temperature-scaled softmax to model predictions and sample one token per row.

    Args:
        predictions (torch.Tensor): The tensor containing the raw predictions from the model.
        temperature (float): Positive temperature used to scale the predictions before the
                              softmax. A lower temperature makes the distribution sharper.
        dim (int): Dimension of ``predictions`` that indexes the candidate tokens.
                   Defaults to the last dimension.

    Returns:
        torch.Tensor: Tensor containing the selected token IDs. Inputs with more than two
        dimensions are flattened to rows first, giving shape ``(num_rows, 1)``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    # Bring the token dimension last so that `dim` is actually honoured
    # (it used to be accepted but ignored).
    predictions = predictions.movedim(dim, -1)

    if predictions.dim() > 2:
        # reshape (not view) so non-contiguous inputs are accepted as well
        predictions = predictions.reshape(-1, predictions.size(-1))

    # Apply softmax with temperature
    probs = F.softmax(predictions / temperature, dim=-1)

    # Sampling a token based on the probabilities
    return torch.multinomial(probs, 1)

In [None]:
import re

# A leading sign only counts when it is not glued to a preceding word character, so the
# hyphen in tokens such as "COVID-19" or "IL-6" is not read as a minus sign.
_NUMERIC_PATTERN = re.compile(r'(?:(?<!\w)[-+])?[0-9]*\.?[0-9]+')


def quantities(decoded_predictions, decoded_labels):
    """
    Extract quantities from decoded predictions and labels.

    Predictions and labels are paired positionally; if the two lists differ in length,
    the extra entries of the longer one are ignored.

    Args:
        decoded_predictions (list): List of decoded predictions.
        decoded_labels (list): List of decoded labels.

    Returns:
        tuple: A tuple containing lists of predicted quantities and actual quantities.
    """
    # Placeholder strategy: only the first number found in each string is used.
    pairs = [
        (extract_quantity(prediction), extract_quantity(label))
        for prediction, label in zip(decoded_predictions, decoded_labels)
    ]
    predicted_quantities = [predicted for predicted, _ in pairs]
    actual_quantities = [actual for _, actual in pairs]

    return predicted_quantities, actual_quantities

def extract_quantity(text):
    """
    Extract the first numeric value from a string.

    Args:
        text (str): Input string.

    Returns:
        float: Extracted numeric value, or NaN if ``text`` is not a string or contains
        no numeric value.
    """
    if not isinstance(text, str):
        # e.g. None or a float NaN coming from a missing cell
        return float('nan')

    match = _NUMERIC_PATTERN.search(text)
    if match is None:
        return float('nan')
    return float(match.group())

In [None]:
from sklearn.model_selection import train_test_split
import datasets
# NOTE: the random 80/20 re-split that used to live in this cell is intentionally disabled.
# It read a variable named `dataset` that no cell in this notebook defines (NameError on a
# fresh kernel), and it would have replaced the chunked `trainset` / `testset` built above
# with a fresh random split.
# The training log further down (3840 steps over 2 epochs at batch size 2) is consistent
# with training on the full chunked train set rather than on an 80% subset.
type(trainset)

# Inverse Cosine Distance (ICD) Loss function implementation 

In [None]:


class CustomSFTTrainer(SFTTrainer):
    """SFTTrainer whose training loss is the mean inverse cosine similarity.

    Only ``compute_loss`` is overridden; construction is inherited unchanged.

    NOTE(review): the similarity is taken between the model logits and the batch's label
    ids cast to float and reshaped to a trailing singleton dimension (assuming labels
    arrive as ``(batch, seq)``). That compares logits against the ids themselves rather
    than against label embeddings or one-hot targets, with no shifting or masking of
    positions in this code — confirm this is the intended ICD formulation.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``mean(1 / (cosine_similarity(logits, labels) + eps))`` for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``.logits``.
            inputs: Batch mapping that contains ``"labels"`` (left in place, so the model
                also receives them).
            return_outputs (bool): When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with ``Trainer`` versions that
                pass it to ``compute_loss``; not used by this loss.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If the batch has no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("CustomSFTTrainer.compute_loss requires 'labels' in the batch.")

        # Forward pass
        outputs = model(**inputs)

        # Work in float32 for both operands
        logits = outputs.logits.float()
        labels = labels.float()

        # Keep the first two dimensions and fold everything else into the last one
        logits = logits.view(logits.size(0), logits.size(1), -1)
        labels = labels.view(labels.size(0), labels.size(1), -1)

        # L2-normalise along the last dimension before comparing
        logits = F.normalize(logits, p=2, dim=-1)
        labels = F.normalize(labels, p=2, dim=-1)

        cosine_sim = torch.cosine_similarity(logits, labels, dim=-1)

        # Inverse of the similarity; epsilon guards the division when it is exactly zero
        epsilon = 1e-8
        loss = torch.mean(1 / (cosine_sim + epsilon))

        return (loss, outputs) if return_outputs else loss


# Training

In [None]:
# Loading LORA weights: wrap `base_model` with adapter weights saved under PATH1.
from peft import PeftModel, PeftConfig, prepare_model_for_kbit_training

PATH1 = "/finetuned_llama2_cosine_sim_p1"

# NOTE(review): no visible cell creates `base_model` (the loading cell above builds `model`),
# and PATH1 is an absolute path at the filesystem root whereas the save cell below writes to
# a relative "finetuned_llama2_cosine_sim_p1" — confirm both before a Restart & Run All.
base_model.config.use_cache = False
base_model = PeftModel.from_pretrained(base_model, PATH1)
# ---- ending LORA weights -----

# Training Params
train_params = TrainingArguments(
    output_dir="./results_modified_test",
    # --- schedule / batch size ---
    num_train_epochs=2,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimisation ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    # NOTE(review): a warmup ratio combined with a plain "constant" schedule — check
    # whether warmup is actually applied.
    warmup_ratio=0.05,
    # --- precision ---
    fp16=False,
    bf16=False,
    # --- evaluation / logging / checkpoints ---
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=2,
    save_strategy="epoch",
    # NOTE(review): `save_steps` is set alongside an epoch-based save strategy — confirm
    # which one is meant to drive checkpointing.
    save_steps=2,
    save_safetensors=True,
    # Optional, currently disabled:
    #report_to="wandb",
    #push_to_hub=True,
)




# Trainer
# NOTE(review): `peft_parameters` and `llama_tokenizer` are never assigned in the visible
# cells (the setup cells create `peft_config` and `tokenizer`), and `base_model` is only
# ever re-assigned from itself — confirm where these come from before a Restart & Run All.
fine_tuning = CustomSFTTrainer(
    model=base_model,
    args=train_params,
    train_dataset=trainset,
    eval_dataset=valset,
    tokenizer=llama_tokenizer,
    peft_config=peft_parameters,
    # Prompts are built per batch from the 'Meta_Paper' / 'Paper' columns.
    formatting_func=formatting_prompts_func,
)

In [None]:
fine_tuning.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


[34m[1mwandb[0m: Currently logged in as: [33mjbas3235[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
768,1.0452,0.970265
1536,1.0421,0.967326
2304,1.038,0.964995
3072,1.0392,0.964744
3840,0.8956,0.964325




TrainOutput(global_step=3840, training_loss=1.0367341736331581, metrics={'train_runtime': 38294.4796, 'train_samples_per_second': 0.201, 'train_steps_per_second': 0.1, 'total_flos': 1.5332062395777024e+17, 'train_loss': 1.0367341736331581, 'epoch': 2.0})

In [None]:
fine_tuning.save_model("finetuned_llama2_cosine_sim_p1")

In [None]:
fine_tuning.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_

# Meta-Analysis Generation

In [32]:
def summarize(text: str):
    """Generate a sampled continuation for ``text`` with ``base_model``.

    The prompt is tokenised with ``llama_tokenizer`` and moved to ``"cuda"``; generation
    samples with temperature 0.7 for up to 1024 new tokens.

    Returns:
        The result of ``batch_decode`` on the generated sequences with special tokens
        skipped. The generated sequences are decoded in full (nothing is sliced off),
        so callers split on the answer marker themselves.
    """
    encoded = llama_tokenizer(
        text,
        return_token_type_ids=False,
        return_tensors="pt",
    ).to("cuda")

    with torch.inference_mode():
        generated = base_model.generate(
            **encoded,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
        )
        return llama_tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
from peft import prepare_model_for_kbit_training

In [None]:
from peft import PeftModel, PeftConfig

In [None]:
output_dir="/kaggle/working/finetuned_llama2_cosine_sim_p1"

In [None]:
model_t = PeftModel.from_pretrained(base_model, output_dir)

In [None]:
# Generate a meta-analysis abstract for every test chunk and collect the results.
ANSWER_MARKER = "### Meta-Analysis Abstract:"

# Read each column once up front instead of indexing `testset[...]` on every iteration.
source_chunks = testset['Meta_Paper']
reference_abstracts = testset['Paper']

rows = []
for i, (source_chunk, reference) in enumerate(zip(source_chunks, reference_abstracts)):
    # Use the same section labels as the training prompt built by
    # `formatting_prompts_func` ("### Abstracts:"); the previous "### Example Abstracts:"
    # label queried the fine-tuned model with a prompt format it was not trained on.
    text = f"### {DEFAULT_SYSTEM_PROMPT},### Abstracts: {source_chunk}  \n {ANSWER_MARKER} "
    processed_output = summarize(text)

    # `summarize` returns the decoded sequences; keep only the text after the marker.
    abstract = ""
    for item in processed_output:
        if ANSWER_MARKER in item:
            abstract = item.split(ANSWER_MARKER)[1].strip()
            break

    print(i)
    print(abstract)
    # (source chunk, ground-truth abstract, generated abstract)
    rows.append((source_chunk, reference, abstract))

# Create a DataFrame from the rows list
df = pd.DataFrame(rows, columns=['Meta_Paper', 'Ground Truth', 'Processed Output'])

# Print or use the DataFrame as needed
print(df)



0
The present meta-analysis aimed to summarize the key findings of the provided abstracts related to the efficacy and safety of therapeutic plasma exchange (TPE) in critically ill COVID-19 patients. A total of 87 abstracts were included in the analysis. The primary outcome measures were 35-day mortality and TPE safety. Secondary outcomes included days on mechanical ventilation (MV), intensive care unit (ICU) length of stay (LOS), clinical recovery (as measured by SOFA score), and various inflammatory biomarkers. The results showed that TPE was associated with improved clinical recovery (as measured by SOFA score) and reduced days on MV and ICU LOS. However, there was no significant difference in 35-day mortality between the TPE and standard treatment groups. The study found that TPE was associated with increased lymphocytes and ADAMTS-13 activity and decreased serum lactate, lactate dehydrogenase, ferritin, D-dimers, and interleukin-6. Multivariable regression analysis identified sever

In [None]:
df.to_csv("llama2_finetune_consine_similarities.csv")

In [None]:
# Print the raw decoded generations for every test chunk.
# The prompt uses the same "### Abstracts:" label as the training prompt built by
# `formatting_prompts_func`; the column is read once instead of on every iteration.
for source_chunk in testset['Meta_Paper']:
    text = f"### {DEFAULT_SYSTEM_PROMPT},### Abstracts: {source_chunk}  \n ### Meta-Analysis Abstract: "
    processed_output = summarize(text)
    print(processed_output)
    print("\n\n")