In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# cuda 12.1 version
!pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121

!pip install torch sentence-transformers
!pip install PyPDF2 triton
!pip3 install torchvision torchaudio

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared settings: read by the model-loading cell and (max_seq_length) by the trainer cell.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [None]:

# 4bit pre quantized models unsloth support for 4x faster downloading + no OOMs.
# This list is a reference catalogue only: the call below passes its model name
# as a literal and does not read `fourbit_models`.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer with the shared settings defined earlier
# (max_seq_length, dtype, load_in_4bit).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the loaded model. `model` is rebound to the wrapped
# model, so this cell is meant to run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections plus the MLP projections receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


In [None]:
from datasets import load_dataset

# Prompt layout used for both training and inference. The three placeholders
# ({SYSTEM}, {INPUT}, {OUTPUT}) are filled with str.format.
# NOTE(review): the role labels here are the bare words "system" / "user" /
# "assistant" with no special header tokens around them — confirm this is the
# intended format for the Llama-3.1 Instruct tokenizer.
chat_template = """system

{SYSTEM}user

{INPUT}assistant

{OUTPUT}"""

# Appended to every training example by the formatting function below.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

# Define your formatting function
def formatting_prompts_func(examples):
    """
    Render a batch of records into training strings.

    :param examples: Batched mapping holding parallel "system", "input" and
        "output" sequences (one entry per record).
    :return: ``{"text": [...]}`` where each entry is ``chat_template`` filled
        with one record and followed by ``EOS_TOKEN``.
    """
    records = zip(examples["system"], examples["input"], examples["output"])
    return {
        "text": [
            chat_template.format(SYSTEM=system_msg, INPUT=user_msg, OUTPUT=reply) + EOS_TOKEN
            for system_msg, user_msg, reply in records
        ]
    }

# Load the custom JSON dataset; each record is expected to provide the
# "system", "input" and "output" fields read by formatting_prompts_func.
dataset_dict = load_dataset("json", data_files="/content/Training_dataset.json")

# Select the 'train' split
dataset = dataset_dict["train"]

# Apply the formatting function in batches; it adds the "text" column that the
# trainer consumes.
dataset = dataset.map(formatting_prompts_func, batched=True)


# # ------------------------------------------------------------------------------------------------
# [
#   {
#     "system": "You are a helpful assistant.",
#     "input": "What is the capital of France?",
#     "output": "The capital of France is Paris."
#   },
#   {
#     "system": "You are an expert in geography.",
#     "input": "Name the largest desert in the world.",
#     "output": "The largest desert in the world is the Antarctic Desert."
#   }
# ]


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 examples per optimizer step per device
        warmup_steps = 5,
        # NOTE(review): num_train_epochs and max_steps are both set. Per the
        # transformers TrainingArguments documentation a positive max_steps
        # overrides num_train_epochs, so this run would stop after 60 optimizer
        # steps rather than after 6 epochs — confirm which limit is intended.
        num_train_epochs = 6,  # Requested number of passes over the dataset (see note above).
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        # evaluation_strategy = "steps",  # Evaluate at each logging step
        # eval_steps = 10,  # Evaluate every 10 steps (you can adjust this value)
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        logging_dir = "logs",  # Directory to save logs
    ),
)

In [None]:

#@title Show current memory stats
# Record the pre-training baseline; `start_gpu_memory` and `max_memory` are
# reused by the final-stats cell to compute how much memory training added.
gpu_stats = torch.cuda.get_device_properties(0)
# Bytes -> GB via three divisions by 1024, rounded to 3 decimals.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Train the model. The return value is kept because the final-stats cell reads
# `trainer_stats.metrics['train_runtime']`.
trainer_stats = trainer.train()

In [None]:
# Show the full object returned by trainer.train().
print(trainer_stats)

In [None]:
#@title Show final memory and time stats
# Compare the peak reserved memory now against the baseline captured before
# training (`start_gpu_memory`) and the device total (`max_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Growth in peak reserved memory since the baseline cell ran.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


In [None]:
# system_text = "You are an AI assistant. give the anser for the Question from the Following given context"

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     chat_template.format(
#             SYSTEM=system_text,  # Provide the SYSTEM value
#             INPUT="context: Management of brain metastasis from rectal cancer using whole‑brain radiation therapy followed by bevacizumab and chemotherapy: A case report HUNG VAN NGUYEN1,2, DUONG THUY PHUNG2, TRUNG THANH NGUYEN2, BACH TRUNG TRAN2 , KIM NGAN THI MAI1,2 and HUY LE TRINH1,2 1 Department of Oncology and Palliative Care, Hanoi Medical University Hospital;  2 Department of Oncology, Hanoi Medical University, Hanoi 100000, Vietnam Received February 7, 2023; Accepted July 17, 2023.  Question: what is the address of primary author.",
#             OUTPUT="",  # This is the empty output in the template
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# decoded_outputs = tokenizer.batch_decode(outputs)

# print(decoded_outputs)

In [None]:
# Quick post-training sanity check: build one prompt with the training template
# and stream the model's continuation.

# Set the SYSTEM text
system_text = "You are an AI assistant. give the anser for the Question from the Following given context"


# Use the new context and question as the INPUT
inputs = tokenizer(
    [
        chat_template.format(
            SYSTEM=system_text,  # SYSTEM value
            INPUT="context:-Management of brain metastasis from rectal cancer using whole‑brain radiation therapy followed by bevacizumab and chemotherapy: A case report HUNG VAN NGUYEN1,2, DUONG THUY PHUNG2, TRUNG THANH NGUYEN2, BACH TRUNG TRAN2 , KIM NGAN THI MAI1,2 and HUY LE TRINH1,2 1 Department of Oncology and Palliative Care, Hanoi Medical University Hospital;  2 Department of Oncology, Hanoi Medical University, Hanoi 100000, Vietnam Received February 7, 2023; Accepted July 17, 2023.  Question:- what is the address of primary author, please provide only one address. ",
            OUTPUT="",  # Leave OUTPUT blank for generation
        )
    ],
    return_tensors="pt"
).to("cuda")

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Initialize TextStreamer for live streaming of generated text
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

# Generate the response from the model; the streamer prints text as it is produced.
# NOTE(review): `_` holds the raw return value of generate(), so print(_) shows
# that object as-is (presumably token ids, not decoded text) — confirm this
# second printout is wanted.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
print(_)


In [None]:
# Merge to 16bit and save model + tokenizer into the relative folder
# "llama_3_1_8b_Instruct_model". The `if True:` is an always-on toggle; flip it
# to False to skip the save.
if True: model.save_pretrained_merged("llama_3_1_8b_Instruct_model", tokenizer, save_method = "merged_16bit",)

# # Merge to 16bit
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# # Merge to 4bit
# if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")


Download model using rcloud

**Copying the finetuned model to Drive**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the copy cell below can write to it.
drive.mount('/content/drive')

In [None]:
import shutil
import os

# Source (merged model folder) and destination (Google Drive folder) paths
source_dir = "/content/llama_3_1_8b_Instruct_model"
destination_dir = "/content/drive/MyDrive/Llama 3.1 8b Instruct finetuning/Finetuned Model"

# Create the destination (and any missing parents) first: shutil.copy2 does not
# create directories and would fail on a Drive folder that does not exist yet.
os.makedirs(destination_dir, exist_ok=True)

# Copy every entry of the model folder into the destination folder.
for item in os.listdir(source_dir):
    source_path = os.path.join(source_dir, item)
    destination_path = os.path.join(destination_dir, item)

    if os.path.isdir(source_path):
        # dirs_exist_ok lets the cell be re-run without failing on folders
        # that were already copied by a previous run.
        shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
    else:
        # copy2 overwrites an existing file and preserves metadata.
        shutil.copy2(source_path, destination_path)

print("Files and folders copied successfully!")


**RUN MODEL LOCALLY**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the fine-tuned model is loaded from it below.
drive.mount('/content/drive')

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# cuda 12.1 version
!pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
!pip3 install torchvision torchaudio
!pip install torch sentence-transformers
!pip install PyPDF2

In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer


In [None]:

# Load the fine-tuned model and tokenizer from the Drive folder.
# Only the path is passed, so every other loader option (sequence length,
# dtype, quantization) is left at the loader's own default.
model_path = "/content/drive/MyDrive/Llama 3.1 8b Instruct finetuning/Finetuned Model"
model, tokenizer = FastLanguageModel.from_pretrained(model_path)


In [None]:

# Prepare the input for the model: one fixed context + question, rendered with
# the same template that was used for training.
system_text = """You are an AI assistant. Provide the answer to the question based on the given context."""
chat_template = """system

{SYSTEM}user

{INPUT}assistant

{OUTPUT}"""
input_text = chat_template.format(
    SYSTEM=system_text,
    INPUT="context: Management of brain metastasis from rectal cancer using whole‑brain radiation therapy followed by bevacizumab and chemotherapy: A case report HUNG VAN NGUYEN1,2, DUONG THUY PHUNG2, TRUNG THANH NGUYEN2, BACH TRUNG TRAN2 , KIM NGAN THI MAI1,2 and HUY LE TRINH1,2 1 Department of Oncology and Palliative Care, Hanoi Medical University Hospital;  2 Department of Oncology, Hanoi Medical University, Hanoi 100000, Vietnam Received February 7, 2023; Accepted July 17, 2023. Question: What is the address of the primary author?",
    OUTPUT=""  # Leave OUTPUT blank for generation
)

inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)
# Initialize TextStreamer for live streaming of generated text (optional).
# NOTE: it is not passed to generate() below, so nothing is streamed here.
text_streamer = TextStreamer(tokenizer)
# Generate the response from the model
outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
# Decode the output (a list with one string per input prompt)
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# Print the result
print(decoded_outputs)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import PyPDF2
import torch
from sentence_transformers import SentenceTransformer, util

In [None]:
# Download the sentence-tokenizer data needed by `sent_tokenize`.
# Recent NLTK releases (3.9+) load the tokenizer from "punkt_tab" instead of the
# older pickled "punkt" resource, so fetch both to work across versions.
# nltk.download reports an unknown resource without raising, so requesting
# "punkt_tab" on an older NLTK is harmless.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Function to read and extract text from a PDF file
def read_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.

    :param pdf_path: Path to the PDF file to read.
    :return: The text of all pages in page order, separated by newlines.
        A page for which no text is returned contributes an empty segment.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next (which would corrupt sentence splitting);
        # `or ""` guards against a page that yields no text at all.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

# Path of the uploaded paper; `pdf_text` feeds the chunking cell below.
pdf_path = "/content/38022102.pdf"
pdf_text = read_pdf(pdf_path)


In [None]:
def chunk_text_with_overlap(text, max_chunk_size=750, overlap_size=100):
    """
    Split text into sentence-aligned chunks, each overlapping its predecessor.

    Sizes are measured in whitespace-separated words. Sentences are never
    split, every sentence appears in at least one chunk, and no empty chunk
    is produced.

    :param text: The input text to be chunked.
    :param max_chunk_size: Maximum number of words per chunk. A single sentence
        longer than this is emitted as its own oversized chunk rather than
        being dropped or split.
    :param overlap_size: Maximum number of words, taken as whole trailing
        sentences of the previous chunk, repeated at the start of the next one.
    :return: List of text chunks with overlap, in document order.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    # Sentences in current_chunk that have not been emitted in any chunk yet;
    # a chunk made only of carried-over overlap is never emitted on its own.
    new_sentences = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        if new_sentences and current_length + sentence_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))

            # Seed the next chunk with the trailing sentences of the chunk just
            # emitted, up to overlap_size words.
            overlap = []
            overlap_length = 0
            for previous in reversed(current_chunk):
                previous_length = len(previous.split())
                if overlap_length + previous_length > overlap_size:
                    break
                overlap.append(previous)
                overlap_length += previous_length
            overlap.reverse()

            # The new sentence takes priority over the overlap: drop the oldest
            # overlap sentences until the chunk fits within max_chunk_size.
            while overlap and overlap_length + sentence_length > max_chunk_size:
                overlap_length -= len(overlap.pop(0).split())

            current_chunk = overlap
            current_length = overlap_length
            new_sentences = 0

        current_chunk.append(sentence)
        current_length += sentence_length
        new_sentences += 1

    # Flush the final chunk so trailing sentences are not lost.
    if new_sentences:
        chunks.append(" ".join(current_chunk))

    return chunks

# Split the extracted PDF text into chunks; `chunks` is what get_context searches.
chunks = chunk_text_with_overlap(pdf_text, max_chunk_size=750, overlap_size=100)


In [None]:
# Generation budget. This value is captured as the default of
# `result(..., max_new_tokens=max_new_tokens)` at the moment that function is
# defined, so changing it later requires re-running the cell that defines `result`.
max_new_tokens=300

In [None]:
def get_context(chunks, query, top_k=2):
  """
  Return the chunks most similar to ``query``, joined into one context string.

  :param chunks: List of text chunks to search.
  :param query: Natural-language question used to rank the chunks.
  :param top_k: Maximum number of chunks to return (default 2, the previously
      hard-coded value). Clamped to the range 1..len(chunks).
  :return: The selected chunks joined with single spaces, best match first;
      an empty string when ``chunks`` is empty.
  """
  if not chunks:
    return ""

  # Reuse one embedder across calls instead of reloading the model every time.
  embedder = getattr(get_context, "_embedder", None)
  if embedder is None:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    get_context._embedder = embedder

  # Generate embeddings for the chunks and for the query
  chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)
  query_embedding = embedder.encode(query, convert_to_tensor=True)

  # Cosine similarity of the single query against every chunk (one row of scores)
  cosine_scores = util.pytorch_cos_sim(query_embedding, chunk_embeddings)

  # torch.topk raises when k exceeds the number of candidates, so clamp it.
  k = max(1, min(top_k, len(chunks)))
  top_indices = torch.topk(cosine_scores, k=k).indices[0].tolist()

  # Join the relevant chunks, highest score first
  return " ".join(chunks[index] for index in top_indices)

def result(input_text=input_text, model=model,tokenizer=tokenizer,max_new_tokens=max_new_tokens):
  """
  Generate a completion for one prompt and return the decoded text.

  The defaults are the notebook-level values of the same names at the time
  this function is defined; later reassignments do not change them.

  :param input_text: Fully formatted prompt string.
  :param model: Model used for generation.
  :param tokenizer: Tokenizer matching ``model``.
  :param max_new_tokens: Upper bound on the number of generated tokens.
  :return: List with one decoded string (special tokens removed). The string is
      the decoding of the whole generated sequence, which callers split on the
      "assistant" marker to isolate the reply.
  """
  inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

  # Enable native 2x faster inference
  FastLanguageModel.for_inference(model)

  # No streamer is created here: the output is returned, not streamed.
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

  return tokenizer.batch_decode(outputs, skip_special_tokens=True)


**1. Primary Author Address**

In [None]:
# The query is used twice: to retrieve the most relevant chunks and as the
# question inside the prompt.
query = "What is the address of the Primary Author?"

## Prepare the input with a selected chunk of text
context = get_context(chunks, query)
# context = pdf_text

# System text for LLaMA 3.1 8B model
system_text = """
You are a literature analyst with the ability to understand the provided context and figure out the appropriate answers for the asked queries from the context only.
It's important to give accurate answers without making assumptions.

Rules:
1. Do not generate information not present in the context.
2. Follow the response format very strictly.
3. Do not display any thought process in the response.
4. Focus on author affiliations and contributions for the address. Prioritize addresses starting with 1, A, or I.
5. If multiple addresses are present, use the first address as the primary author's address, and Strictly dont give rest all addresses in the responce.
6. If a city or state is mentioned, deduce the corresponding country.
7. Please dont print the context in the Responce.
8. Response Format: - Primary Author's Address: [Address]

"""

# Formatting the input for the model (same layout as the training template)
chat_template = """system

{SYSTEM}user

{INPUT}assistant

{OUTPUT}"""

input_text = chat_template.format(
    SYSTEM=system_text,
    INPUT=f"context: {context}\nQuestion: {query}\n",
    OUTPUT=""
)
max_new_tokens=300
# Generate the response from the model (a list with one decoded string)
primary_author_address = result(input_text, model, tokenizer,max_new_tokens)

# Print the result
print(primary_author_address[0])


In [None]:
# Ensure it's a string by accessing the first element if it's a list
# (the isinstance guard makes this cell safe to re-run).
if isinstance(primary_author_address, list):
    primary_author_address = primary_author_address[0]


# Split the string based on the word "assistant" and keep what follows the
# first occurrence, then cut at the first "2." so only the first address stays.
# NOTE(review): this takes index 1, i.e. the text after the FIRST "assistant"
# in the decoded output. If the retrieved context itself contains that word the
# wrong segment is selected, and if the word is absent this raises IndexError.
address = primary_author_address.split("assistant")
address = address[1].split("2.")

In [None]:
# Show the text preceding the first "2." marker, i.e. the first listed address.
print(address[0])

**2. Patient Details**

In [None]:
# Drug of interest; interpolated into both the retrieval query and the prompt
# by the f-strings below (no further substitution step is needed).
drug="Hydralazine"
query= f"Details of all the single patients present with respect to the {drug}"
# Prepare the input with a selected chunk of text
context = get_context(chunks,query)
system_text= f""" You are a literature analyst with the ability to understand the provided context and answer the queries related to patient information from the context only.
        Follow the rules strictly.

        Rules:
        1. Don't hallucinate.
        2. You must follow the Response format.
        3. You must not display any thought process of yours in the response.
        4. Extract all patient-related information such as age, gender, and other relevant details from the input text.

        Queries:
        - Provide patient information such as age, gender, and any other relevant details mentioned in the text.

        Please use the provided input to give the response with respect to the above rules.

        Carefully differentiate between patients and authors—do not mix up their names. Do not repeat patients in the total count or when providing their information. Also, differentiate each patient. If one patient is mentioned in multiple places, do not consider them as different persons. Do not consider patients present in table format data. Remember to check if the patient information is from table format data and exclude such cases.
        Consider only patients related to {drug}. Do not consider any other patients.
        Please determine if the patient is human and related to {drug}. If the patient is human, under the heading '# Patient Validation' give the answer as Valid. If not, give the answer as Invalid. There should be only Valid or Invalid present in the answer, nothing else.
        (Only consider single patients like '39-Year-Old Male', '39-Year-Old Female', and strictly not groups of patients like '204 Patients', '6 patients', etc.). In the subsequent line, under the heading '# Total Number of Patients', provide the total number of patients present who are related to {drug}. If no single patient is present, give the answer as 0.
        In the last line, under the heading '# Patient Type', list all the patients and include small and concise information about them who are related to {drug}. Include information on all cases of patients in the above text. If no single patient is present, give the answer as 'No patient present.'
        Ensure that all information is accurate and based on factual data present in the text. Do not provide hallucinated responses or give answers that are not present in the text.
"""
# Same prompt layout as the training template
chat_template = """system

{SYSTEM}user

{INPUT}assistant

{OUTPUT}"""

input_text = chat_template.format(
    SYSTEM=system_text,
    INPUT=f"context: {context} Question: {query}",
    OUTPUT=""
)
# max_new_tokens is not passed, so result() uses its definition-time default.
patient_details=result(input_text, model,tokenizer)
# Display the raw decoded output (prompt followed by the model's reply)
patient_details[0]


In [None]:
# Ensure it's a string by accessing the first element if it's a list
# (the isinstance guard makes this cell safe to re-run).
if isinstance(patient_details, list):
    patient_details = patient_details[0]

# Keep the text after the first "assistant" marker, i.e. the model's reply.
# NOTE(review): index 1 is the segment after the FIRST "assistant" in the
# decoded output; if the retrieved context contains that word the printed text
# is not the reply, and if the word is absent this raises IndexError.
patientDetails = patient_details.split("assistant")
print(patientDetails[1])

In [None]:
# !pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
# !pip3 install -U trl<0.9.0 --index-url https://download.pytorch.org/whl/cu121
# !pip3 install -U peft --index-url https://download.pytorch.org/whl/cu121
# !pip3 install -U accelerate --index-url https://download.pytorch.org/whl/cu121
# !pip3 install -U bitsandbytes --index-url https://download.pytorch.org/whl/cu121