# **1. Install Necessary Libraries**

In [None]:
%%capture
# Environment setup for this notebook; `%%capture` silences the (long) pip output.
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# Unsloth is installed straight from the GitHub repository (unpinned, so the
# resolved version depends on the day this cell is run).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Companion libraries are installed with --no-deps so pip does not pull in
# their own dependency trees.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# NOTE(review): this unpinned upgrade can replace the "xformers<0.0.27" build
# installed on the previous line -- confirm which version is actually wanted.
!pip install --upgrade xformers
# Evaluation helpers (the `evaluate` library and the ROUGE scorer).
!pip install -q evaluate
!pip install -q rouge_score

In [None]:
import os

# Send signal 9 to this very process, i.e. hard-kill the running kernel.
# Presumably used so the runtime restarts with the packages installed above;
# every cell below must be run again after the kernel comes back.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

# **2. Import Libraries and Set Up Device**

In [None]:
import torch
from transformers import EncoderDecoderModel, BertTokenizer

# from datasets import load_dataset, load_metric
import datasets

# Import necessary libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
import nltk
import evaluate
from rouge_score import rouge_scorer
import os



# Fetch the NLTK "punkt" resource (used for sentence splitting).
nltk.download('punkt')

# Pick the computation device: CUDA GPU when one is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cuda


# **3. Load and Prepare the Dataset**

In [None]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

# Location at which the Drive contents are mounted.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
from datasets import load_from_disk

# Load the "azzedine/Goud-sum_v2" dataset through `datasets.load_dataset`.
dataset_repo_id = "azzedine/Goud-sum_v2"
dataset = load_dataset(dataset_repo_id)

# Last expression of the cell: show the loaded object.
dataset

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options, kept as module-level names.
max_seq_length = 2048  # sequence length passed to Unsloth (any value may be chosen)
dtype = None           # None = auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be set to False

# Gather the keyword arguments first, then load the model and its tokenizer.
model_load_options = dict(
    model_name = "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA settings collected in one place.
lora_options = dict(
    r = 16,                                  # rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # any value is supported, 0 is the optimized path
    bias = "none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # rank stabilized LoRA is available but disabled
    loftq_config = None,                     # LoftQ is available but disabled
)

# NOTE(review): `model` is rebound to the wrapped model, so running this cell a
# second time would wrap the already-wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2024.10.7 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
# Prompt template for title generation. The single `{}` placeholder is filled
# with the article text through `str.format`. Despite the variable name, the
# template contains no worked example.
alpaca_prompt_one_shot = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
    "### Instruction:\n"
    "Generate a title that accurately captures the main ideas and themes of the article.If the output is not in Arabic, please translate it into Arabic.\n"
    "### Text:\n"
    "{}\n"
    "\n"
    "### Title:\n"
    "\n"
)

In [None]:
# Kept as a module-level name for any other cell that relies on it; it is no
# longer appended to the inference prompt (see below).
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Attach the inference prompt to one dataset row.

    Args:
        examples: A single row (the function is mapped with ``batched = False``)
            that provides an ``"article"`` field.

    Returns:
        The same row with an added ``"text"`` field containing the prompt
        template filled in with the article.
    """
    article = examples["article"]
    # The prompt deliberately ends at the empty "### Title:" slot. Appending the
    # EOS token is only appropriate after a target answer (training data);
    # ending an inference prompt with EOS asks the model to continue *after*
    # end-of-sequence instead of writing the title.
    examples["text"] = alpaca_prompt_one_shot.format(article)
    return examples

# Build one prompt per row of the test split. `dataset` is the object returned
# by `load_dataset(...)` earlier in the notebook; the name `dataset_abs` is
# never defined here and raised NameError on a fresh kernel.
dataset2 = dataset["test"].map(formatting_prompts_func, batched = False)

Map:   0%|          | 0/9497 [00:00<?, ? examples/s]

In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference


def generate_summaries(text, max_new_tokens = 40, return_full_text = True):
    """Generate a continuation for a single prompt string.

    Args:
        text: Prompt string to feed to the model.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 40, the value that was previously hard-coded.
        return_full_text: When True (default, the original behaviour) the
            returned string is the decoded prompt followed by the generation,
            special tokens included. When False only the newly generated
            tokens are decoded and special tokens are stripped, which is the
            form needed when the result is compared against reference titles.

    Returns:
        The decoded string for the single input sequence.
    """
    inputs = tokenizer(text, return_tensors = "pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # The attention mask produced by the tokenizer is forwarded explicitly so
    # `generate` does not have to infer it from the input ids.
    # NOTE(review): `temperature` / `min_p` only influence the result when
    # sampling is enabled (`do_sample`); confirm whether greedy decoding or
    # sampling is intended for this evaluation.
    output = model.generate(
        input_ids = inputs["input_ids"],
        attention_mask = inputs.get("attention_mask"),
        max_new_tokens = max_new_tokens,
        use_cache = True,
        temperature = 1.5,
        min_p = 0.1,
    )

    if return_full_text:
        return tokenizer.batch_decode(output)[0]
    # Drop the prompt tokens and keep only what the model produced.
    return tokenizer.decode(output[0][prompt_length:], skip_special_tokens = True)

In [None]:
from tqdm import tqdm

# Run the model over every prepared prompt, in dataset order. Results are
# appended one by one so that whatever was generated so far is still available
# if the loop is interrupted.
abstract_generated_sum = []
for row in tqdm(dataset2, total = len(dataset2)):
    abstract_generated_sum.append(generate_summaries(row["text"]))

In [None]:
# Attach the generated texts as a new column next to the prompts.
test_dataset = dataset2.add_column("mistral_generated_summary", abstract_generated_sum)

# `output_dir` is not defined anywhere earlier in the notebook, so it is set
# here. The default points at the Drive mounted under /content/drive.
# TODO: adjust this folder to wherever the results should live.
output_dir = "/content/drive/MyDrive/goud_sum_outputs"
test_dir = f"{output_dir}/mistral_summaires"

# Save to the directory computed just above (the call previously referenced the
# undefined name `split_dir`, leaving `test_dir` unused).
test_dataset.save_to_disk(test_dir)

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]