## install dependancies

In [1]:
# install Hugging Face Libraries
!pip install peft
!pip install transformers datasets accelerate evaluate bitsandbytes loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score

Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you hav

## Loading data

In [2]:
import pandas as pd
df = pd.read_csv('/kaggle/input/full-data/train_data_full.csv')
df.head()

Unnamed: 0,link,transcript,transcript_length,language,summary
0,https://www.youtube.com/watch?v=9He4UBLyk8Y&li...,in this video I'm going to explain all the cor...,11309,English,This video provides a comprehensive overview o...
1,https://www.youtube.com/watch?v=1m8NRrLLgkg&li...,I would strongly recommend studying computer s...,791,English,The video strongly recommends studying compute...
2,https://www.youtube.com/watch?v=8M3YqhO3kVs&li...,have you ever noticed this little icon on your...,647,English,"In the video, the speaker explains how to cust..."
3,https://www.youtube.com/watch?v=VrSJhTGMM90&li...,so what is python used for python is actually ...,522,English,"In this video, the main ideas discussed are th..."
4,https://www.youtube.com/watch?v=YY2ZhUQZyE4&li...,are you a professional developer or someone wh...,556,English,"In this video, the speaker discusses the benef..."


In [3]:
# arabic video that had wrong english transcripts
df.drop(df.iloc[2600:2609].index , inplace = True)
df = df.reset_index(drop = True)

In [4]:
from datasets import load_dataset

ds = load_dataset("ccdv/cnn_dailymail" , '3.0.0')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
df = df.drop(['link' , 'transcript_length' , 'language'] , axis = 1).rename(columns = {'transcript': 'text'})
df

Unnamed: 0,text,summary
0,in this video I'm going to explain all the cor...,This video provides a comprehensive overview o...
1,I would strongly recommend studying computer s...,The video strongly recommends studying compute...
2,have you ever noticed this little icon on your...,"In the video, the speaker explains how to cust..."
3,so what is python used for python is actually ...,"In this video, the main ideas discussed are th..."
4,are you a professional developer or someone wh...,"In this video, the speaker discusses the benef..."
...,...,...
7475,guys welcome back to the third video in my Pyt...,"In this video tutorial on Python programming, ..."
7476,guys so welcome to the second video in the ser...,This video is the second in a series about Pyt...
7477,guys so welcome back today I'm going to be doi...,"In this video, the presenter introduces viewer..."
7478,there and welcome to the space shooter game th...,"In this video, the creator covers the final to..."


In [6]:
import random
import numpy as np

random.seed(44)

indices_to_remove = np.random.choice(df.index, 750, replace=False)

val_df = df.loc[indices_to_remove]

train_df = df.drop(indices_to_remove)

train_df.shape , val_df.shape

((6730, 2), (750, 2))

In [7]:
train_ds = ds["train"].select(random.sample(range(len(ds['train'])), 6000))
val_ds = ds["validation"].select(random.sample(range(len(ds['validation'])), 750))
train_ds , val_ds

(Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 6000
 }),
 Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 750
 }))

In [8]:
train_ds = train_ds.remove_columns('id')
train_ds = train_ds.rename_column('article' , 'text')
train_ds = train_ds.rename_column('highlights' , 'summary')

In [9]:
val_ds = val_ds.remove_columns('id')
val_ds = val_ds.rename_column('article' , 'text')
val_ds = val_ds.rename_column('highlights' , 'summary')

In [10]:
from datasets import DatasetDict , Dataset, concatenate_datasets

ds = DatasetDict({
    'train': concatenate_datasets([train_ds , Dataset.from_pandas(train_df) ]),
    'validation': concatenate_datasets([val_ds , Dataset.from_pandas(val_df) ])
})
ds = ds.shuffle(seed = 42)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 12730
    })
    validation: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 1500
    })
})

## Convert text to text to token IDs

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
max_input_length = 1024
max_target_length = 500


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = ds.map(preprocess_function, batched=True, remove_columns=ds['train'].column_names)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/12730 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## Fine-Tune T5 with LoRA and bnb int-8

In [43]:
from transformers import AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [44]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r = 8,
 lora_alpha = 16,
 target_modules=["q_proj", "v_proj"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)


# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 442,368 || all params: 139,862,784 || trainable%: 0.3162871404018384




In [15]:
print(model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): BartForConditionalGeneration(
      (model): BartModel(
        (shared): Embedding(50265, 768, padding_idx=1)
        (encoder): BartEncoder(
          (embed_tokens): Embedding(50265, 768, padding_idx=1)
          (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
          (layers): ModuleList(
            (0-5): 6 x BartEncoderLayer(
              (self_attn): BartSdpaAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (defaul

In [45]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer,model=model)

In [17]:
from huggingface_hub import login

login(token = "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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [46]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="lora-bart-base-fine-tuned-youtube-cnn-2"

batch_size = 12
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_dataset["train"]) // batch_size
model_name = model_id.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    gradient_accumulation_steps = 3,
     warmup_steps=1000,  # Gradually increase learning rate for the first 1000 steps
    lr_scheduler_type="linear",  # Linearly decrease after warmup
    push_to_hub=True,
)

In [20]:
!pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [21]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
import evaluate
rouge_score = evaluate.load("rouge")

In [40]:
from nltk.tokenize import sent_tokenize

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [48]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [49]:
trainer.train() # wandb token: da537992f9e4b785775d4a7fba53b76a16283a34

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
0,No log,2.291288,21.2084,9.6759,17.1232,19.5148
1,No log,2.158853,21.2645,10.0984,17.4632,19.6629
3,2.773200,2.068147,21.4251,10.618,17.7683,19.9053
4,2.773200,2.045544,21.3484,10.6276,17.7289,19.8254
6,2.345200,2.019005,21.6356,10.8853,17.9451,20.1188
7,2.345200,2.021565,21.5335,10.8567,17.8954,20.0417


Checkpoint destination directory lora-bart-base-fine-tuned-youtube-cnn-2/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory lora-bart-base-fine-tuned-youtube-cnn-2/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory lora-bart-base-fine-tuned-youtube-cnn-2/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2824, training_loss=2.4891402809207905, metrics={'train_runtime': 9296.8191, 'train_samples_per_second': 10.954, 'train_steps_per_second': 0.304, 'total_flos': 6.225539789876429e+16, 'train_loss': 2.4891402809207905, 'epoch': 7.98})

In [33]:
import torch

# Clear CUDA memory
torch.cuda.empty_cache()

In [50]:
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [62]:
input_text = '''
Today I’m showcasing six cool tools that  convert code into architectural diagrams.
Whether you’re a developer documenting  systems or a tech lead sharing knowledge,  
I think you’ll see some awesome options here.
First up is Diagrams - a Python library that  lets you draw cloud system architectures in  
code. It was created for rapidly prototyping  new designs without separate diagramming tools.
Representing diagrams as code allows tracking  of diagram changes in version control systems.
Go Diagrams
This "diagram as code" approach bridges  documentation with system implementation.
Diagrams supports visualizing infrastructure  across major providers and stacks:
AWS, Azure, GCP, Kubernetes, and more.
It can also model on-premise nodes, SaaS services,  and major programming frameworks and languages.
The extensive catalog of icons  and intuitive syntax accelerates  
diagram creation for modern tech stacks.
If you prefer Go, there is Go-Diagrams.  It’s the same idea as the python version,  
but let’s you write in Go.
Next is Mermaid - it enables creating diagrams  and visualizations using text. As a JavaScript  
Mermaid
library, Mermaid uses Markdown-style text  definitions that feed into a renderer to  
modify complex diagrams. Their stated goal is to  help documentation keep pace with development.
Mermaid aims to solve "doc-rot"  - where diagramming and docs take  
precious developer time yet  still get outdated quickly.
This ruins productivity and  organizational learning.
Mermaid enables even non-programmers to create  detailed visuals through the Mermaid Live Editor.
If you want an even more powerful  diagramming tool, check out PlantUML.
Plan URL
It offers a domain-specific language to  generate many diagram types: sequence  
diagrams, architectural diagrams, network  topology, Gantt charts, and even ASCII art.
PlantUML’s language is very capable but  has a bit more learning curve compared  
to other tools we covered. The broad  features make PlantUML a flexible,  
powerful option for embedding  diagrams alongside code.
SQEditors
The next category of tools goes in the  opposite direction - ASCII diagram editors.
These tools allow you to draw diagrams visually  or in text and then render them as ASCII art. They  
harness the power and simplicity of plain  text, which has been around for decades.
ASCII editors let you easily author  text-based diagrams, layouts, flow charts,  
and more. Since they output in plain text  format, these diagrams can embed anywhere.
MarkMap
Some examples of this class  of tools include web-based  
asciiflow and Monodraw, which is Mac only.
Finally, Markmap creates and visualizes mind  maps derived from Markdown documents. It  
parses Markdown content and extracts  its inherent hierarchies to render  
a mindmap. It’s great for connecting ideas  and their relationships defined in writing.
It supports various platforms but may not  work well on very large or complex mind maps.
if you like our videos, you might like  our System Design Newsletter as well.
It covers topics in trends  and large-scale system design.
Trusted by 500,000 readers.
Subscribe it at blog.bytebytego.com.
'''

In [68]:
model_input = tokenizer(input_text , truncation = False, return_tensors="pt")

In [69]:
summary_ids = model.generate(**model_input , max_length=400, min_length=50, do_sample = True, length_penalty=2.0, num_beams=4, top_k=50, top_p=0.95, temperature=0.8)
summary_ids

tensor([[    2,     0,  1121,    42,   569,     6,     5, 14847, 22848,   411,
          3035,  3270,    14, 10304,  3260,    88, 19481, 41882,     4,   252,
           680,  3643, 17654,    29,   111,    10, 31886,  5560,    14,  8382,
            47,  2451,  3613,   467, 41885,    11,  1437,  1437,  1437, 48619,
            12, 20414,     4,    85,    21,  1412,    13,  6042, 40004,   154,
          1437,    92,  7191,   396,  2559, 41071,  7059,  3270,     4,   152,
          1548, 11879, 14877,    19,   467,  5574,     6,     8,    24,  4548,
          7133,  2787,  2112,   420,   538,  4898,     8, 32201,   101, 26177,
             6, 25959,     6,   272,  7496,     6,  9609,  1943,  4135,   293,
             6,     8,    55,     4,    20, 14847,    67, 19197,     5,  2568,
          5149,  7438, 21794,     6,    61,  4865,  7614,    11,  3926,     8,
           739,    12,  8056,   467,  1521,     4,     2]])

In [70]:
summary_ids.shape

torch.Size([1, 117])

In [71]:
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [72]:
summary_text

'In this video, the presenter showcases six cool tools that convert code into architectural diagrams. They include Diagrams - a Python library that lets you draw cloud system architectures in   GUI-code. It was created for rapidly prototyping  new designs without separate diagramming tools. This approach bridges documentation with system implementation, and it supports visualizing infrastructure across major providers and stacks like AWS, Azure, GCP, Kubernetes, and more. The presenter also mentions the upcoming System Design Newsletter, which covers topics in trends and large-scale system design.'

In [57]:
!pip install lsg-converter

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting lsg-converter
  Downloading lsg_converter-0.1.9-py3-none-any.whl.metadata (9.4 kB)
Downloading lsg_converter-0.1.9-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.3/129.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lsg-converter
Successfully installed lsg-converter-0.1.9


In [58]:
model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=768, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lo

In [61]:
from lsg_converter import LSGConverter

converter = LSGConverter(max_sequence_length=4096)

# Example 1
model, tokenizer = converter.convert_from_pretrained(output_dir)

OSError: lora-bart-base-fine-tuned-youtube-cnn-2 does not appear to have a file named config.json. Checkout 'https://huggingface.co/lora-bart-base-fine-tuned-youtube-cnn-2/None' for available files.