In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m30

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!nvidia-smi

Mon Feb 12 09:17:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

from sklearn.model_selection import train_test_split

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_ckpt = "facebook/bart-large-cnn"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_bart = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
def generate_batch_sized_chunks(list_of_elements, batch_size):

    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""

    for i in range(0, len(list_of_elements), batch_size):

        yield list_of_elements[i : i + batch_size]

In [5]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        article_batch = [str(text) for text in article_batch]

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [6]:
pip install --upgrade datasets



In [7]:
df = pd.read_csv("/content/drive/MyDrive/clean_dataset.csv", encoding='latin1')

In [8]:
df.drop(columns=['Unnamed: 0','id'],axis=1, inplace=True)

In [9]:
df

Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...
4391,Fruit juice concentrate maker Rasna is eyeing ...,"Mumbai, Feb 23 (PTI) Fruit juice concentrate m..."
4392,Former Indian cricketer Sachin Tendulkar atten...,Former cricketer Sachin Tendulkar was spotted ...
4393,"Aamir Khan, while talking about reality shows ...","Aamir Khan, whose last film Dangal told the st..."
4394,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...


In [10]:
pipe = pipeline('summarization', model = model_ckpt )

In [11]:
train_data, test_data = train_test_split(df, test_size=0.2, shuffle=True, random_state=0)

In [12]:
test_data

Unnamed: 0,summary,text
3703,Israeli researchers have successfully tested a...,A team of researchers at an Israeli university...
1358,A woman has been caught for carrying a country...,A country-made pistol was found concealed in a...
2185,A baby girl was delivered by two male police o...,Delhi cops acting as midwives? As crazy as tha...
2379,The 93-year-old Zimbabwean President Robert Mu...,Zimbabwe?s President Robert Mugabe is in Singa...
2929,Instances of detecting counterfeit currency in...,"By Neelabh Srivastava New Delhi, Jun 14 (PTI) ..."
...,...,...
4385,A new poster of the upcoming epic historical f...,"Be it the visuals or its core, SS Rajamouli's ..."
1513,The Income Tax Department has unearthed over ?...,"operators, hawala dealers New Delhi, Jan 16 (P..."
2094,Kolkata-based bakery 'Krazy for Chocolates' ha...,Dessert lovers in Kolkata are in for a treat a...
1736,The regional transport authority has allowed t...,You don't need to spend a fortune on cabs ever...


In [13]:
#pipe_out = pipe(test_data['text'].iloc[0])

In [14]:
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [15]:
# score = calculate_metric_on_test_ds(test_data,
#                                     metric = rouge_metric,
#                                     model = model_bart,
#                                     tokenizer = tokenizer,
#                                     column_text = 'text',
#                                     column_summary='summary',
#                                     batch_size=8)

In [16]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
#rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

#pd.DataFrame(rouge_dict, index = ['facebook/bart'])

In [17]:
def convert_examples_to_features(example_batch):
    print(example_batch)
    input_encodings = tokenizer(example_batch['text'], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [18]:
#BART_dataset = df.map(convert_examples_to_features, batched = True)

In [19]:
from datasets import Dataset

BART_dataset_train = Dataset.from_pandas(train_data)

BART_dataset_test = Dataset.from_pandas(test_data)

BART_dataset_train = BART_dataset_train.map(convert_examples_to_features, batched=True)

BART_dataset_test = BART_dataset_test.map(convert_examples_to_features, batched=True)

Map:   0%|          | 0/3516 [00:00<?, ? examples/s]



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)





Map:   0%|          | 0/880 [00:00<?, ? examples/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
#df = df.rename(columns={"sentence1" : "text", "sentence2" : "summary"})

In [21]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model_bart)

In [22]:
pip install accelerate -U



In [27]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='facebook/BART_fine_tune',
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
    gradient_accumulation_steps=16
)

In [28]:
trainer = Trainer(model=model_bart, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=BART_dataset_train,
                  eval_dataset=BART_dataset_test)

In [29]:
pip install --upgrade datasets




In [30]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.9012,1.393264


TrainOutput(global_step=657, training_loss=1.2120736156968765, metrics={'train_runtime': 2753.3193, 'train_samples_per_second': 3.831, 'train_steps_per_second': 0.239, 'total_flos': 9446878782111744.0, 'train_loss': 1.2120736156968765, 'epoch': 2.99})

In [None]:
score = calculate_metric_on_test_ds(
    test_data, rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'text', column_summary= 'summary'
)


In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'facebook/BART'] )

In [31]:
## Save model
model_bart.save_pretrained("facebook\BART-news-fine-tune-model")

In [32]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [33]:
import os

In [35]:
os.getcwd()

'/content'

In [36]:
!mv "/content/facebook" "/content/drive/MyDrive/NLP/"

In [37]:
!mv "/content/facebook\BART-news-fine-tune-model" "/content/drive/MyDrive/NLP/"

In [39]:
!mv "/content/tokenizer" "/content/drive/MyDrive/NLP/"

In [None]:
### Code to load the model

from transformers import BartForConditionalGeneration

# Define the path to the saved model
model_path = "/content/drive/MyDrive/NLP/facebook/BART-news-fine-tune-model"

# Load the model
model = BartForConditionalGeneration.from_pretrained(model_path)

# Now the model is loaded and ready to use

In [38]:
### Code to load the tokeniser

from transformers import BartTokenizer

# Specify the path to your saved tokenizer
tokenizer_path = "/content/drive/MyDrive/NLP/tokenizer"

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)

# Now you can use the tokenizer for tokenization tasks