# 0. Imports (including BART) and reading in the data

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
%pip install datasets rouge-score

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
%pip install transformers[torch]
%pip install accelerate -U

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel.
%pip install transformers matplotlib torch

In [5]:
# %pip targets the running kernel's environment; the version is pinned to the one
# this notebook was last executed with so the train/test split stays reproducible.
%pip install scikit-learn==1.3.2

Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Using cached scipy-1.11.4-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Using cached scikit_learn-1.3.2-cp310-cp310-win_amd64.whl (9.3 MB)
Using cached scipy-1.11.4-cp310-cp310-win_amd64.whl (44.1 MB)
Using cached threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.3.2 scipy-1.11.4 threadpoolctl-3.2.0


In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from huggingface_hub import Repository
from huggingface_hub import notebook_login
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

#import os
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [9]:
# Read the fine-tuning data: one JSON record per line
DATASET_PATH = 'dataset.json'
TEST_FRACTION = 0.3
SPLIT_SEED = 42

df = pd.read_json(DATASET_PATH, lines=True)

# Hold out TEST_FRACTION of the rows; the fixed seed makes the shuffle repeatable
train_df, test_df = train_test_split(
    df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [5]:
# Show the training split; rows keep their original labels, so the index is not sequential
train_df

Unnamed: 0,plain_text,summary
363,Last updated: February 2020 YOU AFFIRM THAT Y...,There is a date of the last update of the term...
596,Refsnes Data offers the website (w3schools.com...,This service is a subsidiary of Refsnes Data. ...
772,Update to our Privacy Policy We have recently...,There is a date of the last update of the agre...
298,"breach or circumvent any laws post false, inac...",Users agree not to use the service for illegal...
172,If you have general questions about your accou...,The service provides a free help desk. The ser...
...,...,...
106,Disabling cookies may affect your use of some ...,Blocking cookies may limit your ability to use...
270,Use of the Site and Services is limited to Aut...,This service is only available to users of a c...
860,Use License Permission is granted to temporari...,This service is only available for use individ...
435,End-users may opt out of being informed via pr...,You can opt out of promotional communications....


In [10]:
# Show the held-out split; rows keep their original labels, so the index is not sequential
test_df

Unnamed: 0,plain_text,summary
70,"Users may, however, visit our Site anonymousl...",Users can access most of the pages on the serv...
235,"Effective January 1, 2020 What information we...",There is a date of the last update of the agre...
430,"We may also use web beacons, flash cookies, an...","The service may use tracking pixels, web beaco..."
412,How we collect personal data: Computer IP addr...,The service provides information about how the...
39,"Brilliant Worldwide, Inc. (""Brilliant"") knows...",This service offers a symbolic but nonbinding ...
...,...,...
549,This Privacy Policy was last modified as of Ap...,There is a date of the last update of the agre...
351,"From time to time, we may change these Terms a...","The service may change its terms at any time, ..."
720,Effective Date: 06/15/2021 This Privacy Polic...,There is a date of the last update of the agre...
714,"We are located in the EU, and we fully comply ...",The service claims to be GDPR compliant for Eu...


In [11]:
# The example is the row *labelled* 0 (not the first row) of the shuffled training split.
sample_text = train_df.loc[0, "plain_text"]
sample_summary = train_df.loc[0, "summary"]

print("Sample Terms of Service text:\n", sample_text)
print("")
print("Its summary:\n", sample_summary)

Sample Terms of Service text:
 We can change these Terms at any time. We keep a historical record of all changes to our Terms on GitHub. If a change is material, we’ll let you know before they take effect. By using Medium on or after that effective date, you agree to the new Terms. If you don’t agree to them, you should delete your account before they take effect, otherwise your use of the site and content will be subject to the new Terms. You own the rights to the content you create and post on Medium. We will never sell your content to third parties without your explicit permission. You may need to register for an account to access some or all of our Services. We can remove any content you post for any reason. You can delete any of your posts, or your account, anytime. Processing the deletion may take a little time, but we’ll do it as quickly as possible. We may keep backup copies of your deleted post or account on our servers for up to 14 days after you delete it. Medium reserves th

# 1. Fine-Tuning of BART

In [12]:
# Function to compute metrics
def compute_metrics(eval_pred):
    """Compute mean ROUGE-1 / ROUGE-2 / ROUGE-L F-measures.

    Args:
        eval_pred: A ``(predictions, references)`` pair. Each side may be a
            sequence of strings, an array of token ids, an array of raw
            ``(batch, seq, vocab)`` logits, or a tuple whose first element is
            such an array (the forms a Hugging Face trainer hands over).
            Anything that is not already text is decoded with the
            notebook-level ``tokenizer``.

    Returns:
        dict: ``'rouge-1'``, ``'rouge-2'`` and ``'rouge-L'`` mapped to the
        F-measure averaged over all pairs. All three are ``0.0`` when there
        are no pairs to score.
    """

    def _to_texts(batch):
        # Model outputs can arrive as a tuple of arrays, e.g. (logits, encoder state);
        # only the first entry holds the predictions.
        if isinstance(batch, tuple) and len(batch) > 0 and np.ndim(batch[0]) >= 2:
            batch = batch[0]
        if len(batch) == 0:
            return []
        if all(isinstance(item, str) for item in batch):
            return list(batch)

        ids = np.asarray(batch)
        if ids.ndim == 3:
            # Raw logits: pick the highest-scoring token at every position.
            ids = ids.argmax(axis=-1)
        # Negative ids (e.g. the -100 label padding) cannot be decoded; swap in the pad token.
        ids = np.where(ids < 0, tokenizer.pad_token_id, ids)
        return tokenizer.batch_decode(ids, skip_special_tokens=True)

    predictions, references = eval_pred
    pred_texts = _to_texts(predictions)
    ref_texts = _to_texts(references)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    scores = [scorer.score(ref, pred) for pred, ref in zip(pred_texts, ref_texts)]

    # Nothing to score: report zeros instead of dividing by zero.
    if not scores:
        return {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-L': 0.0}

    count = len(scores)
    return {
        'rouge-1': sum(score['rouge1'].fmeasure for score in scores) / count,
        'rouge-2': sum(score['rouge2'].fmeasure for score in scores) / count,
        'rouge-L': sum(score['rougeL'].fmeasure for score in scores) / count,
    }

In [13]:
class CustomDataset(Dataset):
    """Pre-tokenized (document, summary) pairs for seq2seq fine-tuning.

    Every row of ``df`` is tokenized once, at construction time: the
    ``plain_text`` column becomes the model input (truncated to ``max_length``
    tokens) and the ``summary`` column becomes the labels (truncated to
    ``max_target_length`` tokens). No padding is requested here, so each
    example keeps its own length.
    """

    def __init__(self, tokenizer, df, max_length=512, max_target_length=128):
        def tokenize(text, limit):
            # Truncating encode that returns PyTorch tensors
            return tokenizer.encode_plus(text, max_length=limit, return_tensors='pt', truncation=True)

        self.input_ids, self.attention_mask, self.labels = [], [], []

        for _, record in df.iterrows():
            source = tokenize(record['plain_text'], max_length)
            target = tokenize(record['summary'], max_target_length)

            # Store every tensor flattened to 1-D
            self.input_ids.append(source.input_ids.flatten())
            self.attention_mask.append(source.attention_mask.flatten())
            self.labels.append(target.input_ids.flatten())

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of input ids, attention mask and labels."""
        return dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            labels=self.labels[idx],
        )

In [19]:
# Ask PyTorch to release its cached, currently unused GPU memory before the model is loaded
torch.cuda.empty_cache()
#torch.cuda.memory_summary()

In [15]:
# Load BART pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Create datasets
train_dataset = CustomDataset(tokenizer, train_df)
test_dataset = CustomDataset(tokenizer, test_df)

# Define training configurations.
# The Seq2Seq variants are used so that evaluation runs model.generate():
# with predict_with_generate=True, compute_metrics receives generated token ids
# instead of teacher-forced logits over the whole vocabulary (which are both
# memory hungry and not representative of real summaries).
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=100,
    save_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Same cap as the 128-token target length used when tokenizing the summaries
    generation_max_length=128,
)

# Define data collator (pads inputs and labels per batch)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the Trainer object
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training using the Trainer instance
trainer.train()

  0%|          | 0/948 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/948 [01:31<24:09:43, 91.85s/it]

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same local folder
FINETUNED_DIR = "./ToS_finetuned_bart"
model.save_pretrained(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)

('./fine-tuned-bart/tokenizer_config.json',
 './fine-tuned-bart/special_tokens_map.json',
 './fine-tuned-bart/vocab.json',
 './fine-tuned-bart/merges.txt',
 './fine-tuned-bart/added_tokens.json')

# 3. Evaluate Fine-tuned Model

# 4. Push model to the Hugging Face Hub

In [None]:
# Authenticate with the Hugging Face Hub (renders an interactive login widget)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model weights to the "BART-ToS-Summarization" repository on the Hub
model.push_to_hub("BART-ToS-Summarization")

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/EE21/BART-ToS-Summarization/commit/e496c860fe58aff8ef0559694d0dc8994c4896cc', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='e496c860fe58aff8ef0559694d0dc8994c4896cc', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer files to the same "BART-ToS-Summarization" repository
tokenizer.push_to_hub("BART-ToS-Summarization")

CommitInfo(commit_url='https://huggingface.co/EE21/BART-ToS-Summarization/commit/da7e6fefdd7873b0293f10dd8e827356b927b65f', commit_message='Upload tokenizer', commit_description='', oid='da7e6fefdd7873b0293f10dd8e827356b927b65f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the Trainer's artifacts. NOTE(review): the recorded output shows this going to a
# separate "results" repository (presumably named after output_dir), not "BART-ToS-Summarization".
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/EE21/results/tree/main/'

---

In [None]:
# Reference summary of the training row labelled 0 (label-based lookup, not the first row)
train_df.loc[0, "summary"]

"Users should revisit the terms periodically, although in case of material changes, the service will notify. You maintain ownership of your data (the content you create and post on Medium). This service does not sell your personal data. You must create an account to use this service. The service can delete specific content without prior notice and without a reason. You can delete your content from this service. User accounts can be terminated after having been in breach of the terms of service repeatedly. Users cannot scrape the website of the service. This service is only available only for people 13 years old and over. The service has non-exclusive use of your content. The service is provided 'as is' and to be used at the users' sole risk. This service assumes no liability for any losses or damages resulting from any matter relating to the service. The court of law governing the terms is in San Francisco, California. Your account can be deleted without prior notice and without a reas

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Pull the published fine-tuned checkpoint and its tokenizer from the Hub
model_name = "EE21/BART-ToS-Summarization"  # Hub repository the model was pushed to
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Document to summarize: the training row labelled 0
input_text = train_df.loc[0, "plain_text"]

# Tokenize, truncating the document to at most 512 tokens
input_ids = tokenizer.encode(input_text, max_length=512, truncation=True, return_tensors="pt")

# Beam-search generation settings, capped at 128 tokens
generation_kwargs = dict(
    max_length=128,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
)
output_ids = model.generate(input_ids, **generation_kwargs)

# Turn the first (only) generated sequence back into text, without special tokens
generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Summary:", generated_summary)

config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Generated Summary:  Terms may be changed any time at their discretion, without notice to the user. You maintain ownership of your data. This service does not sell your personal data. You must create an account to use this service. The service can delete specific content without reason and may do it without prior notice. You have the right to leave this service at any time. User accounts can be terminated after having been in breach of the terms of service repeatedly. Spidering, crawling, or accessing the site through any automated means is not allowed. Only for people 13 years old and over. If you are the target of a copyright holder's take down
