In [28]:
import pandas as pd
import random
import os
import evaluate
import accelerate 

In [29]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Training will run on the CPU.")

GPU is available!
NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [30]:
os.listdir()

['.vscode',
 'bbc tech news dataset for fine tuning',
 'final data.ipynb',
 'fine tuning llm.ipynb',
 'longt5-summarization',
 'model and tokeniser',
 'review text',
 'summarisation practice.ipynb',
 'VENV']

In [31]:
#output text (summarised)
summaryDir = "D:\\programming\\non library stuff\\Projects\\fine tuning\\bbc tech news dataset for fine tuning\\BBC News Summary\\Summaries\\tech"
#input text (before summarising)
newsArticleDir = "D:\\programming\\non library stuff\\Projects\\fine tuning\\bbc tech news dataset for fine tuning\\BBC News Summary\\News Articles\\tech"

In [32]:
from pathlib import Path
#list containing articles
articleText = []

for p in Path(newsArticleDir).glob('*.txt'):
    articleText.append(p.read_text())

In [33]:
summaryText = []

#list containing summary of articles
for p in Path(newsArticleDir).glob('*.txt'):
    summaryText.append(p.read_text())

In [34]:
#articleText

In [35]:
#summaryText

In [36]:
dict = {'Main article':articleText,'Summarised text':summaryText}

In [37]:
df = pd.DataFrame(dict)

In [38]:
df

Unnamed: 0,Main article,Summarised text
0,Ink helps drive democracy in Asia\n\nThe Kyrgy...,Ink helps drive democracy in Asia\n\nThe Kyrgy...
1,China net cafe culture crackdown\n\nChinese au...,China net cafe culture crackdown\n\nChinese au...
2,Microsoft seeking spyware trojan\n\nMicrosoft ...,Microsoft seeking spyware trojan\n\nMicrosoft ...
3,Digital guru floats sub-$100 PC\n\nNicholas Ne...,Digital guru floats sub-$100 PC\n\nNicholas Ne...
4,Technology gets the creative bug\n\nThe hi-tec...,Technology gets the creative bug\n\nThe hi-tec...
...,...,...
396,BT program to beat dialler scams\n\nBT is intr...,BT program to beat dialler scams\n\nBT is intr...
397,Spam e-mails tempt net shoppers\n\nComputer us...,Spam e-mails tempt net shoppers\n\nComputer us...
398,Be careful how you code\n\nA new European dire...,Be careful how you code\n\nA new European dire...
399,US cyber security chief resigns\n\nThe man mak...,US cyber security chief resigns\n\nThe man mak...


In [39]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration

model_name = "google/long-t5-tglobal-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model
model = LongT5ForConditionalGeneration.from_pretrained(model_name)

In [40]:
y = df['Summarised text']
X = df['Main article']

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
    )

In [42]:
from datasets import Dataset

# Assuming X_train, y_train, X_test, y_test are lists or pandas Series
# with 'article' and 'summary' as column names
train_dataset = Dataset.from_dict({'article': X_train, 'summary': y_train})
eval_dataset = Dataset.from_dict({'article': X_test, 'summary': y_test})

print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['article', 'summary'],
    num_rows: 360
})
Dataset({
    features: ['article', 'summary'],
    num_rows: 41
})


In [43]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration

# Load the tokenizer. We need to do this step again in case your environment restarted.
model_name = "google/long-t5-tglobal-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The prefix for the summarization task
prefix = "summarize: "

# Set the maximum lengths for your articles and summaries
# Long-T5 supports up to 16,384 tokens
max_input_length = 16384
max_target_length = 128

def tokenize_function(examples):
    # Add the prefix to the input text
    inputs = [prefix + doc for doc in examples['article']]
    
    # Tokenize the input articles, truncating if they exceed the max length
    model_inputs = tokenizer(
        inputs, 
        max_length=max_input_length, 
        truncation=True
    )
    
    # Tokenize the labels (summaries)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['summary'], 
            max_length=max_target_length, 
            truncation=True
        )

    # Add the tokenized labels to the inputs
    model_inputs['labels'] = labels['input_ids']
    
    return model_inputs

# Apply the tokenization function to your training and evaluation datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# These are the datasets you will use for fine-tuning
print(tokenized_train_dataset)
print(tokenized_eval_dataset)

Map: 100%|██████████| 360/360 [00:00<00:00, 1125.96 examples/s]
Map: 100%|██████████| 41/41 [00:00<00:00, 899.14 examples/s]

Dataset({
    features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 360
})
Dataset({
    features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 41
})





In [50]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq

# This is the correct way to define TrainingArguments
training_args = TrainingArguments(
    output_dir='./longt5-summarization',
    num_train_epochs=3,
    per_device_train_batch_size=1,            # Set to 1
    gradient_accumulation_steps=4,            # Add this to simulate a larger batch
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,                                # Add this to enable mixed precision
    gradient_checkpointing=True, 
)

In [51]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Trainer(
    model=model,                           # The Long-T5 model you loaded
    args=training_args,                    # The training arguments you just defined
    train_dataset=tokenized_train_dataset, # Your tokenized training dataset
    eval_dataset=tokenized_eval_dataset,   # Your tokenized evaluation dataset
    data_collator=data_collator
)

In [52]:
# Step 3: Start the training!
trainer.train()

print("Training is complete! Your fine-tuned model is saved in the './longt5-summarization' directory.")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,4.2832
20,20.8127
30,60.7636
40,26.3335
50,24.0975
60,129.3088
70,11.1247
80,1.5859
90,43.028
100,5.5821


Training is complete! Your fine-tuned model is saved in the './longt5-summarization' directory.
